You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2010/07/03 19:59:30 UTC
svn commit: r960248 - in /nutch/branches/branch-1.2: ./ conf/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/field/
src/java/org/apache/nutch/indexer/solr/ src/ja...
Author: mattmann
Date: Sat Jul 3 17:59:29 2010
New Revision: 960248
URL: http://svn.apache.org/viewvc?rev=960248&view=rev
Log:
Fix for NUTCH-838 (Add timing information to all Tool classes), backported to the 1.2 branch
Added:
nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java (with props)
Modified:
nutch/branches/branch-1.2/CHANGES.txt
nutch/branches/branch-1.2/conf/log4j.properties
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDb.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Generator.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Injector.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDb.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbMerger.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbReader.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/OldFetcher.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexMerger.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexSorter.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/Indexer.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/AnchorFields.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/BasicFields.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/CustomFields.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/Loops.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/CrawlDBScanner.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/FreeGenerator.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
nutch/branches/branch-1.2/src/java/org/apache/nutch/util/domain/DomainStatistics.java
Modified: nutch/branches/branch-1.2/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/CHANGES.txt?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/CHANGES.txt (original)
+++ nutch/branches/branch-1.2/CHANGES.txt Sat Jul 3 17:59:29 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.2 - Current Development
+* NUTCH-838 Add timing information to all Tool classes (Jeroen van Vianen, mattmann)
+
* NUTCH-835 Document deduplication failed using MD5Signature (Sebastian Nagel via ab)
* NUTCH-831 Allow configuration of how fields crawled by Nutch are stored / indexed /
Modified: nutch/branches/branch-1.2/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/conf/log4j.properties?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/conf/log4j.properties (original)
+++ nutch/branches/branch-1.2/conf/log4j.properties Sat Jul 3 17:59:29 2010
@@ -25,6 +25,9 @@ log4j.logger.org.apache.nutch.crawl.Link
log4j.logger.org.apache.nutch.indexer.Indexer=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.IndexMerger=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.solr.SolrIndexer=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout
+
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDb.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.crawl;
import java.io.*;
+import java.text.SimpleDateFormat;
import java.util.*;
// Commons Logging imports
@@ -34,6 +35,7 @@ import org.apache.nutch.util.HadoopFSUti
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* This class takes the output of the fetcher and updates the
@@ -63,8 +65,10 @@ public class CrawlDb extends Configured
FileSystem fs = FileSystem.get(getConf());
Path lock = new Path(crawlDb, LOCK_NAME);
LockUtil.createLockFile(fs, lock, force);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb update: starting");
+ LOG.info("CrawlDb update: starting at " + sdf.format(start));
LOG.info("CrawlDb update: db: " + crawlDb);
LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
@@ -100,7 +104,8 @@ public class CrawlDb extends Configured
}
CrawlDb.install(job, crawlDb);
- if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); }
+ long end = System.currentTimeMillis();
+ LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static JobConf createJob(Configuration config, Path crawlDb)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.crawl;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
@@ -35,6 +36,7 @@ import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* This tool merges several CrawlDb-s into one, optionally filtering
@@ -112,6 +114,10 @@ public class CrawlDbMerger extends Confi
}
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("CrawlDb merge: starting at " + sdf.format(start));
+
JobConf job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
@@ -120,6 +126,8 @@ public class CrawlDbMerger extends Confi
FileSystem fs = FileSystem.get(getConf());
fs.mkdirs(output);
fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME));
+ long end = System.currentTimeMillis();
+ LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Generator.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Generator.java Sat Jul 3 17:59:29 2010
@@ -44,6 +44,7 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -472,8 +473,10 @@ public class Generator extends Configure
FileSystem fs = FileSystem.get(getConf());
LockUtil.createLockFile(fs, lock, force);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("Generator: starting at " + sdf.format(start));
LOG.info("Generator: Selecting best-scoring urls due for fetch.");
- LOG.info("Generator: starting");
LOG.info("Generator: filtering: " + filter);
LOG.info("Generator: normalizing: " + norm);
if (topN != Long.MAX_VALUE) {
@@ -586,9 +589,8 @@ public class Generator extends Configure
LockUtil.removeLockFile(fs, lock);
fs.delete(tempDir, true);
- if (LOG.isInfoEnabled()) {
- LOG.info("Generator: done.");
- }
+ long end = System.currentTimeMillis();
+ LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
Path[] patharray = new Path[generatedSegments.size()];
return generatedSegments.toArray(patharray);
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Injector.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/Injector.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.crawl;
import java.io.*;
+import java.text.SimpleDateFormat;
import java.util.*;
// Commons Logging imports
@@ -35,6 +36,7 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/** This class takes a flat file of URLs and adds them to the list of pages to be
* crawled. Useful for bootstrapping the system.
@@ -79,6 +81,12 @@ public class Injector extends Configured
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
String url = value.toString(); // value is line of text
+
+ if (url != null && url.trim().startsWith("#")) {
+ /* Ignore lines that start with # */
+ return;
+ }
+
// if tabs : metadata that could be stored
// must be name=value and separated by \t
float customScore = -1f;
@@ -182,9 +190,10 @@ public class Injector extends Configured
}
public void inject(Path crawlDb, Path urlDir) throws IOException {
-
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("Injector: starting");
+ LOG.info("Injector: starting at " + sdf.format(start));
LOG.info("Injector: crawlDb: " + crawlDb);
LOG.info("Injector: urlDir: " + urlDir);
}
@@ -223,8 +232,9 @@ public class Injector extends Configured
// clean up
FileSystem fs = FileSystem.get(getConf());
fs.delete(tempDir, true);
- if (LOG.isInfoEnabled()) { LOG.info("Injector: done"); }
+ long end = System.currentTimeMillis();
+ LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDb.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDb.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.crawl;
import java.io.*;
+import java.text.SimpleDateFormat;
import java.util.*;
import java.net.*;
@@ -39,6 +40,7 @@ import org.apache.nutch.util.HadoopFSUti
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/** Maintains an inverted link map, listing incoming links for each url. */
public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
@@ -153,8 +155,11 @@ public class LinkDb extends Configured i
FileSystem fs = FileSystem.get(getConf());
LockUtil.createLockFile(fs, lock, force);
Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("LinkDb: starting");
+ LOG.info("LinkDb: starting at " + sdf.format(start));
LOG.info("LinkDb: linkdb: " + linkDb);
LOG.info("LinkDb: URL normalize: " + normalize);
LOG.info("LinkDb: URL filter: " + filter);
@@ -191,7 +196,9 @@ public class LinkDb extends Configured i
fs.delete(newLinkDb, true);
}
LinkDb.install(job, linkDb);
- if (LOG.isInfoEnabled()) { LOG.info("LinkDb: done"); }
+
+ long end = System.currentTimeMillis();
+ LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbMerger.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.crawl;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
@@ -42,6 +43,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* This tool merges several LinkDb-s into one, optionally filtering
@@ -100,6 +102,10 @@ public class LinkDbMerger extends Config
public void close() throws IOException { }
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("LinkDb merge: starting at " + sdf.format(start));
+
JobConf job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
@@ -108,6 +114,9 @@ public class LinkDbMerger extends Config
FileSystem fs = FileSystem.get(getConf());
fs.mkdirs(output);
fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME));
+
+ long end = System.currentTimeMillis();
+ LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/crawl/LinkDbReader.java Sat Jul 3 17:59:29 2010
@@ -33,7 +33,9 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
/** . */
@@ -89,10 +91,11 @@ public class LinkDbReader extends Config
}
public void processDumpJob(String linkdb, String output) throws IOException {
-
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("LinkDb dump: starting");
- LOG.info("LinkDb db: " + linkdb);
+ LOG.info("LinkDb dump: starting at " + sdf.format(start));
+ LOG.info("LinkDb dump: db: " + linkdb);
}
Path outFolder = new Path(output);
@@ -108,6 +111,9 @@ public class LinkDbReader extends Config
job.setOutputValueClass(Inlinks.class);
JobClient.runJob(job);
+
+ long end = System.currentTimeMillis();
+ LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Jul 3 17:59:29 2010
@@ -21,6 +21,7 @@ import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
+import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
@@ -1066,8 +1067,10 @@ public class Fetcher extends Configured
checkConfiguration();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("Fetcher: starting");
+ LOG.info("Fetcher: starting at " + sdf.format(start));
LOG.info("Fetcher: segment: " + segment);
}
@@ -1102,7 +1105,9 @@ public class Fetcher extends Configured
job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
- if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }
+
+ long end = System.currentTimeMillis();
+ LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/OldFetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/OldFetcher.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/OldFetcher.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/fetcher/OldFetcher.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.fetcher;
import java.io.IOException;
import java.net.MalformedURLException;
+import java.text.SimpleDateFormat;
import java.util.Map.Entry;
// Commons Logging imports
@@ -504,8 +505,10 @@ public class OldFetcher extends Configur
public void fetch(Path segment, int threads)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("OldFetcher: starting");
+ LOG.info("OldFetcher: starting at " + sdf.format(start));
LOG.info("OldFetcher: segment: " + segment);
}
@@ -529,7 +532,8 @@ public class OldFetcher extends Configur
job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
- if (LOG.isInfoEnabled()) { LOG.info("OldFetcher: done"); }
+ long end = System.currentTimeMillis();
+ LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Sat Jul 3 17:59:29 2010
@@ -36,6 +36,7 @@ import org.apache.nutch.util.NutchJob;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
+import org.apache.nutch.util.TimingUtil;
/**
* Delete duplicate documents in a set of Lucene indexes.
@@ -418,7 +419,9 @@ public class DeleteDuplicates extends Co
public void dedup(Path[] indexDirs)
throws IOException {
- if (LOG.isInfoEnabled()) { LOG.info("Dedup: starting"); }
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("Dedup: starting at " + sdf.format(start));
Path outDir1 =
new Path("dedup-urls-"+
@@ -492,7 +495,8 @@ public class DeleteDuplicates extends Co
fs.delete(outDir2, true);
- if (LOG.isInfoEnabled()) { LOG.info("Dedup: done"); }
+ long end = System.currentTimeMillis();
+ LOG.info("Dedup: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexMerger.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.indexer;
import java.io.*;
+import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.logging.Log;
@@ -37,6 +38,7 @@ import org.apache.lucene.store.FSDirecto
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.nutch.util.TimingUtil;
/*************************************************************************
* IndexMerger creates an index for the output corresponding to a
@@ -62,7 +64,12 @@ public class IndexMerger extends Configu
* Merge all input indexes to the single output index
*/
public void merge(Path[] indexes, Path outputIndex, Path localWorkingDir) throws IOException {
- LOG.info("merging indexes to: " + outputIndex);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ if (LOG.isInfoEnabled()) {
+ LOG.info("IndexMerger: starting at " + sdf.format(start));
+ LOG.info("IndexMerger: merging indexes to: " + outputIndex);
+ }
FileSystem localFs = FileSystem.getLocal(getConf());
if (localFs.exists(localWorkingDir)) {
@@ -107,7 +114,8 @@ public class IndexMerger extends Configu
// Put target back
//
fs.completeLocalOutput(outputIndex, tmpLocalOutput);
- LOG.info("done merging");
+ long end = System.currentTimeMillis();
+ LOG.info("IndexMerger: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
/**
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/IndexSorter.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.indexer;
import java.io.File;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Arrays;
@@ -35,6 +36,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.*;
+import org.apache.nutch.util.TimingUtil;
/** Sort a Nutch index by page score. Higher scoring documents are assigned
* smaller document numbers. */
@@ -261,8 +263,9 @@ public class IndexSorter extends Configu
}
public void sort(File directory) throws IOException {
- LOG.info("IndexSorter: starting.");
- Date start = new Date();
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("IndexSorter: starting at " + sdf.format(start));
int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
IndexReader reader = IndexReader.open(
FSDirectory.open(new File(directory, "index")));
@@ -276,9 +279,8 @@ public class IndexSorter extends Configu
writer.setUseCompoundFile(false);
writer.addIndexes(new IndexReader[] { sorter });
writer.close();
- Date end = new Date();
- LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime())
- + " total milliseconds");
+ long end = System.currentTimeMillis();
+ LOG.info("IndexSorter: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
private static int[] oldToNew(IndexReader reader) throws IOException {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/Indexer.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/Indexer.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.indexer;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
@@ -35,6 +36,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/** Create indexes for segments. */
public class Indexer extends Configured implements Tool {
@@ -54,7 +56,9 @@ public class Indexer extends Configured
public void index(Path luceneDir, Path crawlDb,
Path linkDb, List<Path> segments)
throws IOException {
- LOG.info("Indexer: starting");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("Indexer: starting at " + sdf.format(start));
final JobConf job = new NutchJob(getConf());
job.setJobName("index-lucene " + luceneDir);
@@ -70,7 +74,8 @@ public class Indexer extends Configured
NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);
JobClient.runJob(job);
- LOG.info("Indexer: done");
+ long end = System.currentTimeMillis();
+ LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public int run(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/AnchorFields.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/AnchorFields.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/AnchorFields.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/AnchorFields.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.indexer.field;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@@ -58,6 +59,7 @@ import org.apache.nutch.scoring.webgraph
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* Creates FieldWritable objects for inbound anchor text. These FieldWritable
@@ -357,6 +359,9 @@ public class AnchorFields
*/
public void createFields(Path webGraphDb, Path basicFields, Path output)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("AnchorFields: starting at " + sdf.format(start));
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
@@ -365,6 +370,8 @@ public class AnchorFields
runExtractor(webGraphDb, tempLinks);
runCollector(basicFields, tempLinks, output);
fs.delete(tempLinks, true);
+ long end = System.currentTimeMillis();
+ LOG.info("AnchorFields: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/BasicFields.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/BasicFields.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/BasicFields.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/BasicFields.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.indexer.field;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -65,6 +66,7 @@ import org.apache.nutch.scoring.webgraph
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -691,6 +693,10 @@ public class BasicFields
public void createFields(Path nodeDb, Path[] segments, Path output)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("BasicFields: starting at " + sdf.format(start));
+
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
Path tempOutput = new Path(output.toString() + "-temp");
@@ -720,6 +726,8 @@ public class BasicFields
// merge all of the segments and delete any temporary output
runMerger(basicFields, output);
fs.delete(tempOutput, true);
+ long end = System.currentTimeMillis();
+ LOG.info("BasicFields: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/CustomFields.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/CustomFields.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/CustomFields.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/CustomFields.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.field;
import java.io.IOException;
import java.io.InputStream;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
@@ -62,6 +63,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* Creates custom FieldWritable objects from a text file containing field
@@ -375,6 +377,10 @@ public class CustomFields
void createFields(Path basicFields, Path[] inputs, Path output)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("CustomerFields: starting at " + sdf.format(start));
+
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
Path tempFields = new Path(output + "-"
@@ -382,6 +388,8 @@ public class CustomFields
runConverter(inputs, tempFields);
runCollector(basicFields, tempFields, output);
fs.delete(tempFields, true);
+ long end = System.currentTimeMillis();
+ LOG.info("CommonFields: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/FieldIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/FieldIndexer.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/FieldIndexer.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/field/FieldIndexer.java Sat Jul 3 17:59:29 2010
@@ -20,6 +20,7 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -68,6 +69,7 @@ import org.apache.nutch.indexer.NutchSim
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
public class FieldIndexer
extends Configured
@@ -248,7 +250,9 @@ public class FieldIndexer
public void index(Path[] fields, Path indexDir)
throws IOException {
- LOG.info("FieldIndexer: starting");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("FieldIndexer: starting at " + sdf.format(start));
JobConf job = new NutchJob(getConf());
job.setJobName("FieldIndexer: " + indexDir);
@@ -270,9 +274,8 @@ public class FieldIndexer
job.setOutputValueClass(LuceneDocumentWrapper.class);
JobClient.runJob(job);
- if (LOG.isInfoEnabled()) {
- LOG.info("FieldIndexer: done");
- }
+ long end = System.currentTimeMillis();
+ LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Sat Jul 3 17:59:29 2010
@@ -20,6 +20,7 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import org.apache.commons.logging.Log;
@@ -42,6 +43,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
@@ -122,14 +124,12 @@ Tool {
tstamp = (Long)doc.getFieldValue(SolrConstants.TIMESTAMP_FIELD);
}
- @Override
public void readFields(DataInput in) throws IOException {
id = Text.readString(in);
boost = in.readFloat();
tstamp = in.readLong();
}
- @Override
public void write(DataOutput out) throws IOException {
Text.writeString(out, id);
out.writeFloat(boost);
@@ -157,23 +157,19 @@ Tool {
return numDocs;
}
- @Override
public long getLength() throws IOException {
return numDocs;
}
- @Override
public String[] getLocations() throws IOException {
return new String[] {} ;
}
- @Override
public void readFields(DataInput in) throws IOException {
docBegin = in.readInt();
numDocs = in.readInt();
}
- @Override
public void write(DataOutput out) throws IOException {
out.writeInt(docBegin);
out.writeInt(numDocs);
@@ -239,30 +235,24 @@ Tool {
private int currentDoc = 0;
- @Override
public void close() throws IOException { }
- @Override
public Text createKey() {
return new Text();
}
- @Override
public SolrRecord createValue() {
return new SolrRecord();
}
- @Override
public long getPos() throws IOException {
return currentDoc;
}
- @Override
public float getProgress() throws IOException {
return currentDoc / (float) numDocs;
}
- @Override
public boolean next(Text key, SolrRecord value) throws IOException {
if (currentDoc >= numDocs) {
return false;
@@ -288,17 +278,14 @@ Tool {
private UpdateRequest updateRequest = new UpdateRequest();
- @Override
public Configuration getConf() {
return conf;
}
- @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
- @Override
public void configure(JobConf job) {
try {
solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
@@ -308,10 +295,10 @@ Tool {
}
- @Override
public void close() throws IOException {
try {
if (numDeletes > 0) {
+ LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");
updateRequest.process(solr);
}
} catch (SolrServerException e) {
@@ -319,7 +306,6 @@ Tool {
}
}
- @Override
public void reduce(Text key, Iterator<SolrRecord> values,
OutputCollector<Text, SolrRecord> output, Reporter reporter)
throws IOException {
@@ -337,6 +323,7 @@ Tool {
numDeletes++;
if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
try {
+ LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");
updateRequest.process(solr);
} catch (SolrServerException e) {
throw new IOException(e);
@@ -348,7 +335,9 @@ Tool {
}
public void dedup(String solrUrl) throws IOException {
- LOG.info("SolrDeleteDuplicates: starting...");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("SolrDeleteDuplicates: starting at " + sdf.format(start));
LOG.info("SolrDeleteDuplicates: Solr url: " + solrUrl);
JobConf job = new NutchJob(getConf());
@@ -362,8 +351,9 @@ Tool {
job.setReducerClass(SolrDeleteDuplicates.class);
JobClient.runJob(job);
-
- LOG.info("SolrDeleteDuplicates: done.");
+
+ long end = System.currentTimeMillis();
+ LOG.info("SolrDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public int run(String[] args) throws IOException {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Sat Jul 3 17:59:29 2010
@@ -16,11 +16,6 @@
*/
package org.apache.nutch.indexer.solr;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -37,9 +32,16 @@ import org.apache.nutch.indexer.IndexerM
import org.apache.nutch.indexer.NutchIndexWriterFactory;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
public class SolrIndexer extends Configured implements Tool {
public static Log LOG = LogFactory.getLog(SolrIndexer.class);
@@ -54,7 +56,9 @@ public class SolrIndexer extends Configu
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
List<Path> segments) throws IOException {
- LOG.info("SolrIndexer: starting");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("SolrIndexer: starting at " + sdf.format(start));
final JobConf job = new NutchJob(getConf());
job.setJobName("index-solr " + solrUrl);
@@ -76,13 +80,14 @@ public class SolrIndexer extends Configu
// do the commits once and for all the reducers in one go
SolrServer solr = new CommonsHttpSolrServer(solrUrl);
solr.commit();
- }
+ long end = System.currentTimeMillis();
+ LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ }
catch (Exception e){
LOG.error(e);
} finally {
FileSystem.get(job).delete(tmp, true);
}
- LOG.info("SolrIndexer: done");
}
public int run(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/parse/ParseSegment.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/parse/ParseSegment.java Sat Jul 3 17:59:29 2010
@@ -34,6 +34,7 @@ import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
import java.io.*;
+import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
@@ -42,7 +43,7 @@ public class ParseSegment extends Config
Mapper<WritableComparable, Content, Text, ParseImpl>,
Reducer<Text, Writable, Text, Writable> {
- public static final Log LOG = LogFactory.getLog(Parser.class);
+ public static final Log LOG = LogFactory.getLog(ParseSegment.class);
private ScoringFilters scfilters;
@@ -131,9 +132,11 @@ public class ParseSegment extends Config
public void parse(Path segment) throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("Parse: starting");
- LOG.info("Parse: segment: " + segment);
+ LOG.info("ParseSegment: starting at " + sdf.format(start));
+ LOG.info("ParseSegment: segment: " + segment);
}
JobConf job = new NutchJob(getConf());
@@ -151,7 +154,8 @@ public class ParseSegment extends Config
job.setOutputValueClass(ParseImpl.class);
JobClient.runJob(job);
- if (LOG.isInfoEnabled()) { LOG.info("Parse: done"); }
+ long end = System.currentTimeMillis();
+ LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.scoring.webgrap
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -62,6 +63,7 @@ import org.apache.nutch.scoring.webgraph
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* The LinkDumper tool creates a database of node to inlink information that can
@@ -346,7 +348,9 @@ public class LinkDumper
public void dumpLinks(Path webGraphDb)
throws IOException {
- LOG.info("NodeDumper: starting");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("NodeDumper: starting at " + sdf.format(start));
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
@@ -410,6 +414,8 @@ public class LinkDumper
}
fs.delete(tempInverted, true);
+ long end = System.currentTimeMillis();
+ LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.scoring.webgrap
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -64,6 +65,7 @@ import org.apache.nutch.scoring.webgraph
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
public class LinkRank
@@ -579,6 +581,10 @@ public class LinkRank
public void analyze(Path webGraphDb)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("Analysis: starting at " + sdf.format(start));
+
// store the link rank under the webgraphdb temporarily, final scores get
// updated into the nodedb
Path linkRank = new Path(webGraphDb, "linkrank");
@@ -606,8 +612,8 @@ public class LinkRank
float rankOneScore = (1f / (float)numLinks);
if (LOG.isInfoEnabled()) {
- LOG.info("Number of links " + numLinks);
- LOG.info("Rank One " + rankOneScore);
+ LOG.info("Analysis: Number of links: " + numLinks);
+ LOG.info("Analysis: Rank One: " + rankOneScore);
}
// run invert and analysis for a given number of iterations to allow the
@@ -616,7 +622,7 @@ public class LinkRank
for (int i = 0; i < numIterations; i++) {
// the input to inverting is always the previous output from analysis
- LOG.info("Running iteration " + (i + 1) + " of " + numIterations);
+ LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + numIterations);
Path tempRank = new Path(linkRank + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
fs.mkdirs(tempRank);
@@ -629,19 +635,20 @@ public class LinkRank
rankOneScore);
// replace the temporary NodeDb with the output from analysis
- LOG.info("Installing new link scores");
+ LOG.info("Analysis: Installing new link scores");
FSUtils.replace(fs, linkRank, tempRank, true);
- LOG.info("Finished analysis iteration " + (i + 1) + " of "
+ LOG.info("Analysis: finished iteration " + (i + 1) + " of "
+ numIterations);
}
// replace the NodeDb in the WebGraph with the final output of analysis
- LOG.info("Installing web graph nodes");
+ LOG.info("Analysis: Installing web graph nodes");
FSUtils.replace(fs, wgNodeDb, nodeDb, true);
// remove the temporary link rank folder
fs.delete(linkRank, true);
- LOG.info("Finished analysis");
+ long end = System.currentTimeMillis();
+ LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/Loops.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/Loops.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.scoring.webgrap
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -61,6 +62,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* The Loops job identifies cycles of loops inside of the web graph. This is
@@ -466,8 +468,10 @@ public class Loops
public void findLoops(Path webGraphDb)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("Loops: starting");
+ LOG.info("Loops: starting at " + sdf.format(start));
LOG.info("Loops: webgraphdb: " + webGraphDb);
}
@@ -495,11 +499,11 @@ public class Loops
init.setOutputFormat(SequenceFileOutputFormat.class);
try {
- LOG.info("Initializer: running");
+ LOG.info("Loops: starting initializer");
JobClient.runJob(init);
- LOG.info("Initializer: installing " + routes);
+ LOG.info("Loops: installing initializer " + routes);
FSUtils.replace(fs, routes, tempRoute, true);
- LOG.info("Initializer: finished");
+ LOG.info("Loops: finished initializer");
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
@@ -527,11 +531,11 @@ public class Loops
looper.setBoolean("last", i == (depth - 1));
try {
- LOG.info("Looper: running");
+ LOG.info("Loops: starting looper");
JobClient.runJob(looper);
- LOG.info("Looper: installing " + routes);
+ LOG.info("Loops: installing looper " + routes);
FSUtils.replace(fs, routes, tempRoute, true);
- LOG.info("Looper: finished");
+ LOG.info("Loops: finished looper");
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
@@ -554,14 +558,16 @@ public class Loops
finalizer.setOutputFormat(MapFileOutputFormat.class);
try {
- LOG.info("Finalizer: running");
+ LOG.info("Loops: starting finalizer");
JobClient.runJob(finalizer);
- LOG.info("Finalizer: finished");
+ LOG.info("Loops: finished finalizer");
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
+ long end = System.currentTimeMillis();
+ LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import org.apache.commons.cli.CommandLine;
@@ -49,6 +50,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* A tool that dumps out the top urls by number of inlinks, number of outlinks,
@@ -152,9 +154,6 @@ public class NodeDumper
*
* @param webGraphDb The WebGraph from which to pull values.
*
- * @param inlinks
- * @param outlinks
- * @param scores
* @param topN
* @param output
*
@@ -163,7 +162,9 @@ public class NodeDumper
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output)
throws IOException {
- LOG.info("NodeDumper: starting");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("NodeDumper: starting at " + sdf.format(start));
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Configuration conf = getConf();
@@ -193,6 +194,8 @@ public class NodeDumper
LOG.error(StringUtils.stringifyException(e));
throw e;
}
+ long end = System.currentTimeMillis();
+ LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Random;
@@ -53,6 +54,7 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* Updates the score from the WebGraph node database into the crawl database.
@@ -151,6 +153,10 @@ public class ScoreUpdater
public void update(Path crawlDb, Path webGraphDb)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("ScoreUpdater: starting at " + sdf.format(start));
+
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
@@ -190,8 +196,11 @@ public class ScoreUpdater
}
// install the temp crawl database
- LOG.info("Installing new crawldb " + crawlDb);
+ LOG.info("ScoreUpdater: installing new crawldb " + crawlDb);
CrawlDb.install(updater, crawlDb);
+
+ long end = System.currentTimeMillis();
+ LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -63,6 +64,7 @@ import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -437,8 +439,10 @@ public class WebGraph
public void createWebGraph(Path webGraphDb, Path[] segments)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("WebGraphDb: starting");
+ LOG.info("WebGraphDb: starting at " + sdf.format(start));
LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
}
@@ -590,6 +594,9 @@ public class WebGraph
// remove the lock file for the webgraph
LockUtil.removeLockFile(fs, lock);
+
+ long end = System.currentTimeMillis();
+ LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args)
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/CrawlDBScanner.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/CrawlDBScanner.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/CrawlDBScanner.java Sat Jul 3 17:59:29 2010
@@ -17,6 +17,7 @@
package org.apache.nutch.tools;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import org.apache.commons.logging.Log;
@@ -43,6 +44,7 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* Dumps all the entries matching a regular expression on their URL. Generates a
@@ -99,6 +101,10 @@ public class CrawlDBScanner extends Conf
private void scan(Path crawlDb, Path outputPath, String regex, String status,
boolean text) throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("CrawlDB scanner: starting at " + sdf.format(start));
+
JobConf job = new NutchJob(getConf());
job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex);
@@ -139,6 +145,9 @@ public class CrawlDBScanner extends Conf
} catch (IOException e) {
throw e;
}
+
+ long end = System.currentTimeMillis();
+ LOG.info("CrawlDb scanner: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String args[]) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/FreeGenerator.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.tools;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
@@ -50,6 +51,7 @@ import org.apache.nutch.net.URLNormalize
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* This tool generates fetchlists (segments to be fetched) from plain text
@@ -157,6 +159,10 @@ public class FreeGenerator extends Confi
}
}
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("FreeGenerator: starting at " + sdf.format(start));
+
JobConf job = new NutchJob(getConf());
job.setBoolean(FILTER_KEY, filter);
job.setBoolean(NORMALIZE_KEY, normalize);
@@ -177,11 +183,13 @@ public class FreeGenerator extends Confi
new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
try {
JobClient.runJob(job);
- return 0;
} catch (Exception e) {
LOG.fatal("FAILED: " + StringUtils.stringifyException(e));
return -1;
}
+ long end = System.currentTimeMillis();
+ LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ return 0;
}
public static void main(String[] args) throws Exception {
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Sat Jul 3 17:59:29 2010
@@ -59,6 +59,7 @@ import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
/**
* <p>The <code>ArcSegmentCreator</code> is a replacement for fetcher that will
@@ -346,8 +347,10 @@ public class ArcSegmentCreator
public void createSegments(Path arcFiles, Path segmentsOutDir)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
- LOG.info("ArcSegmentCreator: starting");
+ LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
}
@@ -364,9 +367,9 @@ public class ArcSegmentCreator
job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
- if (LOG.isInfoEnabled()) {
- LOG.info("ArcSegmentCreator: done");
- }
+
+ long end = System.currentTimeMillis();
+ LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
public static void main(String args[])
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Sat Jul 3 17:59:29 2010
@@ -18,6 +18,7 @@
package org.apache.nutch.tools.compat;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Random;
@@ -47,6 +48,7 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
/**
* This tool converts CrawlDb created in old <UTF8, CrawlDatum> format
@@ -129,6 +131,10 @@ public class CrawlDbConverter extends Co
if (args.length > 2 && args[2].equalsIgnoreCase("-withMetadata"))
withMetadata = true;
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("CrawlDbConverter: starting at " + sdf.format(start));
+
job.setBoolean(CONVERT_META_KEY, withMetadata);
FileInputFormat.addInputPath(job, oldDb);
job.setInputFormat(SequenceFileInputFormat.class);
@@ -140,10 +146,13 @@ public class CrawlDbConverter extends Co
try {
JobClient.runJob(job);
CrawlDb.install(job, new Path(args[1]));
- return 0;
} catch (Exception e) {
LOG.fatal("Error: " + StringUtils.stringifyException(e));
return -1;
}
+
+ long end = System.currentTimeMillis();
+ LOG.info("CrawlDb scanner: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ return 0;
}
}
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.tools.compat;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -59,6 +60,7 @@ import org.apache.nutch.scoring.webgraph
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -157,13 +159,17 @@ public class ReprUrlFixer
public void update(Path crawlDb, Path[] segments)
throws IOException {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("ReprUrlFixer: starting at " + sdf.format(start));
+
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
// run the crawl database through the repr fixer
if (crawlDb != null) {
- LOG.info("Running ReprUtilFixer " + crawlDb);
+ LOG.info("ReprUrlFixer: crawlDb " + crawlDb);
Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
Path newCrawlDb = new Path(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -180,7 +186,7 @@ public class ReprUrlFixer
try {
JobClient.runJob(updater);
- LOG.info("Installing new crawldb " + crawlDb);
+ LOG.info("ReprUrlFixer: installing new crawldb " + crawlDb);
CrawlDb.install(updater, crawlDb);
}
catch (IOException e) {
@@ -196,13 +202,13 @@ public class ReprUrlFixer
for (int i = 0; i < segments.length; i++) {
Path segment = segments[i];
- LOG.info("Running ReprUtilFixer " + segment + " fetch");
+ LOG.info("ReprUrlFixer: fetching segment " + segment);
Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf fetch = new NutchJob(conf);
- fetch.setJobName("ReprUtilFixer: " + segment.toString());
+ fetch.setJobName("ReprUrlFixer: " + segment.toString());
FileInputFormat.addInputPath(fetch, segFetch);
FileOutputFormat.setOutputPath(fetch, newSegFetch);
fetch.setInputFormat(SequenceFileInputFormat.class);
@@ -213,7 +219,7 @@ public class ReprUrlFixer
try {
JobClient.runJob(fetch);
- LOG.info("Installing new segment fetch directory " + newSegFetch);
+ LOG.info("ReprUrlFixer: installing new segment fetch directory " + newSegFetch);
FSUtils.replace(fs, segFetch, newSegFetch, true);
LOG.info("ReprUrlFixer: finished installing segment fetch directory");
}
@@ -222,13 +228,13 @@ public class ReprUrlFixer
throw e;
}
- LOG.info("Running ReprUtilFixer " + segment + " parse");
+ LOG.info("ReprUrlFixer: parsing segment " + segment);
Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf parse = new NutchJob(conf);
- parse.setJobName("ReprUtilFixer: " + segment.toString());
+ parse.setJobName("ReprUrlFixer: " + segment.toString());
FileInputFormat.addInputPath(parse, segParse);
FileOutputFormat.setOutputPath(parse, newSegParse);
parse.setInputFormat(SequenceFileInputFormat.class);
@@ -239,7 +245,7 @@ public class ReprUrlFixer
try {
JobClient.runJob(parse);
- LOG.info("Installing new segment parse directry " + newSegParse);
+ LOG.info("ReprUrlFixer: installing new segment parse directry " + newSegParse);
FSUtils.replace(fs, segParse, newSegParse, true);
LOG.info("ReprUrlFixer: finished installing segment parse directory");
}
@@ -249,6 +255,9 @@ public class ReprUrlFixer
}
}
}
+
+ long end = System.currentTimeMillis();
+ LOG.info("ReprUrlFixer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
/**
Added: nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java?rev=960248&view=auto
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java (added)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java Sat Jul 3 17:59:29 2010
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.text.NumberFormat;
+
+public class TimingUtil {
+
+ private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 }; // milliseconds per hour, minute, second
+
+ /**
+ * Calculate the elapsed time between two times specified in milliseconds and format it as hours, minutes and seconds.
+ * @param start The start of the time period, in milliseconds
+ * @param end The end of the time period, in milliseconds
+ * @return a string of the form "HH:mm:ss" (hours, minutes, seconds, each padded to at least two digits) or null if start > end.
+ */
+ public static String elapsedTime(long start, long end){
+ if (start > end) {
+ return null;
+ }
+
+ long[] elapsedTime = new long[TIME_FACTOR.length];
+
+ for (int i = 0; i < TIME_FACTOR.length; i++) {
+ elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i]; // start <= end holds on every pass (guard above), so -1 is never stored
+ start += TIME_FACTOR[i] * elapsedTime[i]; // advance start past the whole units just counted; the remainder feeds the next, smaller unit
+ }
+
+ NumberFormat nf = NumberFormat.getInstance(); // general-purpose number format for the default locale
+ nf.setMinimumIntegerDigits(2); // pad each field to at least two digits
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < elapsedTime.length; i++) {
+ if (i > 0) {
+ buf.append(":");
+ }
+ buf.append(nf.format(elapsedTime[i]));
+ }
+ return buf.toString();
+ }
+}
Propchange: nutch/branches/branch-1.2/src/java/org/apache/nutch/util/TimingUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: nutch/branches/branch-1.2/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.2/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=960248&r1=960247&r2=960248&view=diff
==============================================================================
--- nutch/branches/branch-1.2/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/branches/branch-1.2/src/java/org/apache/nutch/util/domain/DomainStatistics.java Sat Jul 3 17:59:29 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.util.domain;
import java.io.IOException;
import java.net.URL;
+import java.text.SimpleDateFormat;
import java.util.Iterator;
import org.apache.commons.logging.Log;
@@ -43,6 +44,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
@@ -70,7 +72,7 @@ implements Tool, Mapper<Text, CrawlDatum
public int run(String[] args) throws IOException {
if (args.length < 3) {
- System.out.println("usage: inputDirs outDir host|domain|suffix [numOfReducer]");
+ System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix [numOfReducer]");
return 1;
}
String inputDir = args[0];
@@ -81,6 +83,10 @@ implements Tool, Mapper<Text, CrawlDatum
numOfReducers = Integer.parseInt(args[3]);
}
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("DomainStatistics: starting at " + sdf.format(start));
+
JobConf job = new NutchJob(getConf());
job.setJobName("Domain statistics");
@@ -112,6 +118,8 @@ implements Tool, Mapper<Text, CrawlDatum
JobClient.runJob(job);
+ long end = System.currentTimeMillis();
+ LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
return 0;
}
@@ -122,12 +130,10 @@ implements Tool, Mapper<Text, CrawlDatum
}
- @Override
public Configuration getConf() {
return conf;
}
- @Override
public void setConf(Configuration conf) {
this.conf = conf;
}