You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/01/07 21:57:13 UTC
svn commit: r1723626 - in /nutch/branches/2.x: CHANGES.txt
src/java/org/apache/nutch/crawl/GeneratorJob.java
Author: snagel
Date: Thu Jan 7 20:57:13 2016
New Revision: 1723626
URL: http://svn.apache.org/viewvc?rev=1723626&view=rev
Log:
NUTCH-2143 GeneratorJob ignores batch id passed as argument
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1723626&r1=1723625&r2=1723626&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jan 7 20:57:13 2016
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 2.3.1 Release 22092015 (ddmmyyyy)
Release Report - http://s.apache.org/nutch_2.3.1
+* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
+
* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1723626&r1=1723625&r2=1723626&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu Jan 7 20:57:13 2016
@@ -163,17 +163,20 @@ public class GeneratorJob extends NutchT
return fields;
}
+ /** Generate a random batch id */
+ public static String randomBatchId() {
+ long curTime = System.currentTimeMillis();
+ int randomSeed = Math.abs(new Random().nextInt());
+ String batchId = (curTime / 1000) + "-" + randomSeed;
+ return batchId;
+ }
+
public Map<String, Object> run(Map<String, Object> args) throws Exception {
String batchId = (String) args.get(Nutch.ARG_BATCH);
- if (batchId != null) {
- getConf().set(GeneratorJob.BATCH_ID, batchId);
- } else {
- // generate batchId
- long curTime = System.currentTimeMillis();
- int randomSeed = Math.abs(new Random().nextInt());
- batchId = (curTime / 1000) + "-" + randomSeed;
- getConf().set(BATCH_ID, batchId);
+ if (batchId == null) {
+ batchId = randomBatchId();
}
+ getConf().set(BATCH_ID, batchId);
// map to inverted subset due for fetch, sort by score
Long topN = null;
@@ -249,10 +252,15 @@ public class GeneratorJob extends NutchT
if (topN != Long.MAX_VALUE) {
LOG.info("GeneratorJob: topN: " + topN);
}
+ String batchId = getConf().get(BATCH_ID);
Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
- Nutch.ARG_NORMALIZE, norm));
- String batchId = getConf().get(BATCH_ID);
+ Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+ if (batchId == null) {
+ // use generated random batch id
+ batchId = (String) results.get(BATCH_ID);
+ }
+
long finish = System.currentTimeMillis();
long generateCount = (Long) results.get(GENERATE_COUNT);
LOG.info("GeneratorJob: finished at " + sdf.format(finish)
@@ -290,11 +298,6 @@ public class GeneratorJob extends NutchT
long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
boolean filter = true, norm = true;
- // generate batchId
- int randomSeed = Math.abs(new Random().nextInt());
- String batchId = (curTime / 1000) + "-" + randomSeed;
- getConf().set(BATCH_ID, batchId);
-
for (int i = 0; i < args.length; i++) {
if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[++i]);
@@ -307,9 +310,9 @@ public class GeneratorJob extends NutchT
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[++i]);
curTime += numDays * 1000L * 60 * 60 * 24;
- } else if ("-batchId".equals(args[i]))
+ } else if ("-batchId".equals(args[i])) {
getConf().set(BATCH_ID, args[++i]);
- else {
+ } else {
System.err.println("Unrecognized arg " + args[i]);
return -1;
}