You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2013/05/30 16:29:48 UTC
svn commit: r1487875 - in /nutch/branches/2.x: CHANGES.txt src/bin/crawl
src/java/org/apache/nutch/crawl/GeneratorJob.java
Author: fenglu
Date: Thu May 30 14:29:48 2013
New Revision: 1487875
URL: http://svn.apache.org/r1487875
Log:
NUTCH-1545 capture batchId and remove references to segments in 2.x crawl script.
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/crawl
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu May 30 14:29:48 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1545 capture batchId and remove references to segments in 2.x crawl script. (Feng)
+
* NUTCH-1575 support solr authentication in nutch 2.x (Feng)
* NUTCH-1569 Upgrade 2.x to Gora 0.3 (lewismc)
Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Thu May 30 14:29:48 2013
@@ -19,7 +19,7 @@
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
-# INDEXING FOR EACH SEGMENT
+# INDEXING FOR EACH BATCH
SEEDDIR="$1"
CRAWL_ID="$2"
@@ -111,33 +111,35 @@ do
echo `date` ": Iteration $a of $LIMIT"
+ echo "Generating batchId"
+ batchId=`date +%s`-$RANDOM
+
echo "Generating a new fetchlist"
- $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter -adddays $addDays
+ $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
if [ $? -ne 0 ]
then exit $?
fi
- # TODO capture the batchID
echo "Fetching : "
- $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch -all -crawlId $CRAWL_ID -threads 50
+ $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50
if [ $? -ne 0 ]
then exit $?
fi
- # parsing the segment
+ # parsing the batch
echo "Parsing : "
# enable the skipping of records for the parsing so that a dodgy document
# does not fail the full task
skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
- $bin/nutch parse $commonOptions $skipRecordsOptions -all -crawlId $CRAWL_ID
+ $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID
if [ $? -ne 0 ]
then exit $?
fi
- # updatedb with this segment
+ # updatedb with this batch
echo "CrawlDB update"
$bin/nutch updatedb $commonOptions
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu May 30 14:29:48 2013
@@ -5,9 +5,9 @@
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -122,10 +122,10 @@ public class GeneratorJob extends NutchT
return false;
return true;
}
-
+
/**
* Sets url with score on this writable. Allows for writable reusing.
- *
+ *
* @param url
* @param score
*/
@@ -175,10 +175,7 @@ public class GeneratorJob extends NutchT
getConf().setLong(GENERATOR_TOP_N, topN);
if (filter != null)
getConf().setBoolean(GENERATOR_FILTER, filter);
- int randomSeed = Math.abs(new Random().nextInt());
- batchId = (curTime / 1000) + "-" + randomSeed;
- getConf().setInt(GENERATOR_RANDOM_SEED, randomSeed);
- getConf().set(BATCH_ID, batchId);
+
getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
if (norm != null)
getConf().setBoolean(GENERATOR_NORMALISE, norm);
@@ -194,19 +191,17 @@ public class GeneratorJob extends NutchT
}
numJobs = 1;
currentJobNum = 0;
- currentJob = new NutchJob(getConf(), "generate: " + batchId);
+ currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
Collection<WebPage.Field> fields = getFields(currentJob);
StorageUtils.initMapperJob(currentJob, fields, SelectorEntry.class,
WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true);
StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
currentJob.waitForCompletion(true);
ToolUtil.recordJobStatus(null, currentJob, results);
- results.put(BATCH_ID, batchId);
+ results.put(BATCH_ID, getConf().get(BATCH_ID));
return results;
}
-
- private String batchId;
-
+
/**
* Mark URLs ready for fetching.
* @throws ClassNotFoundException
@@ -230,7 +225,7 @@ public class GeneratorJob extends NutchT
Nutch.ARG_CURTIME, curTime,
Nutch.ARG_FILTER, filter,
Nutch.ARG_NORMALIZE, norm));
- batchId = getConf().get(BATCH_ID);
+ String batchId = getConf().get(BATCH_ID);
long finish = System.currentTimeMillis();
LOG.info("GeneratorJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
LOG.info("GeneratorJob: generated batch id: " + batchId);
@@ -246,6 +241,7 @@ public class GeneratorJob extends NutchT
System.out.println(" -noNorm - do not activate the normalizer plugin to normalize the url, default is true ");
System.out.println(" -adddays - Adds numDays to the current time to facilitate crawling urls already");
System.out.println(" fetched sooner then db.fetch.interval.default. Default value is 0.");
+ System.out.println(" -batchId - the batch id ");
System.out.println("----------------------");
System.out.println("Please set the params.");
return -1;
@@ -254,6 +250,11 @@ public class GeneratorJob extends NutchT
long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
boolean filter = true, norm = true;
+ // generate batchId
+ int randomSeed = Math.abs(new Random().nextInt());
+ String batchId = (curTime / 1000) + "-" + randomSeed;
+ getConf().set(BATCH_ID, batchId);
+
for (int i = 0; i < args.length; i++) {
if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[++i]);
@@ -266,6 +267,11 @@ public class GeneratorJob extends NutchT
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[++i]);
curTime += numDays * 1000L * 60 * 60 * 24;
+ }else if ("-batchId".equals(args[i]))
+ getConf().set(BATCH_ID,args[++i]);
+ else {
+ System.err.println("Unrecognized arg " + args[i]);
+ return -1;
}
}