You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2013/05/30 16:29:48 UTC

svn commit: r1487875 - in /nutch/branches/2.x: CHANGES.txt src/bin/crawl src/java/org/apache/nutch/crawl/GeneratorJob.java

Author: fenglu
Date: Thu May 30 14:29:48 2013
New Revision: 1487875

URL: http://svn.apache.org/r1487875
Log:
NUTCH-1545 capture batchId and remove references to segments in 2.x crawl script. 

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/bin/crawl
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu May 30 14:29:48 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1545 capture batchId and remove references to segments in 2.x crawl script. (Feng)
+
 * NUTCH-1575 support solr authentication in nutch 2.x (Feng)
 
 * NUTCH-1569 Upgrade 2.x to Gora 0.3 (lewismc)

Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Thu May 30 14:29:48 2013
@@ -19,7 +19,7 @@
 #
 # 
 # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND 
-# INDEXING FOR EACH SEGMENT
+# INDEXING FOR EACH BATCH
 
 SEEDDIR="$1"
 CRAWL_ID="$2"
@@ -111,33 +111,35 @@ do
 
   echo `date` ": Iteration $a of $LIMIT"
 
+  echo "Generating batchId"
+  batchId=`date +%s`-$RANDOM
+
   echo "Generating a new fetchlist"
-  $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter -adddays $addDays
+  $bin/nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId $CRAWL_ID -batchId $batchId
   
   if [ $? -ne 0 ] 
   then exit $? 
   fi
 
-  # TODO capture the batchID
   echo "Fetching : "
-  $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch -all -crawlId $CRAWL_ID -threads 50
+  $bin/nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId $CRAWL_ID -threads 50
 
   if [ $? -ne 0 ] 
   then exit $? 
   fi
 
-  # parsing the segment
+  # parsing the batch
   echo "Parsing : "
   # enable the skipping of records for the parsing so that a dodgy document 
   # so that it does not fail the full task
   skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
-  $bin/nutch parse $commonOptions $skipRecordsOptions -all -crawlId $CRAWL_ID
+  $bin/nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId $CRAWL_ID
 
   if [ $? -ne 0 ] 
   then exit $? 
   fi
 
-  # updatedb with this segment
+  # updatedb with this batch
   echo "CrawlDB update"
   $bin/nutch updatedb $commonOptions
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1487875&r1=1487874&r2=1487875&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Thu May 30 14:29:48 2013
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -122,10 +122,10 @@ public class GeneratorJob extends NutchT
         return false;
       return true;
     }
-		
+
     /**
      * Sets url with score on this writable. Allows for writable reusing.
-     * 
+     *
      * @param url
      * @param score
      */
@@ -175,10 +175,7 @@ public class GeneratorJob extends NutchT
       getConf().setLong(GENERATOR_TOP_N, topN);
     if (filter != null)
       getConf().setBoolean(GENERATOR_FILTER, filter);
-    int randomSeed = Math.abs(new Random().nextInt());
-    batchId = (curTime / 1000) + "-" + randomSeed;
-    getConf().setInt(GENERATOR_RANDOM_SEED, randomSeed);
-    getConf().set(BATCH_ID, batchId);
+
     getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
     if (norm != null)
       getConf().setBoolean(GENERATOR_NORMALISE, norm);
@@ -194,19 +191,17 @@ public class GeneratorJob extends NutchT
     }
     numJobs = 1;
     currentJobNum = 0;
-    currentJob = new NutchJob(getConf(), "generate: " + batchId);
+    currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
     Collection<WebPage.Field> fields = getFields(currentJob);
     StorageUtils.initMapperJob(currentJob, fields, SelectorEntry.class,
         WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true);
     StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
     currentJob.waitForCompletion(true);
     ToolUtil.recordJobStatus(null, currentJob, results);
-    results.put(BATCH_ID, batchId);
+    results.put(BATCH_ID, getConf().get(BATCH_ID));
     return results;
   }
-  
-  private String batchId;
-  
+
   /**
    * Mark URLs ready for fetching.
    * @throws ClassNotFoundException
@@ -230,7 +225,7 @@ public class GeneratorJob extends NutchT
         Nutch.ARG_CURTIME, curTime,
         Nutch.ARG_FILTER, filter,
         Nutch.ARG_NORMALIZE, norm));
-    batchId =  getConf().get(BATCH_ID);
+    String batchId =  getConf().get(BATCH_ID);
     long finish = System.currentTimeMillis();
     LOG.info("GeneratorJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
     LOG.info("GeneratorJob: generated batch id: " + batchId);
@@ -246,6 +241,7 @@ public class GeneratorJob extends NutchT
       System.out.println("    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true ");
       System.out.println("    -adddays       - Adds numDays to the current time to facilitate crawling urls already");
       System.out.println("                     fetched sooner then db.fetch.interval.default. Default value is 0.");
+      System.out.println("    -batchId       - the batch id ");
       System.out.println("----------------------");
       System.out.println("Please set the params.");
       return -1;
@@ -254,6 +250,11 @@ public class GeneratorJob extends NutchT
     long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
     boolean filter = true, norm = true;
 
+    // generate batchId
+    int randomSeed = Math.abs(new Random().nextInt());
+    String batchId = (curTime / 1000) + "-" + randomSeed;
+    getConf().set(BATCH_ID, batchId);
+
     for (int i = 0; i < args.length; i++) {
       if ("-topN".equals(args[i])) {
         topN = Long.parseLong(args[++i]);
@@ -266,6 +267,11 @@ public class GeneratorJob extends NutchT
       } else if ("-adddays".equals(args[i])) {
         long numDays = Integer.parseInt(args[++i]);
         curTime += numDays * 1000L * 60 * 60 * 24;
+      }else if ("-batchId".equals(args[i]))
+        getConf().set(BATCH_ID,args[++i]);
+      else {
+        System.err.println("Unrecognized arg " + args[i]);
+        return -1;
       }
     }