You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/11 17:20:02 UTC

svn commit: r1638203 - in /nutch: branches/2.x/src/bin/crawl trunk/src/bin/crawl

Author: snagel
Date: Tue Nov 11 16:20:01 2014
New Revision: 1638203

URL: http://svn.apache.org/r1638203
Log:
NUTCH-1883 bin/crawl: when generate returns 1 (no new segments created), break the loop instead of exiting with an error

Modified:
    nutch/branches/2.x/src/bin/crawl
    nutch/trunk/src/bin/crawl

Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -137,7 +137,22 @@ do
   batchId=`date +%s`-$RANDOM
 
   echo "Generating a new fetchlist"
-  __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId
+  generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId)
+  echo "$bin/nutch generate ${generate_args[@]}"
+  $bin/nutch generate "${generate_args[@]}"
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+      : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+    echo "Generate returned 1 (no new segments created)"
+    echo "Escaping loop: no more URLs to fetch now"
+    break
+  else
+    echo "Error running:"
+    echo "  $bin/nutch generate ${generate_args[@]}"
+    echo "Failed with exit value $RETCODE."
+    exit $RETCODE
+  fi
 
   echo "Fetching : "
   __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId "$CRAWL_ID" -threads 50

Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -133,7 +133,22 @@ do
   echo `date` ": Iteration $a of $LIMIT"
 
   echo "Generating a new segment"
-  __bin_nutch generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+  echo "$bin/nutch generate ${generate_args[@]}"
+  $bin/nutch generate "${generate_args[@]}"
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+      : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+    echo "Generate returned 1 (no new segments created)"
+    echo "Escaping loop: no more URLs to fetch now"
+    break
+  else
+    echo "Error running:"
+    echo "  $bin/nutch generate ${generate_args[@]}"
+    echo "Failed with exit value $RETCODE."
+    exit $RETCODE
+  fi
 
   # capture the name of the segment
   # call hadoop in distributed mode
@@ -168,7 +183,7 @@ do
   __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
 
   echo "Dedup on crawldb"
-  $bin/nutch dedup $CRAWL_PATH/crawldb
+  __bin_nutch dedup "$CRAWL_PATH"/crawldb
 
   if [ -n "$SOLRURL" ]; then
       echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"