You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/08/29 13:19:40 UTC

svn commit: r1621284 - in /nutch/trunk: CHANGES.txt src/bin/crawl

Author: jnioche
Date: Fri Aug 29 11:19:37 2014
New Revision: 1621284

URL: http://svn.apache.org/r1621284
Log:
NUTCH-1828 bin/crawl : incorrect handling of nutch errors

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/crawl

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1621284&r1=1621283&r2=1621284&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 29 11:19:37 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1828 bin/crawl : incorrect handling of nutch errors (Mathieu Bouchard via jnioche)
+
 * NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel)
 
 * NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)

Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1621284&r1=1621283&r2=1621284&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Fri Aug 29 11:19:37 2014
@@ -92,9 +92,10 @@ fi
 
 # initial injection
 "$bin/nutch" inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+RETCODE=$?
 
-if [ $? -ne 0 ] 
-  then exit $? 
+if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
 fi
 
 
@@ -111,9 +112,10 @@ do
 
   echo "Generating a new segment"
   "$bin/nutch" generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+  RETCODE=$?
   
-  if [ $? -ne 0 ] 
-  then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   # capture the name of the segment
@@ -131,9 +133,10 @@ do
   # fetching the segment
   echo "Fetching : $SEGMENT"
   "$bin/nutch" fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+  RETCODE=$?
 
-  if [ $? -ne 0 ] 
-  then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   # parsing the segment
@@ -142,47 +145,53 @@ do
   # so that it does not fail the full task
   skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
   "$bin/nutch" parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+  RETCODE=$?
 
-  if [ $? -ne 0 ] 
-  then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   # updatedb with this segment
   echo "CrawlDB update"
   "$bin/nutch" updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
+  RETCODE=$?
 
-  if [ $? -ne 0 ] 
-  then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
 # note that the link inversion - indexing routine can be done within the main loop 
 # on a per segment basis
   echo "Link inversion"
   "$bin/nutch" invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+  RETCODE=$?
 
-  if [ $? -ne 0 ] 
-  then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   echo "Dedup on crawldb"
   $bin/nutch dedup $CRAWL_PATH/crawldb
+  RETCODE=$?
   
-  if [ $? -ne 0 ] 
-   then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
   "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+  RETCODE=$?
   
-  if [ $? -ne 0 ] 
-   then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
   echo "Cleanup on SOLR index -> $SOLRURL"
   "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
+  RETCODE=$?
   
-  if [ $? -ne 0 ] 
-   then exit $? 
+  if [ $RETCODE -ne 0 ] 
+  then exit $RETCODE 
   fi
 
 done