You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/08/29 13:19:40 UTC
svn commit: r1621284 - in /nutch/trunk: CHANGES.txt src/bin/crawl
Author: jnioche
Date: Fri Aug 29 11:19:37 2014
New Revision: 1621284
URL: http://svn.apache.org/r1621284
Log:
NUTCH-1828 bin/crawl : incorrect handling of nutch errors
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/crawl
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1621284&r1=1621283&r2=1621284&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 29 11:19:37 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1828 bin/crawl : incorrect handling of nutch errors (Mathieu Bouchard via jnioche)
+
* NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel)
* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)
Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1621284&r1=1621283&r2=1621284&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Fri Aug 29 11:19:37 2014
@@ -92,9 +92,10 @@ fi
# initial injection
"$bin/nutch" inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+RETCODE=$?
-if [ $? -ne 0 ]
- then exit $?
+if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
@@ -111,9 +112,10 @@ do
echo "Generating a new segment"
"$bin/nutch" generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
# capture the name of the segment
@@ -131,9 +133,10 @@ do
# fetching the segment
echo "Fetching : $SEGMENT"
"$bin/nutch" fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
# parsing the segment
@@ -142,47 +145,53 @@ do
# so that it does not fail the full task
skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
"$bin/nutch" parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
# updatedb with this segment
echo "CrawlDB update"
"$bin/nutch" updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
# note that the link inversion - indexing routine can be done within the main loop
# on a per segment basis
echo "Link inversion"
"$bin/nutch" invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
echo "Dedup on crawldb"
$bin/nutch dedup $CRAWL_PATH/crawldb
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
"$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
echo "Cleanup on SOLR index -> $SOLRURL"
"$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
+ RETCODE=$?
- if [ $? -ne 0 ]
- then exit $?
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
fi
done