You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/11 17:20:02 UTC
svn commit: r1638203 - in /nutch: branches/2.x/src/bin/crawl trunk/src/bin/crawl
Author: snagel
Date: Tue Nov 11 16:20:01 2014
New Revision: 1638203
URL: http://svn.apache.org/r1638203
Log:
NUTCH-1883 in case of generate: break loop and do not exit with error
Modified:
nutch/branches/2.x/src/bin/crawl
nutch/trunk/src/bin/crawl
Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -137,7 +137,22 @@ do
batchId=`date +%s`-$RANDOM
echo "Generating a new fetchlist"
- __bin_nutch generate $commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId
+ generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
echo "Fetching : "
__bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId "$CRAWL_ID" -threads 50
Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1638203&r1=1638202&r2=1638203&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Tue Nov 11 16:20:01 2014
@@ -133,7 +133,22 @@ do
echo `date` ": Iteration $a of $LIMIT"
echo "Generating a new segment"
- __bin_nutch generate $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
# capture the name of the segment
# call hadoop in distributed mode
@@ -168,7 +183,7 @@ do
__bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Dedup on crawldb"
- $bin/nutch dedup $CRAWL_PATH/crawldb
+ __bin_nutch dedup "$CRAWL_PATH"/crawldb
if [ -n "$SOLRURL" ]; then
echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"