You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2014/09/03 22:43:03 UTC
svn commit: r1622354 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/bin/crawl
Author: mattmann
Date: Wed Sep 3 20:43:03 2014
New Revision: 1622354
URL: http://svn.apache.org/r1622354
Log:
Fix for NUTCH-1832: Nutch should run without an indexer.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/bin/crawl
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Sep 3 20:43:03 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1832 Make Nutch work without an indexer (mattmann)
+
* NUTCH-1828 bin/crawl : incorrect handling of nutch errors (Mathieu Bouchard via jnioche)
* NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Sep 3 20:43:03 2014
@@ -1042,7 +1042,7 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+ <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Wed Sep 3 20:43:03 2014
@@ -23,8 +23,16 @@
SEEDDIR="$1"
CRAWL_PATH="$2"
-SOLRURL="$3"
-LIMIT="$4"
+if [ "$#" -eq 3 ]; then
+ LIMIT="$3"
+elif [ "$#" -eq 4 ]; then
+ SOLRURL="$3"
+ LIMIT="$4"
+else
+ echo "Unknown # of arguments $#"
+ echo "Usage: crawl <seedDir> <crawlDir> [<solrUrl>] <numberOfRounds>"
+ exit -1;
+fi
if [ "$SEEDDIR" = "" ]; then
echo "Missing seedDir : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
@@ -37,8 +45,7 @@ if [ "$CRAWL_PATH" = "" ]; then
fi
if [ "$SOLRURL" = "" ]; then
- echo "Missing SOLRURL : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
- exit -1;
+ echo "No SOLRURL specified. Skipping indexing."
fi
if [ "$LIMIT" = "" ]; then
@@ -166,7 +173,7 @@ do
"$bin/nutch" invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
RETCODE=$?
- if [ $RETCODE -ne 0 ]
+ if [ $RETCODE -ne 0 ]
then exit $RETCODE
fi
@@ -178,20 +185,23 @@ do
then exit $RETCODE
fi
- echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
- "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
- RETCODE=$?
-
- if [ $RETCODE -ne 0 ]
- then exit $RETCODE
- fi
-
- echo "Cleanup on SOLR index -> $SOLRURL"
- "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
- RETCODE=$?
+ if [ -n "$SOLRURL" ]; then
+ echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
+ "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+ RETCODE=$?
+ if [ $RETCODE -ne 0 ]; then
+ exit $RETCODE
+ fi
- if [ $RETCODE -ne 0 ]
- then exit $RETCODE
+ echo "Cleanup on SOLR index -> $SOLRURL"
+ "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
+ RETCODE=$?
+
+ if [ $RETCODE -ne 0 ]; then
+ exit $RETCODE
+ fi
+ else
+ echo "Skipping indexing: no SOLR url provided."
fi
done