You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/09/17 19:49:38 UTC
svn commit: r1625715 - in /nutch/branches/2.x: CHANGES.txt conf/nutch-default.xml src/bin/crawl
Author: lewismc
Date: Wed Sep 17 17:49:37 2014
New Revision: 1625715
URL: http://svn.apache.org/r1625715
Log:
NUTCH-1832 Make Nutch work without an indexer
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/bin/crawl
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Sep 17 17:49:37 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1832 Make Nutch work without an indexer (mattmann via lewismc)
+
* NUTCH-1840 the describe function in SolrIndexWriter is not correct (kaveh minooie via jnioche)
* NUTCH-1837 Upgrade to Tika 1.6 (lewismc)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Wed Sep 17 17:49:37 2014
@@ -874,7 +874,7 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+ <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Wed Sep 17 17:49:37 2014
@@ -23,26 +23,33 @@
SEEDDIR="$1"
CRAWL_ID="$2"
-SOLRURL="$3"
-LIMIT="$4"
+if [ "$#" -eq 3 ]; then
+ LIMIT="$3"
+elif [ "$#" -eq 4 ]; then
+ SOLRURL="$3"
+ LIMIT="$4"
+else
+ echo "Unknown # of arguments $#"
+ echo "Usage: crawl <seedDir> <crawlDir> [<solrUrl>] <numberOfRounds>"
+ exit -1;
+fi
if [ "$SEEDDIR" = "" ]; then
- echo "Missing seedDir : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+ echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
if [ "$CRAWL_ID" = "" ]; then
- echo "Missing crawlDir : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+ echo "Missing crawlDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
if [ "$SOLRURL" = "" ]; then
- echo "Missing SOLRURL : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
- exit -1;
+ echo "No SOLRURL specified. Skipping indexing."
fi
if [ "$LIMIT" = "" ]; then
- echo "Missing numberOfRounds : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+ echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
@@ -152,20 +159,24 @@ do
then exit $RETCODE
fi
- echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
- "$bin/nutch" index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"
- RETCODE=$?
-
- if [ $RETCODE -ne 0 ]
- then exit $RETCODE
- fi
-
- echo "SOLR dedup -> $SOLRURL"
- "$bin/nutch" solrdedup $commonOptions $SOLRURL
- RETCODE=$?
-
- if [ $RETCODE -ne 0 ]
- then exit $RETCODE
+ if [ -n "$SOLRURL" ]; then
+ echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
+ "$bin/nutch" index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"
+ RETCODE=$?
+
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
+ fi
+
+ echo "SOLR dedup -> $SOLRURL"
+ "$bin/nutch" solrdedup $commonOptions $SOLRURL
+ RETCODE=$?
+
+ if [ $RETCODE -ne 0 ]
+ then exit $RETCODE
+ fi
+ else
+ echo "Skipping indexing tasks: no SOLR url provided."
fi
done