You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/09/17 19:49:38 UTC
svn commit: r1625715 - in /nutch/branches/2.x: CHANGES.txt conf/nutch-default.xml src/bin/crawl

Author: lewismc
Date: Wed Sep 17 17:49:37 2014
New Revision: 1625715

URL: http://svn.apache.org/r1625715
Log:
NUTCH-1832 Make Nutch work without an indexer

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/bin/crawl

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Sep 17 17:49:37 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1832 Make Nutch work without an indexer (mattmann via lewismc)
+
 * NUTCH-1840 the describe function in SolrIndexWriter is not correct (kaveh minooie via jnioche)
 
 * NUTCH-1837 Upgrade to Tika 1.6 (lewismc)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Wed Sep 17 17:49:37 2014
@@ -874,7 +874,7 @@
 
 <property>
   <name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+ <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By

Modified: nutch/branches/2.x/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1625715&r1=1625714&r2=1625715&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Wed Sep 17 17:49:37 2014
@@ -23,26 +23,33 @@
 
 SEEDDIR="$1"
 CRAWL_ID="$2"
-SOLRURL="$3"
-LIMIT="$4"
+if [ "$#" -eq 3 ]; then
+    LIMIT="$3"
+elif [ "$#" -eq 4 ]; then
+     SOLRURL="$3"
+     LIMIT="$4"
+else
+    echo "Unknown # of arguments $#"
+    echo "Usage: crawl <seedDir> <crawlDir> [<solrUrl>] <numberOfRounds>"
+    exit -1;
+fi
 
 if [ "$SEEDDIR" = "" ]; then
-    echo "Missing seedDir : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+    echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
     exit -1;
 fi
 
 if [ "$CRAWL_ID" = "" ]; then
-    echo "Missing crawlDir : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+    echo "Missing crawlDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
     exit -1;
 fi
 
 if [ "$SOLRURL" = "" ]; then
-    echo "Missing SOLRURL : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
-    exit -1;
+    echo "No SOLRURL specified. Skipping indexing."
 fi
 
 if [ "$LIMIT" = "" ]; then
-    echo "Missing numberOfRounds : crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>"
+    echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
     exit -1;
 fi
 
@@ -152,20 +159,24 @@ do
     then exit $RETCODE 
   fi
 
-  echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
-  "$bin/nutch" index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
-  fi
-
-  echo "SOLR dedup -> $SOLRURL"
-  "$bin/nutch" solrdedup $commonOptions $SOLRURL
-  RETCODE=$?
-
-  if [ $RETCODE -ne 0 ] 
-    then exit $RETCODE 
+  if [ -n "$SOLRURL" ]; then
+    echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
+    "$bin/nutch" index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"
+    RETCODE=$?
+
+    if [ $RETCODE -ne 0 ] 
+      then exit $RETCODE 
+    fi
+
+    echo "SOLR dedup -> $SOLRURL"
+    "$bin/nutch" solrdedup $commonOptions $SOLRURL
+    RETCODE=$?
+
+    if [ $RETCODE -ne 0 ] 
+      then exit $RETCODE 
+    fi
+  else
+      echo "Skipping indexing tasks: no SOLR url provided."
   fi
 
 done