You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2014/09/03 22:43:03 UTC

svn commit: r1622354 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/bin/crawl

Author: mattmann
Date: Wed Sep  3 20:43:03 2014
New Revision: 1622354

URL: http://svn.apache.org/r1622354
Log:
Fix for NUTCH-1832: Nutch should run without an indexer.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/bin/crawl

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Sep  3 20:43:03 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1832 Make Nutch work without an indexer (mattmann)
+
 * NUTCH-1828 bin/crawl : incorrect handling of nutch errors (Mathieu Bouchard via jnioche)
 
 * NUTCH-1775 IndexingFilter: document origin of passed CrawlDatum (snagel)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Sep  3 20:43:03 2014
@@ -1042,7 +1042,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By

Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1622354&r1=1622353&r2=1622354&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Wed Sep  3 20:43:03 2014
@@ -23,8 +23,16 @@
 
 SEEDDIR="$1"
 CRAWL_PATH="$2"
-SOLRURL="$3"
-LIMIT="$4"
+if [ "$#" -eq 3 ]; then
+    LIMIT="$3"
+elif [ "$#" -eq 4 ]; then
+     SOLRURL="$3"
+     LIMIT="$4"
+else
+    echo "Unknown # of arguments $#"
+    echo "Usage: crawl <seedDir> <crawlDir> [<solrUrl>] <numberOfRounds>"
+    exit -1;
+fi
 
 if [ "$SEEDDIR" = "" ]; then
     echo "Missing seedDir : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
@@ -37,8 +45,7 @@ if [ "$CRAWL_PATH" = "" ]; then
 fi
 
 if [ "$SOLRURL" = "" ]; then
-    echo "Missing SOLRURL : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
-    exit -1;
+    echo "No SOLRURL specified. Skipping indexing."
 fi
 
 if [ "$LIMIT" = "" ]; then
@@ -166,7 +173,7 @@ do
   "$bin/nutch" invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
   RETCODE=$?
 
-  if [ $RETCODE -ne 0 ] 
+  if [ $RETCODE -ne 0 ]   
   then exit $RETCODE 
   fi
 
@@ -178,20 +185,23 @@ do
   then exit $RETCODE 
   fi
 
-  echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
-  "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-  RETCODE=$?
-  
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
-  fi
-
-  echo "Cleanup on SOLR index -> $SOLRURL"
-  "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
-  RETCODE=$?
+  if [ -n "$SOLRURL" ]; then
+      echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
+      "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+      RETCODE=$?
+      if [ $RETCODE -ne 0 ]; then 
+         exit $RETCODE
+      fi
   
-  if [ $RETCODE -ne 0 ] 
-  then exit $RETCODE 
+      echo "Cleanup on SOLR index -> $SOLRURL"
+      "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
+      RETCODE=$?
+
+      if [ $RETCODE -ne 0 ]; then 
+         exit $RETCODE
+      fi
+  else
+      echo "Skipping indexing: no SOLR url provided."
   fi
 
 done