You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/06/25 15:56:14 UTC
svn commit: r1687522 - in /nutch/trunk: CHANGES.txt src/bin/crawl

Author: jnioche
Date: Thu Jun 25 13:56:14 2015
New Revision: 1687522

URL: http://svn.apache.org/r1687522
Log:
Adding some continuous crawl goodies to the crawl script NUTCH-2036

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/crawl

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1687522&r1=1687521&r2=1687522&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 25 13:56:14 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2036 Adding some continuous crawl goodies to the crawl script (jorge, snagel)
+
 * NUTCH-2039 Relevance based scoring filter (Sujen Shah, lewismc via mattmann)
 
 * NUTCH-2037 Job endpoint to support Indexing from the REST API (Sujen Shah via mattmann)

Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1687522&r1=1687521&r2=1687522&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Thu Jun 25 13:56:14 2015
@@ -14,20 +14,48 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# 
-# Usage: crawl [-i|--index] [-D "key=value"] <Seed Dir> <Crawl Dir> <Num Rounds>
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
 #    -i|--index      Indexes crawl results into a configured indexer
+#    -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+#                    are scheduled for fetching. Suffix can be: s for seconds,
+#                    m for minutes, h for hours and d for days. If no suffix is
+#                    specified, seconds are used by default.
 #    -D              A Java property to pass to Nutch calls
 #    Seed Dir        Directory in which to look for a seeds file
 #    Crawl Dir       Directory where the crawl/link/segments dirs are saved
 #    Num Rounds      The number of rounds to run this crawl for
 #
-# 
-# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND 
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
 # INDEXING FOR EACH SEGMENT
 
 INDEXFLAG=false
 JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+# Convert NUMBER[SUFFIX] ($1) to seconds on stdout; SUFFIX is s, m, h or d (default s).
+function __to_seconds() {
+  local number suffix secs  # not SECONDS: bash treats that name as a running timer
+  number=$(printf '%s' "$1" | tr -dc '0-9')
+  suffix=$(printf '%s' "$1" | tr -dc 'smhdSMHD')  # literal char list, tr takes no regex
+  case $suffix in
+      m|M)
+        secs=$(expr "$number" \* 60)
+        ;;
+      h|H)
+        secs=$(expr "$number" \* 3600)  # 60 min * 60 s
+        ;;
+      d|D)
+        secs=$(expr "$number" \* 86400)
+        ;;
+      s|S|*)
+        secs=$number
+        ;;
+  esac
+  echo "${secs:-0}"  # input without digits yields 0
+}
+
 while [[ $# > 0 ]]
 do
     case $1 in
@@ -39,6 +67,10 @@ do
             JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
             shift 2
             ;;
+        -w|--wait)
+            WAIT="${2}"
+            shift 2
+            ;;
         *)
             break
             ;;
@@ -46,9 +78,13 @@ do
 done
 
 if [[ $# != 3 ]]; then
-    echo "Usage: crawl [-i|--index] [-D \"key=value\"] <Seed Dir> <Crawl Dir> <Num Rounds>"
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
     echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
     echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+    echo -e "\t\t\tspecified second is used by default."
     echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
     echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
     echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
@@ -59,6 +95,12 @@ SEEDDIR="$1"
 CRAWL_PATH="$2"
 LIMIT="$3"
 
+# convert the --wait value to plain seconds: it is compared numerically and passed to sleep
+if [ "$WAIT" != "-1" ]; then
+  WAIT=$( __to_seconds "$WAIT" )
+  echo "Time to wait (--wait) = $WAIT sec."
+fi
+
 #############################################
 # MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
 #############################################
@@ -91,11 +133,11 @@ if [ -f "${bin}"/../*nutch*.job ]; then
     mode=distributed
 fi
 
-# note that some of the options listed here could be set in the 
-# corresponding hadoop site xml param file 
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
 commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
 
- # check that hadoop can be found on the path 
+ # check that hadoop can be found on the path
 if [ $mode = "distributed" ]; then
  if [ $(which hadoop | wc -l ) -eq 0 ]; then
     echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
@@ -126,9 +168,8 @@ function __bin_nutch {
 echo "Injecting seed URLs"
 __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
 
-
 # main loop : rounds of generate - fetch - parse - update
-for ((a=1; a <= LIMIT ; a++))
+for ((a=1; ; a++))
 do
   if [ -e ".STOP" ]
   then
@@ -136,7 +177,15 @@ do
    break
   fi
 
-  echo `date` ": Iteration $a of $LIMIT"
+  if [ $LIMIT -ne -1 ]; then
+    if [ $a -gt $LIMIT ]; then
+      echo `date` ": Finished loop with $LIMIT iterations"
+      break
+    fi
+    echo `date` ": Iteration $a of $LIMIT"
+  else
+    echo `date` ": Iteration $a"
+  fi
 
   echo "Generating a new segment"
   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
@@ -147,8 +196,15 @@ do
       : # ok: no error
   elif [ $RETCODE -eq 1 ]; then
     echo "Generate returned 1 (no new segments created)"
-    echo "Escaping loop: no more URLs to fetch now"
-    break
+
+    if [ "$WAIT" -ne -1 ]; then
+      echo "Waiting for $WAIT sec. ..."
+      sleep $WAIT
+      continue
+    else
+      echo "Escaping loop: no more URLs to fetch now"
+      break
+    fi
   else
     echo "Error running:"
     echo "  $bin/nutch generate ${generate_args[@]}"
@@ -165,7 +221,7 @@ do
   else
    SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments |  sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
   fi
-  
+
   echo "Operating on segment : $SEGMENT"
 
   # fetching the segment
@@ -174,7 +230,7 @@ do
 
   # parsing the segment
   echo "Parsing : $SEGMENT"
-  # enable the skipping of records for the parsing so that a dodgy document 
+  # enable the skipping of records for the parsing: a dodgy document is skipped
   # so that it does not fail the full task
   skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
   __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
@@ -183,7 +239,7 @@ do
   echo "CrawlDB update"
   __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
 
-# note that the link inversion - indexing routine can be done within the main loop 
+# note that the link inversion - indexing routine can be done within the main loop
 # on a per segment basis
   echo "Link inversion"
   __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
@@ -194,13 +250,13 @@ do
   if $INDEXFLAG; then
       echo "Indexing $SEGMENT to index"
       __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-  
+
       echo "Cleaning up index if possible"
       __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
   else
       echo "Skipping indexing ..."
   fi
-  
+
   #######################################################
   # The following commands fall into WebGraph territory
   # and should be uncommented based on your requirements
@@ -223,4 +279,3 @@ do
 done
 
 exit 0
-