You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2007/09/20 19:20:33 UTC

[Nutch Wiki] Update of "Crawl" by susam

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" to receive change notifications.

The following page has been changed by susam:
http://wiki.apache.org/nutch/Crawl

The comment on the change is:
fixed topN bug

------------------------------------------------------------------------------
  == Script ==
  {{{
  #!/bin/sh
- 
- # Runs the Nutch bot to crawl or re-crawl
- # Usage: bin/runbot [safe]
- #        If executed in 'safe' mode, it doesn't delete the temporary
- #        directories generated during crawl. This might be helpful for
- #        analysis and recovery in case a crawl fails.
- #
- # Author: Susam Pal
- 
- depth=2
+ depth=8
  threads=50
  adddays=5
- topN=2 # Comment this statement if you don't want to set topN value
+ topN=1000 #Comment this statement if you don't want to set topN value
  
  # Parse arguments
  if [ "$1" == "safe" ]
@@ -101, +92 @@

  
  if [ -n "$topN" ]
  then
-   topN="--topN $rank"
+   topN="-topN $topN"
  else
    topN=""
  fi
@@ -125, +116 @@

    $NUTCH_HOME/bin/nutch fetch $segment -threads $threads
    if [ $? -ne 0 ]
    then
-     echo "runbot: fetch $segment at depth $depth failed. Deleting segment $segment."
+     echo "runbot: fetch $segment at depth `expr $i + 1` failed. Deleting segment $segment."
      rm -rf $segment
      continue
    fi
@@ -138, +129 @@

  $NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
  if [ "$safe" != "yes" ]
  then
-   rm -rf crawl/segments/*
+   rm -rf crawl/segments
  else
-   mkdir crawl/FETCHEDsegments
-   mv --verbose crawl/segments/* crawl/FETCHEDsegments
+   mv $MVARGS crawl/segments crawl/FETCHEDsegments
  fi
  
- mv --verbose crawl/MERGEDsegments/* crawl/segments
+ mv $MVARGS crawl/MERGEDsegments crawl/segments
- rmdir crawl/MERGEDsegments
  
  echo "----- Invert Links (Step 4 of $steps) -----"
  $NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*