Posted to dev@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2011/03/27 14:25:23 UTC

[Nutch Wiki] Update of "Tutorial on incremental crawling" by Gabriele Kahlout

http://wiki.apache.org/nutch/Tutorial%20on%20incremental%20crawling

--------------------------------------------------

New page:
The following script crawls the whole web incrementally. Given a list of urls to crawl, Nutch repeatedly fetches $it_size urls from that list, indexes them and merges them into the whole-web index, so that they can be searched immediately, until all the urls have been fetched.

Tested with the Nutch-1.2 release. Please report any bugs you find on the mailing list or directly to [[Gabriele Kahlout|me]].

If you have not already done so, follow the [[Tutorial]] to set up and configure Nutch on your machine.
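
In particular, Nutch refuses to fetch anything until http.agent.name is set in conf/nutch-site.xml. A minimal sketch of setting it from the shell; the agent name "MyCrawlerName" is a placeholder, pick one that identifies your crawler:

{{{
# write a minimal conf/nutch-site.xml containing the mandatory http.agent.name property
# "MyCrawlerName" is a placeholder; choose a name identifying your crawler
cat > conf/nutch-site.xml <<'EOF'
<?xml version="1.0"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>MyCrawlerName</value>
  </property>
</configuration>
EOF
}}}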

The script also works with Solr. If you have Solr set up, uncomment the solrindex command in the script and comment out the nutch index and merge step, as noted in the script's comments.
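
Once pages have been indexed you can query Solr directly. A minimal sketch, assuming Solr listens at http://localhost:8080/solr/ as in the script below; adjust the url and port to your own instance:

{{{
# query the Solr index for pages matching "nutch" (host, port and core path are assumptions)
curl "http://localhost:8080/solr/select?q=nutch"
}}}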

{{{
#!/bin/sh
#
# Created by Gabriele Kahlout on 27.03.11.
# 
# The following script crawls the whole web incrementally. Given a list of urls to crawl, nutch repeatedly fetches $it_size urls from
# that list, indexes them and merges them into the whole-web index, so that they can be searched immediately, until all urls have been fetched.
#
# Usage: ./whole-web-crawling-incremental [seedsDir-path urls-to-fetch-per-iteration depth]
#
# Getting Started:
# 1. $ mv whole-web-crawling-incremental $NUTCH_HOME/whole-web-crawling-incremental
# 2. $ cd $NUTCH_HOME
# 3. $ chmod +x whole-web-crawling-incremental
# 4. $ ./whole-web-crawling-incremental
# 
# Start
function echoThenRun () { # echo a command, run it, then print a blank line
  echo "$1"
  $1 # left unquoted on purpose, so the string is split into the command and its arguments
  echo
}
echoThenRun "rm -r crawl" # fresh crawl
if [[ ! -d "build" ]]
then
    echoThenRun "ant"
fi
seedsDir="seeds"
if [[ $1 != "" ]]
then
    seedsDir=$1
fi
it_size=10
if [[ $2 != "" ]]
then
    it_size=$2
fi
indexedPlus1=1 # 1-based index of the next url to fetch; tail -n+N starts from line N, hence the +1. Never printed out
it_seedsDir="$seedsDir/it_seeds"
rm -rf $it_seedsDir
mkdir $it_seedsDir
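# count the total number of urls across all seed files (sed strips the leading spaces that wc prints)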
allUrls=`cat $seedsDir/*url* | wc -l | sed -e "s/^ *//"`
echo "$allUrls urls to crawl"
it_crawldb="crawl/crawldb"
depth=1
if [[ $3 != "" ]]
then
    depth=$3
fi
while [[ $indexedPlus1 -le $allUrls ]] #repeat generate-fetch-updatedb-invertlinks-index-merge loop until all urls are fetched
do
    rm -f $it_seedsDir/urls
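    # slice the next $it_size urls out of the seed list, starting at line $indexedPlus1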
    tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
    echo
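    # inject this iteration's slice of urls into the per-iteration crawldb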
    echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
    i=0

    while [[ $i -lt $depth ]] # depth-first
    do
        echo
        echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
        echoThenRun "bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
        output=`$cmd`
        echo $output
        if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration have been fetched
        then
            break;
        fi
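        # pick the most recent segment, i.e. the one just generated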
        s1=`ls -d crawl/segments/2* | tail -1`
        echoThenRun "bin/nutch fetch $s1"
        echoThenRun "bin/nutch updatedb $it_crawldb $s1"
        echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
        # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb crawl/segments/*"
        # if you have Solr set up you can use it by uncommenting the above command and commenting out the following nutch index and merge step.
        # start nutch index and merge step
        new_indexes="crawl/new_indexes"
        indexes="crawl/indexes"
        temp_indexes="crawl/temp_indexes"
        rm -rf $new_indexes $temp_indexes # clean up leftovers from a previous run
        echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"
        # solrindex merges automatically; with nutch index we have to do the merge ourselves:
        echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
        rm -rf $indexes $new_indexes
        mv $temp_indexes $indexes
        # end nutch index and merge step
        # you can now search the index with http://localhost:8080/solr/admin/ (if set up) or with http://code.google.com/p/luke/ . The index is stored in crawl/indexes; with Solr it is stored in $NUTCH_HOME/solr/data/index instead.
        ((i++))
        ((indexedPlus1+=$it_size)) # ideally we'd count the urls actually fetched with 'readdb crawl/crawldb -stats', but a url that is never fetched would then make this loop run forever
    done
    echoThenRun "bin/nutch readdb $it_crawldb -stats"
    allcrawldb="crawl/allcrawldb"
    temp_crawldb="crawl/temp_crawldb"
    merge_dbs="$it_crawldb $allcrawldb"
    # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
    if [[ ! -d $allcrawldb ]]
    then
        merge_dbs="$it_crawldb"
    fi
    echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
    rm -rf $allcrawldb $it_crawldb crawl/segments crawl/linkdb
    mv $temp_crawldb $allcrawldb
done
echo
crawl_dump="$allcrawldb/dump"
rm -rf $crawl_dump $it_seedsDir
echoThenRun "bin/nutch readdb $allcrawldb -dump $crawl_dump" # you can inspect the dump with $ vim $crawl_dump
bin/nutch readdb $allcrawldb -stats
}}}
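
A minimal end-to-end usage sketch, assuming a fresh $NUTCH_HOME; the seed url is just an illustration:

{{{
cd $NUTCH_HOME
mkdir -p seeds
echo "http://nutch.apache.org/" > seeds/urls   # one url per line; the script reads $seedsDir/*url*
./whole-web-crawling-incremental seeds 10 1    # seeds dir, 10 urls per iteration, depth 1
}}}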