Posted to dev@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2011/03/28 23:34:58 UTC
[Nutch Wiki] Update of "Whole-Web Crawling incremental script" by Gabriele Kahlout
Dear Wiki user,
You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "Whole-Web Crawling incremental script" page has been changed by Gabriele Kahlout.
http://wiki.apache.org/nutch/Whole-Web%20Crawling%20incremental%20script?action=diff&rev1=15&rev2=16
--------------------------------------------------
=== Script Editions: ===
1. Abridged using Solr (tersest)
- 1. Unabridged with explanations and using nutch index (beginner)
+ 1. Unabridged with explanations and using nutch index and local fs cmds (beginner)
- 1. TODO: Unabridged with explanations, using solr and Hadoop fs (most advanced)
+ 1. Unabridged with explanations, using solr and Hadoop fs cmds (advanced)
Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]].
- == 1. Abridged script using Solr ==
+ == 1. Abridged using Solr (tersest) ==
{{{
#!/bin/sh
@@ -83, +83 @@
rm -r $it_seedsDir
}}}
- == 2. Unabridged script with explanations and using nutch index ==
+ == 2. Unabridged with explanations and using nutch index and local fs cmds (beginner) ==
{{{
@@ -223, +223 @@
bin/nutch readdb $allcrawldb -stats
}}}
+ == 3. Unabridged with explanations, using solr and Hadoop fs cmds (advanced) ==
+ {{{
+ #!/bin/sh
+
+ #
+ # Created by Gabriele Kahlout on 27.03.11.
+ # The following script crawls the whole web incrementally: given a list of urls to crawl, Nutch repeatedly fetches $it_size of them, indexes them and merges them into the whole-web index, so that they can be searched immediately, until all the urls have been fetched.
+ #
+ # TO USE:
+ # 1. $ mv whole-web-crawling-incremental $NUTCH_HOME/whole-web-crawling-incremental
+ # 2. $ cd $NUTCH_HOME
+ # 3. $ chmod +x whole-web-crawling-incremental
+ # 4. $ ./whole-web-crawling-incremental
+
+ # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration depth]
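+ # e.g. ./whole-web-crawling-incremental seeds 100 2 (hypothetical values: seed dir "seeds", 100 urls per iteration, depth 2)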
+ # Start
+
+ function echoThenRun () { # echo the command, then run it
+ echo "$1"
+ eval "$1" # eval (rather than a bare $1) so that quoted arguments, like the curl -d payload below, survive intact
+ echo
+ }
+
+ echoThenRun "bin/hadoop dfs -rmr crawl" # fresh crawl
+
+ solrIndex="http://localhost:8080/solr"
+ echoThenRun "curl --fail $solrIndex/update?commit=true -d '<delete><query>*:*</query></delete>'" #empty index
+
+
+ if [[ ! -d "build" ]]
+ then
+ echoThenRun "ant"
+ fi
+
+ seedsDir="seeds"
+ if [[ $1 != "" ]]
+ then
+ seedsDir=$1
+ fi
+
+ it_size=10
+ if [[ $2 != "" ]]
+ then
+ it_size=$2
+ fi
+
+ indexedPlus1=1 # 1-based index of the next url to fetch (tail -n+N below expects a 1-based line number); never printed out
+ it_seedsDir="$seedsDir/it_seeds"
+
+ bin/hadoop dfs -rmr $it_seedsDir
+ bin/hadoop dfs -mkdir $it_seedsDir
+ bin/hadoop dfs -mkdir crawl/crawldb
+ rm $seedsDir/urls-local-only # drop any local copy left over from a previous run
+
+ echoThenRun "bin/hadoop dfs -get $seedsDir/*url* $seedsDir/urls-local-only"
+
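+ # count the seed urls; the sed strips the leading spaces that some wc implementations pad the count with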
+ allUrls=`cat $seedsDir/urls-local-only | wc -l | sed -e "s/^ *//"`
+ echo $allUrls" urls to crawl"
+
+
+ depth=1
+ if [[ $3 != "" ]]
+ then
+ depth=$3
+ fi
+
+ j=0
+ while [[ $indexedPlus1 -le $allUrls ]] #repeat generate-fetch-updatedb-invertlinks-index-merge loop until all urls are fetched
+ do
+ bin/hadoop dfs -rm $it_seedsDir/urls
+
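+ # stage the next slice of urls: e.g. with indexedPlus1=11 and it_size=10, tail+head keep urls 11..20 of the seed list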
+ mkdir -p $it_seedsDir # the tail below writes to the local filesystem, so make sure the local staging dir exists
+ tail -n+$indexedPlus1 $seedsDir/urls-local-only | head -n$it_size > $it_seedsDir/urls-local-only
+ bin/hadoop dfs -moveFromLocal $it_seedsDir/urls-local-only $it_seedsDir/urls
+
+ it_crawldb="crawl/crawldb/$j/0"
+ bin/hadoop dfs -mkdir $it_crawldb
+
+ echo
+ echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
+ i=0
+
+ while [[ $i -lt $depth ]] # depth-first
+ do
+ echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
+
+ it_crawldb="crawl/crawldb/$j/$i"
+
+ echo
+ cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
+ echo $cmd
+ output=`$cmd`
+ echo $output
+ echo
+ if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration have been fetched
+ then
+ break;
+ fi
+
+ echoThenRun "bin/nutch fetch crawl/segments/2*"
+
+ echoThenRun "bin/nutch updatedb $it_crawldb crawl/segments/2*"
+
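+ # invertlinks aggregates incoming links (with their anchor text) into crawl/linkdb for the indexer to use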
+ echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+
+
+ echoThenRun "bin/nutch solrindex $solrIndex $it_crawldb crawl/linkdb crawl/segments/*"
+
+ # you can now search the index with http://localhost:8080/solr/admin/ (if setup) or http://code.google.com/p/luke/ . The index is stored in $NUTCH_HOME/solr/data/index.
+ ((i++))
+ ((indexedPlus1+=$it_size)) # ideally we'd advance by the number of urls actually fetched (readdb crawl/crawldb -stats), but a page that never gets fetched would then loop forever
+ echo
+ done
+
+ echoThenRun "bin/nutch readdb $it_crawldb -stats"
+
+ allcrawldb="crawl/allcrawldb"
+ temp_crawldb="crawl/temp_crawldb"
+ merge_dbs="$it_crawldb $allcrawldb"
+
+ # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available): mergedb fails when $allcrawldb doesn't exist yet
+ bin/hadoop dfs -test -d $allcrawldb # test on HDFS, since [[ -d ]] would only check the local filesystem
+ if [[ $? -ne 0 ]]
+ then
+ merge_dbs="$it_crawldb"
+ fi
+
+ #echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+
+ #rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
+ #mv $temp_crawldb $allcrawldb
+ ((j++))
+ done
+
+ bin/hadoop dfs -rmr $it_seedsDir
+ }}}
+