Posted to dev@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2011/03/28 23:34:58 UTC

[Nutch Wiki] Update of "Whole-Web Crawling incremental script" by Gabriele Kahlout

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.

The "Whole-Web Crawling incremental script" page has been changed by Gabriele Kahlout.
http://wiki.apache.org/nutch/Whole-Web%20Crawling%20incremental%20script?action=diff&rev1=15&rev2=16

--------------------------------------------------

  
  === Script Editions: ===
   1. Abridged using Solr (tersest)
-  1. Unabridged with explanations and using nutch index (beginner)
+  1. Unabridged with explanations and using nutch index and local fs cmds (beginner)
-  1. TODO: Unabridged with explanations, using solr and Hadoop fs (most advanced)
+  1. Unabridged with explanations, using solr and Hadoop fs cmds (advanced)
  
  Please report any bug you find on the mailing list and to [[Gabriele Kahlout|me]].
  
- == 1. Abridged script using Solr ==
+ == 1. Abridged using Solr (tersest) ==
  {{{
  #!/bin/sh
  
@@ -83, +83 @@

  rm -r $it_seedsDir
  
  }}}
- == 2. Unabridged script with explanations and using nutch index ==
+ == 2. Unabridged with explanations and using nutch index and local fs cmds (beginner) ==
  
  {{{
  
@@ -223, +223 @@

  bin/nutch readdb $allcrawldb -stats
  }}}
  
+ == 3. Unabridged with explanations, using solr and Hadoop fs cmds (advanced) ==
+ {{{
+ #!/bin/sh
+ 
+ #
+ # Created by Gabriele Kahlout on 27.03.11.
+ # The following script crawls the whole web incrementally: given a list of urls to crawl, nutch repeatedly fetches $it_size of them, indexes them and merges them into our whole-web index, so that they can be searched immediately, until all the urls have been fetched.
+ #
+ # TO USE:
+ # 1. $ mv whole-web-crawling-incremental $NUTCH_HOME/whole-web-crawling-incremental
+ # 2. $ cd $NUTCH_HOME
+ # 3. $ chmod +x whole-web-crawling-incremental
+ # 4. $ ./whole-web-crawling-incremental
+ 
+ # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration depth]
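+ # For example (hypothetical values), to crawl the urls in the "seeds" dir, 100 per iteration, to depth 3:
+ # $ ./whole-web-crawling-incremental seeds 100 3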
+ # Start
+ 
+ function echoThenRun () { # echo and then run the command
+   echo "$1"
+   eval "$1" # eval so that quoted arguments (e.g. the Solr delete query below) are parsed as intended
+   echo
+ }
+ 
+ echoThenRun "bin/hadoop dfs -rmr crawl" # fresh crawl
+ 
+ solrIndex="http://localhost:8080/solr"
+ echoThenRun "curl --fail $solrIndex/update?commit=true -d  '<delete><query>*:*</query></delete>'" #empty index
+ 
+ 
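+ # build nutch with ant if it hasn't been built yet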
+ if [[ ! -d "build" ]]
+ then
+ 	echoThenRun "ant"
+ fi
+ 
+ seedsDir="seeds"
+ if [[ $1 != "" ]]
+ then
+ 	seedsDir=$1
+ fi
+ 
+ it_size=10
+ if [[ $2 != "" ]]
+ then
+ 	it_size=$2
+ fi
+ 
+ indexedPlus1=1 # number of urls already indexed + 1, i.e. where tail -n+ (1-based) starts reading; never printed out
+ it_seedsDir="$seedsDir/it_seeds"
+ 
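+ # reset the per-iteration seeds dir and the crawldb root on HDFS, and refresh the local copy of the seed urls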
+ bin/hadoop dfs -rmr $it_seedsDir
+ bin/hadoop dfs -mkdir $it_seedsDir
+ bin/hadoop dfs -mkdir crawl/crawldb
+ mkdir -p $it_seedsDir # local dir the per-iteration url slice is written to before being moved onto HDFS
+ rm -f $seedsDir/urls-local-only
+ 
+ echoThenRun "bin/hadoop dfs -get $seedsDir/*url* $seedsDir/urls-local-only"
+ 
+ allUrls=`cat $seedsDir/urls-local-only | wc -l | sed -e "s/^ *//"`
+ echo $allUrls" urls to crawl"
+ 
+ 
+ depth=1
+ if [[ $3 != "" ]]
+ then
+ 	depth=$3
+ fi
+ 
+ j=0
+ while [[ $indexedPlus1 -le $allUrls ]] #repeat generate-fetch-updatedb-invertlinks-index-merge loop until all urls are fetched
+ do
+ 	bin/hadoop dfs -rm $it_seedsDir/urls
+ 	
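+ 	# take the next $it_size not-yet-fetched urls and move them onto HDFS as this iteration's seed list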
+ 	tail -n+$indexedPlus1 $seedsDir/urls-local-only | head -n$it_size > $it_seedsDir/urls-local-only
+ 	bin/hadoop dfs -moveFromLocal $it_seedsDir/urls-local-only $it_seedsDir/urls
+ 	
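+ 	# a separate crawldb per outer iteration $j and depth level $i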
+ 	it_crawldb="crawl/crawldb/$j/0"
+ 	bin/hadoop dfs -mkdir $it_crawldb
+ 	
+ 	echo
+ 	echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
+ 	i=0
+ 	
+ 	while [[ $i -lt $depth ]] # depth-first
+ 	do
+ 		echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
+ 		
+ 		it_crawldb="crawl/crawldb/$j/$i"
+ 		
+ 		echo
+ 		cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
+ 		echo $cmd
+ 		output=`$cmd`
+ 		echo $output
+ 		echo
+ 		if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration have been fetched
+ 		then
+ 			break;
+ 		fi
+ 		
+ 		echoThenRun "bin/nutch fetch crawl/segments/2*"
+ 
+ 		echoThenRun "bin/nutch updatedb $it_crawldb crawl/segments/2*"
+ 
+ 		echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+ 
+ 
+ 		echoThenRun "bin/nutch solrindex $solrIndex $it_crawldb crawl/linkdb crawl/segments/*"
+ 
+ 		# you can now search the index with http://localhost:8080/solr/admin/ (if setup) or http://code.google.com/p/luke/ . The index is stored in $NUTCH_HOME/solr/data/index.
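+ 		# e.g. (assuming the default Solr select handler): curl "$solrIndex/select?q=*:*&rows=0" reports numFound = pages indexed so far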
+ 		((i++))
+ 		((indexedPlus1+=$it_size)) # ideally this would add the number of urls actually fetched (readdb crawl/crawldb -stats), but a url that never gets fetched would then make the loop run forever
+ 		echo
+ 	done
+ 
+ 	echoThenRun "bin/nutch readdb $it_crawldb -stats"
+ 
+ 	allcrawldb="crawl/allcrawldb"
+ 	temp_crawldb="crawl/temp_crawldb"
+ 	merge_dbs="$it_crawldb $allcrawldb"
+ 
+ 	# work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
+ 	if [[ ! -d $allcrawldb ]]
+ 	then
+ 		merge_dbs="$it_crawldb"
+ 	fi
+ 
+ 	#echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+ 
+ 	#rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
+ 	#mv $temp_crawldb $allcrawldb
+ 	((j++))
+ done
+ 
+ bin/hadoop dfs -rmr $it_seedsDir
+ }}}
+