Posted to user@nutch.apache.org by roberto navoni <r....@radionav.it> on 2006/07/22 20:27:47 UTC
Inject and recrawl with Hadoop and Nutch v0.8 WORK FINE!!!!
Tutorial: Nutch 0.8 and Hadoop
This tutorial is derived from the Hadoop + Nutch tutorial and other 0.8
tutorials found on the wiki site and on Google, and it works fine!
At the end of the tutorial you will also find instructions for
recrawling and rebuilding the index.
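Before formatting the namenode, Hadoop must already know where its
master and its filesystem directories live. For reference, a minimal
conf/hadoop-site.xml matching the paths used below might look like the
following sketch; the jobtracker port 9001 and the dfs.data.dir path
are assumptions from my setup, so adjust them for yours:
<?xml version="1.0"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>LSearchDev01:9000</value>
</property>
<property>
<!-- port 9001 is an assumption; use whatever your jobtracker runs on -->
<name>mapred.job.tracker</name>
<value>LSearchDev01:9001</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/nutch/filesystem/name</value>
</property>
<property>
<!-- assumed data directory; pick your own -->
<name>dfs.data.dir</name>
<value>/nutch/filesystem/data</value>
</property>
<property>
<!-- replication 2 matches the <r 2> shown in the listings below -->
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>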
# Format the Hadoop namenode
root@LSearchDev01:/nutch/search#bin/hadoop namenode -format
Re-format filesystem in /nutch/filesystem/name ? (Y or N) Y
Formatted /nutch/filesystem/name
# Start Hadoop
root@LSearchDev01:/nutch/search# bin/start-all.sh
namenode running as process 16789.
root@lsearchdev01's password:
jobtracker running as process 16866.
root@lsearchdev01's password:
LSearchDev01: starting tasktracker, logging
to /nutch/search/logs/hadoop-root-tasktracker-LSearchDev01.out
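# The password prompts above appear because start-all.sh reaches every
# node over ssh. To let it run unattended you can set up passwordless
# ssh first; a minimal sketch for a single-node setup, assuming the
# standard OpenSSH tools:
root@LSearchDev01:/nutch/search# ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa
root@LSearchDev01:/nutch/search# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
root@LSearchDev01:/nutch/search# chmod 600 ~/.ssh/authorized_keys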
# List files on the Hadoop file system (empty so far)
root@LSearchDev01:/nutch/search#
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls
Found 0 items
# Hadoop works fine
# Use vi to add your site, in the form http://www.yoursite.com
root@LSearchDev01:/nutch/search# vi urls.txt
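# If you prefer not to open an editor, the same file can be created
# from the shell; http://www.yoursite.com/ is of course a placeholder:
root@LSearchDev01:/nutch/search# echo "http://www.yoursite.com/" > urls.txt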
# Make a urls directory on the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -mkdir urls
# Copy urls.txt from the local Linux file system to the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -copyFromLocal urls.txt urls/urls.txt
# List the file on the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr /user/root/urls
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 41
# If you want to delete the old urls file on the Hadoop file system and
# put a new one there, use the following commands
root@LSearchDev01:/nutch/search# bin/hadoop dfs -rm /user/root/urls/urls.txt
Deleted /user/root/urls/urls.txt
root@LSearchDev01:/nutch/search# bin/hadoop dfs -copyFromLocal urls.txt urls/urls.txt
# Inject the URLs from urls.txt into the <crawld> database
root@LSearchDev01:/nutch/search# bin/nutch inject crawld urls
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
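# Note that the fetcher only follows URLs that pass the URL filters in
# conf/. If nothing gets fetched later, check conf/regex-urlfilter.txt
# (conf/crawl-urlfilter.txt if you use the one-step crawl command). To
# restrict the crawl to a single site you might add a line like this,
# with yoursite.com as a placeholder for your own domain:
+^http://([a-z0-9]*\.)*yoursite.com/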
# This is the new state of your Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr
/user/root/crawld <dir>
/user/root/crawld/current <dir>
/user/root/crawld/current/part-00000 <dir>
/user/root/crawld/current/part-00000/data <r 2> 62
/user/root/crawld/current/part-00000/index <r 2> 33
/user/root/crawld/current/part-00001 <dir>
/user/root/crawld/current/part-00001/data <r 2> 62
/user/root/crawld/current/part-00001/index <r 2> 33
/user/root/crawld/current/part-00002 <dir>
/user/root/crawld/current/part-00002/data <r 2> 124
/user/root/crawld/current/part-00002/index <r 2> 74
/user/root/crawld/current/part-00003 <dir>
/user/root/crawld/current/part-00003/data <r 2> 181
/user/root/crawld/current/part-00003/index <r 2> 74
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 64
# Now generate the fetch list (a new segment) for the fetch job
root@LSearchDev01:/nutch/search# bin/nutch generate /user/root/crawld /user/root/crawld/segments
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
# /user/root/crawld/segments/20060722130642 is the name of the segment
# that you want to fetch
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls /user/root/crawld/segments
Found 1 items
/user/root/crawld/segments/20060722130642 <dir>
root@LSearchDev01:/nutch/search#
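# To avoid retyping the long segment name in the commands that follow,
# you could capture it in a shell variable; the timestamp below is from
# my run and yours will differ:
root@LSearchDev01:/nutch/search# s1=/user/root/crawld/segments/20060722130642
# then e.g. "bin/nutch fetch $s1" instead of spelling out the full path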
# Fetch the sites listed in urls.txt
root@LSearchDev01:/nutch/search# bin/nutch fetch /user/root/crawld/segments/20060722130642
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
# This is what is on your Hadoop file system now
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr /user/root/crawld
/user/root/crawld <dir>
/user/root/crawld/current <dir>
/user/root/crawld/current/part-00000 <dir>
/user/root/crawld/current/part-00000/data <r 2> 62
/user/root/crawld/current/part-00000/index <r 2> 33
/user/root/crawld/current/part-00001 <dir>
/user/root/crawld/current/part-00001/data <r 2> 62
/user/root/crawld/current/part-00001/index <r 2> 33
/user/root/crawld/current/part-00002 <dir>
/user/root/crawld/current/part-00002/data <r 2> 124
/user/root/crawld/current/part-00002/index <r 2> 74
/user/root/crawld/current/part-00003 <dir>
/user/root/crawld/current/part-00003/data <r 2> 181
/user/root/crawld/current/part-00003/index <r 2> 74
/user/root/crawld/segments <dir>
/user/root/crawld/segments/20060722130642 <dir>
/user/root/crawld/segments/20060722130642/content <dir>
/user/root/crawld/segments/20060722130642/content/part-00000 <dir>
/user/root/crawld/segments/20060722130642/content/part-00000/data <r 2> 62
/user/root/crawld/segments/20060722130642/content/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/content/part-00001 <dir>
/user/root/crawld/segments/20060722130642/content/part-00001/data <r 2> 62
/user/root/crawld/segments/20060722130642/content/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/content/part-00002 <dir>
/user/root/crawld/segments/20060722130642/content/part-00002/data <r 2> 2559
/user/root/crawld/segments/20060722130642/content/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/content/part-00003 <dir>
/user/root/crawld/segments/20060722130642/content/part-00003/data <r 2> 6028
/user/root/crawld/segments/20060722130642/content/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_fetch <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000/data <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001/data <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002/data <r 2> 140
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003/data <r 2> 213
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_generate <dir>
/user/root/crawld/segments/20060722130642/crawl_generate/part-00000 <r 2> 119
/user/root/crawld/segments/20060722130642/crawl_generate/part-00001 <r 2> 124
/user/root/crawld/segments/20060722130642/crawl_generate/part-00002 <r 2> 124
/user/root/crawld/segments/20060722130642/crawl_generate/part-00003 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse <dir>
/user/root/crawld/segments/20060722130642/crawl_parse/part-00000 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse/part-00001 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse/part-00002 <r 2> 784
/user/root/crawld/segments/20060722130642/crawl_parse/part-00003 <r 2> 1698
/user/root/crawld/segments/20060722130642/parse_data <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00000 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00000/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_data/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_data/part-00001 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00001/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_data/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_data/part-00002 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00002/data <r 2> 839
/user/root/crawld/segments/20060722130642/parse_data/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_data/part-00003 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00003/data <r 2> 1798
/user/root/crawld/segments/20060722130642/parse_data/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_text <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00000 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00000/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_text/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_text/part-00001 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00001/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_text/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_text/part-00002 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00002/data <r 2> 377
/user/root/crawld/segments/20060722130642/parse_text/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_text/part-00003 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00003/data <r 2> 811
/user/root/crawld/segments/20060722130642/parse_text/part-00003/index <r 2> 74
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 64
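# One step the listing above does not show: in the usual Nutch 0.8
# workflow you also update the crawl database with the results of the
# fetch before moving on, so the next generate pass knows what has
# already been fetched. If you plan to recrawl, run it here (same
# segment name as above):
root@LSearchDev01:/nutch/search# bin/nutch updatedb /user/root/crawld /user/root/crawld/segments/20060722130642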
# Now you need to run the invertlinks job
root@LSearchDev01:/nutch/search# bin/nutch invertlinks /user/root/crawld/linkdb /user/root/crawld/segments/20060722130642
# And at the end you need to build your index
root@LSearchDev01:/nutch/search# bin/nutch index /user/root/crawld/indexes /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722130642
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls /user/root/crawld
Found 4 items
/user/root/crawld/current <dir>
/user/root/crawld/indexes <dir>
/user/root/crawld/linkdb <dir>
/user/root/crawld/segments <dir>
root@LSearchDev01:/nutch/search#
At the end of your hard work you have these directories on your Hadoop
file system, so you are ready to start Tomcat.
Before you start Tomcat, remember to change the path of your search
directory in the nutch-site.xml file in the webapps/ROOT/WEB-INF/classes
directory.
# This is an example of my configuration
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>LSearchDev01:9000</value>
</property>
<property>
<name>searcher.dir</name>
<value>/user/root/crawld</value>
</property>
</configuration>
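After editing the file, restart Tomcat so the change is picked up; with
a stock Tomcat install that is something like the following (the
install path /opt/tomcat is an assumption for your machine):
root@LSearchDev01:/nutch/search# /opt/tomcat/bin/shutdown.sh
root@LSearchDev01:/nutch/search# /opt/tomcat/bin/startup.sh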
# RECRAWL AND NEW INJECT
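# The index commands below assume you have already generated and
# fetched two new segments (20060722153133 and 20060722182213 in my
# case). One recrawl pass that produces such a segment looks roughly
# like this sketch, where <new-segment> stands for whatever timestamp
# generate prints for you:
root@LSearchDev01:/nutch/search# bin/nutch generate /user/root/crawld /user/root/crawld/segments
root@LSearchDev01:/nutch/search# bin/nutch fetch /user/root/crawld/segments/<new-segment>
root@LSearchDev01:/nutch/search# bin/nutch updatedb /user/root/crawld /user/root/crawld/segments/<new-segment>
root@LSearchDev01:/nutch/search# bin/nutch invertlinks /user/root/crawld/linkdb /user/root/crawld/segments/<new-segment>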
# Create a new index indexe0 from the first new segment
bin/nutch index /user/root/crawld/indexe0 /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722153133
# Create a new index indexe1 from the second new segment
bin/nutch index /user/root/crawld/indexe1 /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722182213
# Dedup the new indexe0
bin/nutch dedup /user/root/crawld/indexe0
# Dedup the new indexe1
bin/nutch dedup /user/root/crawld/indexe1
# Delete the old merged index
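# For example, assuming the old merged index from a previous run lives
# at /user/root/crawld/index (newer Hadoop versions use dfs -rmr for
# directories):
root@LSearchDev01:/nutch/search# bin/hadoop dfs -rm /user/root/crawld/index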
# Merge the new indexes into the merged index directory
bin/nutch merge /user/root/crawld/index /user/root/crawld/indexe0 /user/root/crawld/indexe1 ... #(and the other indexes created for the fetched segments)
# index is the standard directory in the crawld (DB) where the merged
# master index lives
I hope this helps someone build their first search engine on Nutch 0.8 +
Hadoop :)
Best crawling
Roberto Navoni