Posted to user@nutch.apache.org by roberto navoni <r....@radionav.it> on 2006/07/22 20:27:47 UTC
Inject and recrawl with Hadoop and Nutch v0.8 WORK FINE!!!!
Tutorial: Nutch 0.8 and Hadoop
This tutorial is derived from the Hadoop + Nutch tutorial and other 0.8
tutorials found on the wiki site and on Google, and it works fine!
At the end of the tutorial you will also find instructions for
recrawling and rebuilding the index.
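Before formatting the namenode, Hadoop must already know where its
master and its filesystem directories live. For reference, a minimal
conf/hadoop-site.xml matching the paths used below might look like the
following sketch; the jobtracker port 9001 and the dfs.data.dir path
are assumptions from my setup, so adjust them for yours:
<?xml version="1.0"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>LSearchDev01:9000</value>
</property>
<property>
<!-- port 9001 is an assumption; use whatever your jobtracker runs on -->
<name>mapred.job.tracker</name>
<value>LSearchDev01:9001</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/nutch/filesystem/name</value>
</property>
<property>
<!-- assumed data directory; pick your own -->
<name>dfs.data.dir</name>
<value>/nutch/filesystem/data</value>
</property>
<property>
<!-- replication 2 matches the <r 2> shown in the listings below -->
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>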
# Format the Hadoop namenode
root@LSearchDev01:/nutch/search#bin/hadoop namenode -format
Re-format filesystem in /nutch/filesystem/name ? (Y or N) Y
Formatted /nutch/filesystem/name
# Start Hadoop
root@LSearchDev01:/nutch/search# bin/start-all.sh
namenode running as process 16789.
root@lsearchdev01's password:
jobtracker running as process 16866.
root@lsearchdev01's password:
LSearchDev01: starting tasktracker, logging
to /nutch/search/logs/hadoop-root-tasktracker-LSearchDev01.out
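# The password prompts above appear because start-all.sh reaches every
# node over ssh. To let it run unattended you can set up passwordless
# ssh first; a minimal sketch for a single-node setup, assuming the
# standard OpenSSH tools:
root@LSearchDev01:/nutch/search# ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa
root@LSearchDev01:/nutch/search# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
root@LSearchDev01:/nutch/search# chmod 600 ~/.ssh/authorized_keys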
# List files on the Hadoop file system (empty so far)
root@LSearchDev01:/nutch/search#
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls
Found 0 items
# Hadoop works fine
# Use vi to add your site, in the form http://www.yoursite.com
root@LSearchDev01:/nutch/search# vi urls.txt
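# If you prefer not to open an editor, the same file can be created
# from the shell; http://www.yoursite.com/ is of course a placeholder:
root@LSearchDev01:/nutch/search# echo "http://www.yoursite.com/" > urls.txt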
# Make a urls directory on the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -mkdir urls
# Copy urls.txt from the local Linux file system to the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -copyFromLocal urls.txt urls/urls.txt
# List the file on the Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr /user/root/urls
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 41
# If you want to delete the old urls file on the Hadoop file system and
# put a new one there, use the following commands
root@LSearchDev01:/nutch/search# bin/hadoop dfs -rm /user/root/urls/urls.txt
Deleted /user/root/urls/urls.txt
root@LSearchDev01:/nutch/search# bin/hadoop dfs -copyFromLocal urls.txt urls/urls.txt
# Inject the URLs from urls.txt into the <crawld> database
root@LSearchDev01:/nutch/search# bin/nutch inject crawld urls
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
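# Note that the fetcher only follows URLs that pass the URL filters in
# conf/. If nothing gets fetched later, check conf/regex-urlfilter.txt
# (conf/crawl-urlfilter.txt if you use the one-step crawl command). To
# restrict the crawl to a single site you might add a line like this,
# with yoursite.com as a placeholder for your own domain:
+^http://([a-z0-9]*\.)*yoursite.com/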
# This is the new state of your Hadoop file system
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr
/user/root/crawld <dir>
/user/root/crawld/current <dir>
/user/root/crawld/current/part-00000 <dir>
/user/root/crawld/current/part-00000/data <r 2> 62
/user/root/crawld/current/part-00000/index <r 2> 33
/user/root/crawld/current/part-00001 <dir>
/user/root/crawld/current/part-00001/data <r 2> 62
/user/root/crawld/current/part-00001/index <r 2> 33
/user/root/crawld/current/part-00002 <dir>
/user/root/crawld/current/part-00002/data <r 2> 124
/user/root/crawld/current/part-00002/index <r 2> 74
/user/root/crawld/current/part-00003 <dir>
/user/root/crawld/current/part-00003/data <r 2> 181
/user/root/crawld/current/part-00003/index <r 2> 74
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 64
# Now generate the fetch list (a new segment) for the fetch job
root@LSearchDev01:/nutch/search# bin/nutch generate /user/root/crawld /user/root/crawld/segments
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
# /user/root/crawld/segments/20060722130642 is the name of the segment
# that you want to fetch
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls /user/root/crawld/segments
Found 1 items
/user/root/crawld/segments/20060722130642 <dir>
root@LSearchDev01:/nutch/search#
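# To avoid retyping the long segment name in the commands that follow,
# you could capture it in a shell variable; the timestamp below is from
# my run and yours will differ:
root@LSearchDev01:/nutch/search# s1=/user/root/crawld/segments/20060722130642
# then e.g. "bin/nutch fetch $s1" instead of spelling out the full path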
# Fetch the sites listed in urls.txt
root@LSearchDev01:/nutch/search# bin/nutch fetch /user/root/crawld/segments/20060722130642
# (*) If you want to see the status of the job, go to http://127.0.0.1:50030
# This is what is on your Hadoop file system now
root@LSearchDev01:/nutch/search# bin/hadoop dfs -lsr /user/root/crawld
/user/root/crawld <dir>
/user/root/crawld/current <dir>
/user/root/crawld/current/part-00000 <dir>
/user/root/crawld/current/part-00000/data <r 2> 62
/user/root/crawld/current/part-00000/index <r 2> 33
/user/root/crawld/current/part-00001 <dir>
/user/root/crawld/current/part-00001/data <r 2> 62
/user/root/crawld/current/part-00001/index <r 2> 33
/user/root/crawld/current/part-00002 <dir>
/user/root/crawld/current/part-00002/data <r 2> 124
/user/root/crawld/current/part-00002/index <r 2> 74
/user/root/crawld/current/part-00003 <dir>
/user/root/crawld/current/part-00003/data <r 2> 181
/user/root/crawld/current/part-00003/index <r 2> 74
/user/root/crawld/segments <dir>
/user/root/crawld/segments/20060722130642 <dir>
/user/root/crawld/segments/20060722130642/content <dir>
/user/root/crawld/segments/20060722130642/content/part-00000 <dir>
/user/root/crawld/segments/20060722130642/content/part-00000/data <r 2> 62
/user/root/crawld/segments/20060722130642/content/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/content/part-00001 <dir>
/user/root/crawld/segments/20060722130642/content/part-00001/data <r 2> 62
/user/root/crawld/segments/20060722130642/content/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/content/part-00002 <dir>
/user/root/crawld/segments/20060722130642/content/part-00002/data <r 2> 2559
/user/root/crawld/segments/20060722130642/content/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/content/part-00003 <dir>
/user/root/crawld/segments/20060722130642/content/part-00003/data <r 2> 6028
/user/root/crawld/segments/20060722130642/content/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_fetch <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000/data <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001/data <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002/data <r 2> 140
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003 <dir>
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003/data <r 2> 213
/user/root/crawld/segments/20060722130642/crawl_fetch/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/crawl_generate <dir>
/user/root/crawld/segments/20060722130642/crawl_generate/part-00000 <r 2> 119
/user/root/crawld/segments/20060722130642/crawl_generate/part-00001 <r 2> 124
/user/root/crawld/segments/20060722130642/crawl_generate/part-00002 <r 2> 124
/user/root/crawld/segments/20060722130642/crawl_generate/part-00003 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse <dir>
/user/root/crawld/segments/20060722130642/crawl_parse/part-00000 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse/part-00001 <r 2> 62
/user/root/crawld/segments/20060722130642/crawl_parse/part-00002 <r 2> 784
/user/root/crawld/segments/20060722130642/crawl_parse/part-00003 <r 2> 1698
/user/root/crawld/segments/20060722130642/parse_data <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00000 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00000/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_data/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_data/part-00001 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00001/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_data/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_data/part-00002 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00002/data <r 2> 839
/user/root/crawld/segments/20060722130642/parse_data/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_data/part-00003 <dir>
/user/root/crawld/segments/20060722130642/parse_data/part-00003/data <r 2> 1798
/user/root/crawld/segments/20060722130642/parse_data/part-00003/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_text <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00000 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00000/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_text/part-00000/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_text/part-00001 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00001/data <r 2> 61
/user/root/crawld/segments/20060722130642/parse_text/part-00001/index <r 2> 33
/user/root/crawld/segments/20060722130642/parse_text/part-00002 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00002/data <r 2> 377
/user/root/crawld/segments/20060722130642/parse_text/part-00002/index <r 2> 74
/user/root/crawld/segments/20060722130642/parse_text/part-00003 <dir>
/user/root/crawld/segments/20060722130642/parse_text/part-00003/data <r 2> 811
/user/root/crawld/segments/20060722130642/parse_text/part-00003/index <r 2> 74
/user/root/urls <dir>
/user/root/urls/urls.txt <r 2> 64
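# One step the listing above does not show: in the usual Nutch 0.8
# workflow you also update the crawl database with the results of the
# fetch before moving on, so the next generate pass knows what has
# already been fetched. If you plan to recrawl, run it here (same
# segment name as above):
root@LSearchDev01:/nutch/search# bin/nutch updatedb /user/root/crawld /user/root/crawld/segments/20060722130642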
# Now you need to run the invertlinks job
root@LSearchDev01:/nutch/search# bin/nutch invertlinks /user/root/crawld/linkdb /user/root/crawld/segments/20060722130642
# And at the end you need to build your index
root@LSearchDev01:/nutch/search# bin/nutch index /user/root/crawld/indexes /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722130642
root@LSearchDev01:/nutch/search# bin/hadoop dfs -ls /user/root/crawld
Found 4 items
/user/root/crawld/current <dir>
/user/root/crawld/indexes <dir>
/user/root/crawld/linkdb <dir>
/user/root/crawld/segments <dir>
root@LSearchDev01:/nutch/search#
At the end of your hard work you have these directories on your Hadoop
file system, so you are ready to start Tomcat.
Before you start Tomcat, remember to change the path of your search
directory in the nutch-site.xml file in the webapps/ROOT/WEB-INF/classes
directory.
# This is an example of my configuration
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>LSearchDev01:9000</value>
</property>
<property>
<name>searcher.dir</name>
<value>/user/root/crawld</value>
</property>
</configuration>
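After editing the file, restart Tomcat so the change is picked up; with
a stock Tomcat install that is something like the following (the
install path /opt/tomcat is an assumption for your machine):
root@LSearchDev01:/nutch/search# /opt/tomcat/bin/shutdown.sh
root@LSearchDev01:/nutch/search# /opt/tomcat/bin/startup.sh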
# RECRAWL AND NEW INJECT
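# The index commands below assume you have already generated and
# fetched two new segments (20060722153133 and 20060722182213 in my
# case). One recrawl pass that produces such a segment looks roughly
# like this sketch, where <new-segment> stands for whatever timestamp
# generate prints for you:
root@LSearchDev01:/nutch/search# bin/nutch generate /user/root/crawld /user/root/crawld/segments
root@LSearchDev01:/nutch/search# bin/nutch fetch /user/root/crawld/segments/<new-segment>
root@LSearchDev01:/nutch/search# bin/nutch updatedb /user/root/crawld /user/root/crawld/segments/<new-segment>
root@LSearchDev01:/nutch/search# bin/nutch invertlinks /user/root/crawld/linkdb /user/root/crawld/segments/<new-segment>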
# Create a new index indexe0 from the first new segment
bin/nutch index /user/root/crawld/indexe0 /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722153133
# Create a new index indexe1 from the second new segment
bin/nutch index /user/root/crawld/indexe1 /user/root/crawld/ /user/root/crawld/linkdb /user/root/crawld/segments/20060722182213
# Dedup the new indexe0
bin/nutch dedup /user/root/crawld/indexe0
# Dedup the new indexe1
bin/nutch dedup /user/root/crawld/indexe1
# Delete the old merged index
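# For example, assuming the old merged index from a previous run lives
# at /user/root/crawld/index (newer Hadoop versions use dfs -rmr for
# directories):
root@LSearchDev01:/nutch/search# bin/hadoop dfs -rm /user/root/crawld/index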
# Merge the new indexes into the merged index directory
bin/nutch merge /user/root/crawld/index /user/root/crawld/indexe0 /user/root/crawld/indexe1 ... #(and the other indexes created for the fetched segments)
# index is the standard directory in the crawld (DB) where the merged
# master index lives
I hope this helps someone build their first search engine on Nutch 0.8 +
Hadoop :)
Best crawling
Roberto Navoni