You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2007/03/01 11:32:14 UTC
[Nutch Wiki] Update of "Nutch0.9-Hadoop0.10-Tutorial" by mozdevil

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.

The following page has been changed by mozdevil:
http://wiki.apache.org/nutch/Nutch0%2e9-Hadoop0%2e10-Tutorial

The comment on the change is:
Replaced export variables with absolute path names

------------------------------------------------------------------------------
  
  Nutch is written in Java, so the java compiler and runtime are needed as well as ant. Hadoop makes use of ssh clients and servers on all machines. Lucene needs a servlet container; I used tomcat5.
  
+ To be able to log in as root with su, execute the following command and enter the new password for root when prompted.
+ {{{
+ sudo passwd
+ }}}
+ Log in as root
  {{{
  su
- #enable the universe and multiverse repositories.
+ }}}
+ 
+ Enable the universe and multiverse repositories by editing the apt sources.list file.
+ {{{
  vi /etc/apt/sources.list 
- #on all the machines
+ }}}
+ Or execute the following if you are in the Netherlands and are using Ubuntu 6.06 Dapper.
+ {{{
+ echo "deb http://nl.archive.ubuntu.com/ubuntu/ dapper universe multiverse" >> /etc/apt/sources.list
+ echo "deb-src http://nl.archive.ubuntu.com/ubuntu/ dapper universe multiverse" >> /etc/apt/sources.list
+ }}}
+ 
+ Install the necessary packages for Nutch (java and ssh) on all machines
+ {{{
  apt-get install sun-java5-jre
  apt-get install ssh
  
  update-alternatives --config java
- #and select /usr/lib/jvm/java-1.5.0-sun/jre/bin/java
+ #select /usr/lib/jvm/java-1.5.0-sun/jre/bin/java
+ }}}
  
- #only for the search web application
+ And for the search web server 
+ {{{
  apt-get install apache2
  apt-get install sun-java5-jdk
  apt-get install tomcat5
+ }}}
  
+ Configure tomcat by editing /etc/default/tomcat5
+ {{{ 
  vi /etc/default/tomcat5
  #Add JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun/
+ }}}
+ Or execute the following
+ {{{
+ echo "JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun/" >> /etc/default/tomcat5
  }}}
  
  == Build nutch ==
@@ -34, +59 @@

  
  Unpack the tarball to nutch-nightly and build it with ant.
  {{{
- export NUTCH_BUILD_DIR=~/nutch-build
  tar -xvzf nutch-2007-02-06.tar.gz
  cd nutch-nightly
- mkdir ${NUTCH_BUILD_DIR}
- echo ${NUTCH_BUILD_DIR} >> build.properties
+ mkdir /nutch-build
+ echo "/nutch-build" >> build.properties
  ant package
  }}}
  
@@ -47, +71 @@

  Create the nutch user on each machine and create the necessary directories for nutch
  {{{
  ssh root@???
- export NUTCH_INSTALL_DIR=/nutch-0.9.0
- mkdir ${NUTCH_INSTALL_DIR}
- mkdir ${NUTCH_INSTALL_DIR}/search
- mkdir ${NUTCH_INSTALL_DIR}/filesystem
- mkdir ${NUTCH_INSTALL_DIR}/local
- mkdir ${NUTCH_INSTALL_DIR}/home
+ 
+ mkdir /nutch-0.9.0
+ mkdir /nutch-0.9.0/search
+ mkdir /nutch-0.9.0/filesystem
+ mkdir /nutch-0.9.0/local
+ mkdir /nutch-0.9.0/home
  
  groupadd users
- useradd -d ${NUTCH_INSTALL_DIR}/home -g users nutch
+ useradd -d /nutch-0.9.0/home -g users nutch
  passwd nutch
  
- chown -R nutch:users ${NUTCH_INSTALL_DIR}
+ chown -R nutch:users /nutch-0.9.0
  exit
  }}}
  
@@ -66, +90 @@

  Install nutch on the namenode (the master) and add the following variables to the hadoop-env.sh shell script.
  {{{
  ssh nutch@???
+ cp -Rv /nutch-build/* /nutch-0.9.0/search/
- export NUTCH_INSTALL_DIR=/nutch-0.9.0
- cp -Rv ${NUTCH_BUILD_DIR}/* ${NUTCH_INSTALL_DIR}/search/
- #chown -R nutch:users ${NUTCH_INSTALL_DIR}
  
- echo "export HADOOP_HOME="${NUTCH_INSTALL_DIR}"/search" >> ${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export HADOOP_HOME=/nutch-0.9.0/search" >> /nutch-0.9.0/search/conf/hadoop-env.sh
- echo "export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun" >> ${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun" >> /nutch-0.9.0/search/conf/hadoop-env.sh
- echo "export HADOOP_LOG_DIR=\${HADOOP_HOME}/logs" >> ${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
- echo "export HADOOP_SLAVES=\${HADOOP_HOME}/conf/slaves" >> ${NUTCH_INSTALL_DIR}/search/conf/hadoop-env.sh
+ echo "export HADOOP_LOG_DIR=/nutch-0.9.0/search/logs" >> /nutch-0.9.0/search/conf/hadoop-env.sh
+ echo "export HADOOP_SLAVES=/nutch-0.9.0/search/conf/slaves" >> /nutch-0.9.0/search/conf/hadoop-env.sh
  
  exit
  }}}
@@ -81, +103 @@

  === Configure SSH ===
  Create ssh keys so that the nutch user can login over ssh without being prompted for a password.
  {{{
- ssh nutch@localhost
- cd ${NUTCH_INSTALL_DIR}/home
+ ssh nutch@???
+ cd /nutch-0.9.0/home
+ ssh-keygen -t rsa
+ }}}
+ 
+ {{{
- ssh-keygen -t rsa (Use empty responses for each prompt)
+ #! Use empty responses for each prompt
-   Enter passphrase (empty for no passphrase): 
+ #  Enter passphrase (empty for no passphrase): 
-   Enter same passphrase again: 
+ #  Enter same passphrase again: 
-   Your identification has been saved in ${NUTCH_INSTALL_DIR}/home/.ssh/id_rsa.
+ #  Your identification has been saved in /nutch-0.9.0/home/.ssh/id_rsa.
-   Your public key has been saved in ${NUTCH_INSTALL_DIR}/home/.ssh/id_rsa.pub.
+ #  Your public key has been saved in /nutch-0.9.0/home/.ssh/id_rsa.pub.
-   The key fingerprint is:
+ #  The key fingerprint is:
-   a6:5c:c3:eb:18:94:0b:06:a1:a6:29:58:fa:80:0a:bc nutch@localhost
+ #  a6:5c:c3:eb:18:94:0b:06:a1:a6:29:58:fa:80:0a:bc nutch@localhost
  }}}
  
  Copy the key for this machine to the authorized_keys file that will be copied to the other machines (the slaves).
  {{{
- cd ${NUTCH_INSTALL_DIR}/home/.ssh
+ cd /nutch-0.9.0/home/.ssh
  cp id_rsa.pub authorized_keys
  }}}
  
@@ -176, +202 @@

  
  <property>
    <name>dfs.name.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/name</value>
+   <value>/nutch-0.9.0/filesystem/name</value>
  </property>
  
  <property>
    <name>dfs.data.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/data</value>
+   <value>/nutch-0.9.0/filesystem/data</value>
  </property>
  
  <property>
    <name>mapred.system.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/mapreduce/system</value>
+   <value>/nutch-0.9.0/filesystem/mapreduce/system</value>
  </property>
  
  <property>
    <name>mapred.local.dir</name>
-   <value>${NUTCH_INSTALL_DIR}/filesystem/mapreduce/local</value>
+   <value>/nutch-0.9.0/filesystem/mapreduce/local</value>
  </property>
  
  <property>
@@ -279, +305 @@

  === Distribute the code and the configuration ===
  Copy the code and the configuration to the slaves
  {{{
- scp -r ${NUTCH_INSTALL_DIR}/search/* nutch@???:${NUTCH_INSTALL_DIR}/search
+ scp -r /nutch-0.9.0/search/* nutch@???:/nutch-0.9.0/search
  }}}
  
  Copy the keys to the slave machines
  {{{
- scp ${NUTCH_INSTALL_DIR}/home/.ssh/authorized_keys nutch@???:${NUTCH_INSTALL_DIR}/home/.ssh/authorized_keys
+ scp /nutch-0.9.0/home/.ssh/authorized_keys nutch@???:/nutch-0.9.0/home/.ssh/authorized_keys
  }}}
  
  Check if sshd is ready on the machines
@@ -334, +360 @@

  Because the searching needs different settings for nutch than for crawling, the easiest thing to do is to make a separate folder for the nutch search part.
  {{{
  ssh root@???
+ mkdir /nutchsearch-0.9.0
+ chown nutch:users /nutchsearch-0.9.0
- export NUTCH_BUILD_DIR=~/nutch-build
- export SEARCH_INSTALL_DIR=/nutch-search-0.9.0
- mkdir ${SEARCH_INSTALL_DIR}
- chown nutch:users ${SEARCH_INSTALL_DIR}
  exit
  
  ssh nutch@???
+ cp -Rv /nutch-build /nutchsearch-0.9.0/search
+ mkdir /nutchsearch-0.9.0/local
- export SEARCH_INSTALL_DIR=/nutch-search-0.9.0
- cp -Rv ${NUTCH_BUILD_DIR}/search ${SEARCH_INSTALL_DIR}/search
- mkdir ${SEARCH_INSTALL_DIR}/local
  }}}
  
  === Configure ===
@@ -363, +386 @@

  
    <property>
      <name>searcher.dir</name>
-     <value>${SEARCH_INSTALL_DIR}/local/crawled</value>
+     <value>/nutchsearch-0.9.0/local/crawled</value>
    </property>
  
  </configuration>
@@ -384, +407 @@

  === Make a local index ===
  Copy the data from dfs to the local filesystem.
  {{{
- bin/hadoop dfs -copyToLocal crawled ${SEARCH_INSTALL_DIR}/local/
+ bin/hadoop dfs -copyToLocal crawled /nutchsearch-0.9.0/local/
  }}}
  
  Test if all is configured properly
@@ -397, +420 @@

  Copy the war file to the tomcat directory
  {{{
  rm -rf /usr/share/tomcat5/webapps/ROOT*
- cp ${SEARCH_INSTALL_DIR}/*.war /usr/share/tomcat5/webapps/ROOT.war
+ cp /nutchsearch-0.9.0/*.war /usr/share/tomcat5/webapps/ROOT.war
  }}}
  
  Copy the configuration to the tomcat directory
  {{{
- cp ${SEARCH_INSTALL_DIR}/search/conf/* /usr/share/tomcat5/webapps/ROOT/WEB-INF/classes/
+ cp /nutchsearch-0.9.0/search/conf/* /usr/share/tomcat5/webapps/ROOT/WEB-INF/classes/
  }}}
  
  Start tomcat 
@@ -417, +440 @@

  Prepare the other machines that are going to host a part of the index.
  {{{
  ssh root@???
+ mkdir /nutchsearch-0.9.0
+ mkdir /nutchsearch-0.9.0/search
+ chown -R nutch:users /nutchsearch-0.9.0
- export SEARCH_INSTALL_DIR=/nutchsearch-0.9.0
- mkdir ${SEARCH_INSTALL_DIR}
- mkdir ${SEARCH_INSTALL_DIR}/search
- chown -R nutch:users ${SEARCH_INSTALL_DIR}
  exit
  }}}
  
  Copy the search install directory to other machines.
  {{{
- scp -r ${SEARCH_INSTALL_DIR}/search nutch@???:${SEARCH_INSTALL_DIR}/search
+ scp -r /nutchsearch-0.9.0/search nutch@???:/nutchsearch-0.9.0/search
  }}}
  
  === Configure ===
@@ -454, +476 @@

  
    <property>
      <name>searcher.dir</name>
-     <value>${SEARCH_INSTALL_DIR}/search/conf/</value>
+     <value>/nutchsearch-0.9.0/search/conf/</value>
    </property>
  
  </configuration>
@@ -466, +488 @@

  Copy each part of the index to a different machine.
  {{{
  ???
- scp -R ${SEARCH_INSTALL_DIR}/local/partX/crawled nutch@???:${SEARCH_INSTALL_DIR}/local/
+ scp -r /nutchsearch-0.9.0/local/partX/crawled nutch@???:/nutchsearch-0.9.0/local/
  }}}
  
  === Start the services ===
  Startup the search services on all the machines that have a part of the index.
  {{{
- bin/nutch server 9999 ${SEARCH_INSTALL_DIR}/local/crawled
+ bin/nutch server 9999 /nutchsearch-0.9.0/local/crawled
  }}}
  
  Restart the master search node