Posted to commits@nutch.apache.org by cu...@apache.org on 2006/02/15 23:45:34 UTC

svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java

Author: cutting
Date: Wed Feb 15 14:45:31 2006
New Revision: 378107

URL: http://svn.apache.org/viewcvs?rev=378107&view=rev
Log:
Fix Fetcher to disable speculative execution, to keep it polite.  Also upgrade to the latest Hadoop jar, which supports this feature.  Note that Hadoop's environment specification has changed: all environment variables are now settable from conf/hadoop-env.sh, and the slaves file now lives in conf/ rather than in one's home directory.
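
As a rough sketch of what the new layout means for a deployment (the copy-then-edit step is inferred from the *.template naming and the updated conf/ svn:ignore below; it is not spelled out in this commit):

    # make local, svn-ignored copies of the new templates
    cp conf/hadoop-env.sh.template conf/hadoop-env.sh
    cp conf/slaves.template conf/slaves
    # then uncomment and set variables such as JAVA_HOME in conf/hadoop-env.sh,
    # and list the worker hosts in conf/slaves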

Added:
    lucene/nutch/trunk/conf/hadoop-env.sh.template
    lucene/nutch/trunk/conf/slaves.template
Modified:
    lucene/nutch/trunk/conf/   (props changed)
    lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Propchange: lucene/nutch/trunk/conf/
------------------------------------------------------------------------------
--- svn:ignore (original)
+++ svn:ignore Wed Feb 15 14:45:31 2006
@@ -1,5 +1,4 @@
-nutch-site.xml
-regex-normalize.xml
-crawl-urlfilter.txt
-regex-urlfilter.txt
-mapred-default.xml
+*.xml
+*.txt
+*.sh
+slaves

Added: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (added)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1,25 @@
+# Set Hadoop-specific environment variables here.
+
+# The java implementation to use.
+# export JAVA_HOME=/usr/bin/java
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HADOOP_HEAPSIZE=2000
+
+# Extra Java runtime options.  Empty by default.
+# export HADOOP_OPTS=-server
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
+# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves
+
+# host:path where hadoop code should be rsync'd from.  Unset by default.
+# export HADOOP_MASTER=master:/home/$USER/src/hadoop
+
+# The directory where pid files are stored. /tmp by default.
+# export HADOOP_PID_DIR=/var/hadoop/pids
+
+# A string representing this instance of hadoop. $USER by default.
+# export HADOOP_IDENT_STRING=$USER

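Everything in hadoop-env.sh.template ships commented out; a hedged illustration of the kind of overrides a site might uncomment in its own conf/hadoop-env.sh (the JDK path below is illustrative, not a value from this commit; the other values come from the template's comments):

    export JAVA_HOME=/usr/java/jdk1.5.0_06    # site-specific JDK location (illustrative)
    export HADOOP_HEAPSIZE=2000               # heap in MB; the default is 1000
    export HADOOP_PID_DIR=/var/hadoop/pids    # keep pid files out of /tmp
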
Added: lucene/nutch/trunk/conf/slaves.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/slaves.template (added)
+++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1 @@
+localhost

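The template lists only localhost; for a multi-machine cluster the usual convention (general Hadoop practice, not stated in this commit) is one worker hostname per line in conf/slaves, e.g.:

    node1.example.com
    node2.example.com
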
Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107&r1=378106&r2=378107&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107&r1=378106&r2=378107&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 15 14:45:31 2006
@@ -348,6 +348,9 @@
     job.set(SEGMENT_NAME_KEY, segment.getName());
     job.setBoolean("fetcher.parse", parsing);
 
+    // for politeness, don't permit parallel execution of a single task
+    job.setBoolean("mapred.speculative.execution", false);
+
     job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
     job.setInputFormat(InputFormat.class);
     job.setInputKeyClass(UTF8.class);