Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/14 16:40:02 UTC

svn commit: r964063 - in /nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/plugin/ src/java/org/apache/nutch/scoring/webgraph/ src/java/org/apache/nutch/util/ src/test/org/apache/nutch/plugin/

Author: ab
Date: Wed Jul 14 14:40:01 2010
New Revision: 964063

URL: http://svn.apache.org/viewvc?rev=964063&view=rev
Log:
NUTCH-844 Improve NutchConfiguration.

Removed:
    nutch/trunk/conf/crawl-tool.xml
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
    nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
    nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 14 14:40:01 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-844 Improve NutchConfiguration (ab)
+
 * NUTCH-850 SolrDeleteDuplicates needs to clone the SolrRecord objects (jnioche)
 
 * NUTCH-845 Native hadoop libs not available through maven (ab)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Wed Jul 14 14:40:01 2010
@@ -28,6 +28,8 @@ import org.apache.commons.logging.LogFac
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
 import org.apache.nutch.indexer.solr.SolrIndexer;
@@ -37,7 +39,7 @@ import org.apache.nutch.util.NutchJob;
 
 import org.apache.nutch.fetcher.Fetcher;
 
-public class Crawl {
+public class Crawl extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Crawl.class);
 
   private static String getDate() {
@@ -48,18 +50,21 @@ public class Crawl {
 
   /* Perform complete crawling and indexing given a set of root urls. */
   public static void main(String args[]) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    int res = ToolRunner.run(conf, new Crawl(), args);
+    System.exit(res);
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
     if (args.length < 1) {
       System.out.println
       ("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
-      return;
+      return -1;
     }
-
-    Configuration conf = NutchConfiguration.createCrawlConfiguration();
-    JobConf job = new NutchJob(conf);
-
     Path rootUrlDir = null;
     Path dir = new Path("crawl-" + getDate());
-    int threads = job.getInt("fetcher.threads.fetch", 10);
+    int threads = getConf().getInt("fetcher.threads.fetch", 10);
     int depth = 5;
     long topN = Long.MAX_VALUE;
     String solrUrl = null;
@@ -85,6 +90,8 @@ public class Crawl {
       }
     }
     
+    JobConf job = new NutchJob(getConf());
+
     if (solrUrl == null) {
       LOG.warn("solrUrl is not set, indexing will be skipped...");
     }
@@ -108,12 +115,12 @@ public class Crawl {
     Path index = new Path(dir + "/index");
 
     Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
-    Injector injector = new Injector(conf);
-    Generator generator = new Generator(conf);
-    Fetcher fetcher = new Fetcher(conf);
-    ParseSegment parseSegment = new ParseSegment(conf);
-    CrawlDb crawlDbTool = new CrawlDb(conf);
-    LinkDb linkDbTool = new LinkDb(conf);
+    Injector injector = new Injector(getConf());
+    Generator generator = new Generator(getConf());
+    Fetcher fetcher = new Fetcher(getConf());
+    ParseSegment parseSegment = new ParseSegment(getConf());
+    CrawlDb crawlDbTool = new CrawlDb(getConf());
+    LinkDb linkDbTool = new LinkDb(getConf());
       
     // initialize crawlDb
     injector.inject(crawlDb, rootUrlDir);
@@ -125,7 +132,7 @@ public class Crawl {
         LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
         break;
       }
-      fetcher.fetch(segs[0], threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf));  // fetch it
+      fetcher.fetch(segs[0], threads, org.apache.nutch.fetcher.Fetcher.isParsing(getConf()));  // fetch it
       if (!Fetcher.isParsing(job)) {
         parseSegment.parse(segs[0]);    // parse it, if needed
       }
@@ -137,11 +144,11 @@ public class Crawl {
       if (solrUrl != null) {
         // index, dedup & merge
         FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
-        SolrIndexer indexer = new SolrIndexer(conf);
+        SolrIndexer indexer = new SolrIndexer(getConf());
         indexer.indexSolr(solrUrl, crawlDb, linkDb, 
           Arrays.asList(HadoopFSUtil.getPaths(fstats)));
         SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
-        dedup.setConf(conf);
+        dedup.setConf(getConf());
         dedup.dedup(solrUrl);
       }
       
@@ -149,5 +156,8 @@ public class Crawl {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }
     if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
+    return 0;
   }
+
+
 }
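
With this change Crawl becomes a standard Hadoop Tool driven by ToolRunner, so the NutchConfiguration created in main() reaches the job logic through getConf(). Below is a minimal sketch of that Tool/ToolRunner pattern, not part of this commit: MyTool is a placeholder class name and the property it reads is only illustrative.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    // Minimal sketch of the Tool/ToolRunner pattern Crawl now follows.
    // "MyTool" is a placeholder, not a class in this commit.
    public class MyTool extends Configured implements Tool {

      @Override
      public int run(String[] args) throws Exception {
        // getConf() returns the Configuration injected by ToolRunner,
        // already updated with generic options such as -D key=value.
        Configuration conf = getConf();
        System.out.println("fetcher.threads.fetch = "
            + conf.getInt("fetcher.threads.fetch", 10));
        return 0;
      }

      public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic Hadoop options, applies them to the
        // supplied Configuration, and passes the remaining args to run().
        int res = ToolRunner.run(new Configuration(), new MyTool(), args);
        System.exit(res);
      }
    }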

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Wed Jul 14 14:40:01 2010
@@ -43,7 +43,7 @@ import org.apache.nutch.util.NutchConfig
  * @author joa23
  */
 public class PluginRepository {
-  private static final WeakHashMap<Configuration, PluginRepository> CACHE = new WeakHashMap<Configuration, PluginRepository>();
+  private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<String, PluginRepository>();
 
   private boolean auto;
 
@@ -90,10 +90,14 @@ public class PluginRepository {
    * @return a cached instance of the plugin repository
    */
   public static synchronized PluginRepository get(Configuration conf) {
-    PluginRepository result = CACHE.get(conf);
+    String uuid = NutchConfiguration.getUUID(conf);
+    if (uuid == null) {
+      uuid = "nonNutchConf@" + conf.hashCode(); // fallback
+    }
+    PluginRepository result = CACHE.get(uuid);
     if (result == null) {
       result = new PluginRepository(conf);
-      CACHE.put(conf, result);
+      CACHE.put(uuid, result);
     }
     return result;
   }
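
The repository cache is now keyed by the UUID string that NutchConfiguration stamps into every Configuration it creates, so equal-but-distinct copies of a Nutch configuration (for example a NutchJob built from it) resolve to the same PluginRepository. A simplified sketch of the lookup follows, based only on the hunk above; RepositoryCacheSketch and its plain Object values are stand-ins, not real Nutch classes.

    import java.util.WeakHashMap;
    import org.apache.hadoop.conf.Configuration;

    // Sketch of the UUID-keyed caching done in PluginRepository.get().
    public class RepositoryCacheSketch {
      private static final WeakHashMap<String, Object> CACHE =
          new WeakHashMap<String, Object>();

      public static synchronized Object get(Configuration conf) {
        // Nutch-created configurations carry a "nutch.conf.uuid" property,
        // so copies derived from the same config share one cache key.
        String uuid = conf.get("nutch.conf.uuid");
        if (uuid == null) {
          // Configurations created outside Nutch fall back to hashCode().
          uuid = "nonNutchConf@" + conf.hashCode();
        }
        Object result = CACHE.get(uuid);
        if (result == null) {
          result = new Object(); // the real code builds new PluginRepository(conf)
          CACHE.put(uuid, result);
        }
        return result;
      }
    }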

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Wed Jul 14 14:40:01 2010
@@ -26,6 +26,7 @@ import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
@@ -39,11 +40,18 @@ import org.apache.nutch.util.NutchConfig
 /**
  * The LoopReader tool prints the loopset information for a single url.
  */
-public class LoopReader {
+public class LoopReader extends Configured {
 
-  private Configuration conf;
   private FileSystem fs;
   private MapFile.Reader[] loopReaders;
+  
+  public LoopReader() {
+    
+  }
+  
+  public LoopReader(Configuration conf) {
+    super(conf);
+  }
 
   /**
    * Prints loopset for a single url.  The loopset information will show any
@@ -58,10 +66,9 @@ public class LoopReader {
     throws IOException {
 
     // open the readers
-    conf = NutchConfiguration.create();
-    fs = FileSystem.get(conf);
+    fs = FileSystem.get(getConf());
     loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
-      Loops.LOOPS_DIR), conf);
+      Loops.LOOPS_DIR), getConf());
 
     // get the loopset for a given url, if any
     Text key = new Text(url);
@@ -110,7 +117,7 @@ public class LoopReader {
 
       String webGraphDb = line.getOptionValue("webgraphdb");
       String url = line.getOptionValue("url");
-      LoopReader reader = new LoopReader();
+      LoopReader reader = new LoopReader(NutchConfiguration.create());
       reader.dumpUrl(new Path(webGraphDb), url);
       return;
     }

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Wed Jul 14 14:40:01 2010
@@ -26,6 +26,7 @@ import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
@@ -39,12 +40,19 @@ import org.apache.nutch.util.NutchConfig
  * Reads and prints to system out information for a single node from the NodeDb 
  * in the WebGraph.
  */
-public class NodeReader {
+public class NodeReader extends Configured {
 
-  private Configuration conf;
   private FileSystem fs;
   private MapFile.Reader[] nodeReaders;
 
+  public NodeReader() {
+    
+  }
+  
+  public NodeReader(Configuration conf) {
+    super(conf);
+  }
+  
   /**
    * Prints the content of the Node represented by the url to system out.
    * 
@@ -56,10 +64,9 @@ public class NodeReader {
   public void dumpUrl(Path webGraphDb, String url)
     throws IOException {
 
-    conf = NutchConfiguration.create();
-    fs = FileSystem.get(conf);
+    fs = FileSystem.get(getConf());
     nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
-      WebGraph.NODE_DIR), conf);
+      WebGraph.NODE_DIR), getConf());
 
     // open the readers, get the node, print out the info, and close the readers
     Text key = new Text(url);
@@ -108,7 +115,7 @@ public class NodeReader {
       // dump the values to system out and return
       String webGraphDb = line.getOptionValue("webgraphdb");
       String url = line.getOptionValue("url");
-      NodeReader reader = new NodeReader();
+      NodeReader reader = new NodeReader(NutchConfiguration.create());
       reader.dumpUrl(new Path(webGraphDb), url);
       
       return;
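
LoopReader and NodeReader now extend Configured and receive their Configuration from the caller instead of creating one inside dumpUrl(). A brief usage sketch under that assumption; the webgraphdb path and the URL below are illustrative values only.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.scoring.webgraph.NodeReader;
    import org.apache.nutch.util.NutchConfiguration;

    // Sketch of calling NodeReader after this change: the caller supplies
    // the Configuration rather than the reader building its own.
    public class NodeReaderUsageSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        NodeReader reader = new NodeReader(conf);
        // Path and URL are placeholders for a real WebGraphDb and node key.
        reader.dumpUrl(new Path("/data/webgraphdb"), "http://example.com/");
      }
    }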

Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Wed Jul 14 14:40:01 2010
@@ -17,93 +17,77 @@
 
 package org.apache.nutch.util;
 
-// JDK imports
-import java.util.Enumeration;
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.UUID;
 
-// Servlet imports
-import javax.servlet.ServletContext;
-
-// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.WritableName;
 
 
 /** Utility to create Hadoop {@link Configuration}s that include Nutch-specific
  * resources.  */
 public class NutchConfiguration {
-  
-  private final static String KEY = NutchConfiguration.class.getName();
+  public static final String UUID_KEY = "nutch.conf.uuid";
   
   private NutchConfiguration() {}                 // singleton
-
-  // for back-compatibility, add old aliases for these Writable classes
-  // this may be removed after the 0.8 release
-  static {
-    WritableName.addName(org.apache.nutch.parse.ParseData.class, "ParseData"); 
-    WritableName.addName(org.apache.nutch.parse.ParseText.class, "ParseText"); 
-    WritableName.addName(org.apache.nutch.protocol.Content.class, "Content");
+  
+  /*
+   * Configuration.hashCode() doesn't return values that
+   * correspond to a unique set of parameters. This is a workaround
+   * so that we can track instances of Configuration created by Nutch.
+   */
+  private static void setUUID(Configuration conf) {
+    UUID uuid = UUID.randomUUID();
+    conf.set(UUID_KEY, uuid.toString());
   }
-
-  /** Create a {@link Configuration} for Nutch. */
-  public static Configuration create() {
-    Configuration conf = new Configuration();
-    addNutchResources(conf, false);
-    return conf;
+  
+  /**
+   * Retrieve a Nutch UUID of this configuration object, or null
+   * if the configuration was created elsewhere.
+   * @param conf configuration instance
+   * @return uuid or null
+   */
+  public static String getUUID(Configuration conf) {
+    return conf.get(UUID_KEY);
   }
 
-  /**
-   * Create a {@link Configuration for Nutch invoked with the command
-   * line crawl command, i.e. bin/nutch crawl ...
+  /** Create a {@link Configuration} for Nutch. This will load the standard
+   * Nutch resources, <code>nutch-default.xml</code> and
+   * <code>nutch-site.xml</code> overrides.
    */
-  public static Configuration createCrawlConfiguration() {
+  public static Configuration create() {
     Configuration conf = new Configuration();
-    addNutchResources(conf, true);
+    setUUID(conf);
+    addNutchResources(conf);
     return conf;
   }
-
-  /**
-   * Create a {@link Configuration} for Nutch front-end.
-   *
-   * If a {@link Configuration} is found in the
-   * {@link javax.servlet.ServletContext} it is simply returned, otherwise,
-   * a new {@link Configuration} is created using the {@link #create()} method,
-   * and then all the init parameters found in the
-   * {@link javax.servlet.ServletContext} are added to the {@link Configuration}
-   * (the created {@link Configuration} is then saved into the
-   * {@link javax.servlet.ServletContext}).
-   *
-   * @param application is the ServletContext whose init parameters
-   *        must override those of Nutch.
+  
+  /** Create a {@link Configuration} from supplied properties.
+   * @param addNutchResources if true, then first <code>nutch-default.xml</code>,
+   * and then <code>nutch-site.xml</code> will be loaded prior to applying the
+   * properties. Otherwise these resources won't be used.
+   * @param nutchProperties a set of properties to define (or override)
    */
-  public static Configuration get(ServletContext application) {
-    Configuration conf = (Configuration) application.getAttribute(KEY);
-    if (conf == null) {
-      conf = create();
-      Enumeration e = application.getInitParameterNames();
-      while (e.hasMoreElements()) {
-        String name = (String) e.nextElement();
-        conf.set(name, application.getInitParameter(name));
-      }
-      application.setAttribute(KEY, conf);
+  public static Configuration create(boolean addNutchResources, Properties nutchProperties) {
+    Configuration conf = new Configuration();
+    setUUID(conf);
+    if (addNutchResources) {
+      addNutchResources(conf);
+    }
+    for (Entry<Object, Object> e : nutchProperties.entrySet()) {
+      conf.set(e.getKey().toString(), e.getValue().toString());
     }
     return conf;
   }
-  
+
   /**
    * Add the standard Nutch resources to {@link Configuration}.
    * 
    * @param conf               Configuration object to which
    *                           configuration is to be added.
-   * @param crawlConfiguration Whether configuration for command line
-   *                           crawl using 'bin/nutch crawl' command
-   *                           should be added.
    */
-  private static Configuration addNutchResources(Configuration conf,
-                                                 boolean crawlConfiguration) {
+  private static Configuration addNutchResources(Configuration conf) {
     conf.addResource("nutch-default.xml");
-    if (crawlConfiguration) {
-      conf.addResource("crawl-tool.xml");
-    }
     conf.addResource("nutch-site.xml");
     return conf;
   }
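
NutchConfiguration now stamps each configuration it creates with a UUID under nutch.conf.uuid, and the ServletContext and crawl-tool variants are replaced by a create(boolean, Properties) overload. A short usage sketch follows; only the method signatures come from the patch above, the property values are illustrative.

    import java.util.Properties;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    // Sketch of the factory methods provided after this change.
    public class NutchConfUsageSketch {
      public static void main(String[] args) {
        // Standard Nutch configuration: nutch-default.xml then nutch-site.xml,
        // stamped with a per-instance UUID under "nutch.conf.uuid".
        Configuration conf = NutchConfiguration.create();
        System.out.println("uuid = " + NutchConfiguration.getUUID(conf));

        // Configuration built from supplied properties, applied after (and
        // overriding) the standard Nutch resources when the flag is true.
        Properties props = new Properties();
        props.setProperty("fetcher.threads.fetch", "20");
        Configuration conf2 = NutchConfiguration.create(true, props);
        System.out.println("threads = "
            + conf2.getInt("fetcher.threads.fetch", 10));
      }
    }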

Modified: nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java Wed Jul 14 14:40:01 2010
@@ -29,7 +29,9 @@ import java.util.Properties;
 import junit.framework.TestCase;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
 
 /**
  * Unit tests for the plugin system
@@ -95,6 +97,22 @@ public class TestPluginSystem extends Te
         }
     }
 
+    public void testRepositoryCache() {
+      Configuration config = NutchConfiguration.create();
+      PluginRepository repo = PluginRepository.get(config);
+      JobConf job = new NutchJob(config);
+      PluginRepository repo1 = PluginRepository.get(job);
+      assertTrue(repo == repo1);
+      // now construct a config without UUID
+      config = new Configuration();
+      config.addResource("nutch-default.xml");
+      config.addResource("nutch-site.xml");
+      repo = PluginRepository.get(config);
+      job = new NutchJob(config);
+      repo1 = PluginRepository.get(job);
+      assertTrue(repo1 != repo);
+    }
+
     /**
      *  
      */