You are viewing a plain text version of this content. (The hyperlink to the canonical version was not preserved in this plain-text copy.)
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/07/14 16:40:02 UTC
svn commit: r964063 - in /nutch/trunk: ./ conf/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/plugin/
src/java/org/apache/nutch/scoring/webgraph/ src/java/org/apache/nutch/util/
src/test/org/apache/nutch/plugin/
Author: ab
Date: Wed Jul 14 14:40:01 2010
New Revision: 964063
URL: http://svn.apache.org/viewvc?rev=964063&view=rev
Log:
NUTCH-844 Improve NutchConfiguration.
Removed:
nutch/trunk/conf/crawl-tool.xml
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 14 14:40:01 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-844 Improve NutchConfiguration (ab)
+
* NUTCH-850 SolrDeleteDuplicates needs to clone the SolrRecord objects (jnioche)
* NUTCH-845 Native hadoop libs not available through maven (ab)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Wed Jul 14 14:40:01 2010
@@ -28,6 +28,8 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.indexer.solr.SolrIndexer;
@@ -37,7 +39,7 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.fetcher.Fetcher;
-public class Crawl {
+public class Crawl extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(Crawl.class);
private static String getDate() {
@@ -48,18 +50,21 @@ public class Crawl {
/* Perform complete crawling and indexing given a set of root urls. */
public static void main(String args[]) throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ int res = ToolRunner.run(conf, new Crawl(), args);
+ System.exit(res);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
if (args.length < 1) {
System.out.println
("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
- return;
+ return -1;
}
-
- Configuration conf = NutchConfiguration.createCrawlConfiguration();
- JobConf job = new NutchJob(conf);
-
Path rootUrlDir = null;
Path dir = new Path("crawl-" + getDate());
- int threads = job.getInt("fetcher.threads.fetch", 10);
+ int threads = getConf().getInt("fetcher.threads.fetch", 10);
int depth = 5;
long topN = Long.MAX_VALUE;
String solrUrl = null;
@@ -85,6 +90,8 @@ public class Crawl {
}
}
+ JobConf job = new NutchJob(getConf());
+
if (solrUrl == null) {
LOG.warn("solrUrl is not set, indexing will be skipped...");
}
@@ -108,12 +115,12 @@ public class Crawl {
Path index = new Path(dir + "/index");
Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
- Injector injector = new Injector(conf);
- Generator generator = new Generator(conf);
- Fetcher fetcher = new Fetcher(conf);
- ParseSegment parseSegment = new ParseSegment(conf);
- CrawlDb crawlDbTool = new CrawlDb(conf);
- LinkDb linkDbTool = new LinkDb(conf);
+ Injector injector = new Injector(getConf());
+ Generator generator = new Generator(getConf());
+ Fetcher fetcher = new Fetcher(getConf());
+ ParseSegment parseSegment = new ParseSegment(getConf());
+ CrawlDb crawlDbTool = new CrawlDb(getConf());
+ LinkDb linkDbTool = new LinkDb(getConf());
// initialize crawlDb
injector.inject(crawlDb, rootUrlDir);
@@ -125,7 +132,7 @@ public class Crawl {
LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
break;
}
- fetcher.fetch(segs[0], threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it
+ fetcher.fetch(segs[0], threads, org.apache.nutch.fetcher.Fetcher.isParsing(getConf())); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segs[0]); // parse it, if needed
}
@@ -137,11 +144,11 @@ public class Crawl {
if (solrUrl != null) {
// index, dedup & merge
FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
- SolrIndexer indexer = new SolrIndexer(conf);
+ SolrIndexer indexer = new SolrIndexer(getConf());
indexer.indexSolr(solrUrl, crawlDb, linkDb,
Arrays.asList(HadoopFSUtil.getPaths(fstats)));
SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
- dedup.setConf(conf);
+ dedup.setConf(getConf());
dedup.dedup(solrUrl);
}
@@ -149,5 +156,8 @@ public class Crawl {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
+ return 0;
}
+
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Wed Jul 14 14:40:01 2010
@@ -43,7 +43,7 @@ import org.apache.nutch.util.NutchConfig
* @author joa23
*/
public class PluginRepository {
- private static final WeakHashMap<Configuration, PluginRepository> CACHE = new WeakHashMap<Configuration, PluginRepository>();
+ private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<String, PluginRepository>();
private boolean auto;
@@ -90,10 +90,14 @@ public class PluginRepository {
* @return a cached instance of the plugin repository
*/
public static synchronized PluginRepository get(Configuration conf) {
- PluginRepository result = CACHE.get(conf);
+ String uuid = NutchConfiguration.getUUID(conf);
+ if (uuid == null) {
+ uuid = "nonNutchConf@" + conf.hashCode(); // fallback
+ }
+ PluginRepository result = CACHE.get(uuid);
if (result == null) {
result = new PluginRepository(conf);
- CACHE.put(conf, result);
+ CACHE.put(uuid, result);
}
return result;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Wed Jul 14 14:40:01 2010
@@ -26,6 +26,7 @@ import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
@@ -39,11 +40,18 @@ import org.apache.nutch.util.NutchConfig
/**
* The LoopReader tool prints the loopset information for a single url.
*/
-public class LoopReader {
+public class LoopReader extends Configured {
- private Configuration conf;
private FileSystem fs;
private MapFile.Reader[] loopReaders;
+
+ public LoopReader() {
+
+ }
+
+ public LoopReader(Configuration conf) {
+ super(conf);
+ }
/**
* Prints loopset for a single url. The loopset information will show any
@@ -58,10 +66,9 @@ public class LoopReader {
throws IOException {
// open the readers
- conf = NutchConfiguration.create();
- fs = FileSystem.get(conf);
+ fs = FileSystem.get(getConf());
loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
- Loops.LOOPS_DIR), conf);
+ Loops.LOOPS_DIR), getConf());
// get the loopset for a given url, if any
Text key = new Text(url);
@@ -110,7 +117,7 @@ public class LoopReader {
String webGraphDb = line.getOptionValue("webgraphdb");
String url = line.getOptionValue("url");
- LoopReader reader = new LoopReader();
+ LoopReader reader = new LoopReader(NutchConfiguration.create());
reader.dumpUrl(new Path(webGraphDb), url);
return;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Wed Jul 14 14:40:01 2010
@@ -26,6 +26,7 @@ import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
@@ -39,12 +40,19 @@ import org.apache.nutch.util.NutchConfig
* Reads and prints to system out information for a single node from the NodeDb
* in the WebGraph.
*/
-public class NodeReader {
+public class NodeReader extends Configured {
- private Configuration conf;
private FileSystem fs;
private MapFile.Reader[] nodeReaders;
+ public NodeReader() {
+
+ }
+
+ public NodeReader(Configuration conf) {
+ super(conf);
+ }
+
/**
* Prints the content of the Node represented by the url to system out.
*
@@ -56,10 +64,9 @@ public class NodeReader {
public void dumpUrl(Path webGraphDb, String url)
throws IOException {
- conf = NutchConfiguration.create();
- fs = FileSystem.get(conf);
+ fs = FileSystem.get(getConf());
nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
- WebGraph.NODE_DIR), conf);
+ WebGraph.NODE_DIR), getConf());
// open the readers, get the node, print out the info, and close the readers
Text key = new Text(url);
@@ -108,7 +115,7 @@ public class NodeReader {
// dump the values to system out and return
String webGraphDb = line.getOptionValue("webgraphdb");
String url = line.getOptionValue("url");
- NodeReader reader = new NodeReader();
+ NodeReader reader = new NodeReader(NutchConfiguration.create());
reader.dumpUrl(new Path(webGraphDb), url);
return;
Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Wed Jul 14 14:40:01 2010
@@ -17,93 +17,77 @@
package org.apache.nutch.util;
-// JDK imports
-import java.util.Enumeration;
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.UUID;
-// Servlet imports
-import javax.servlet.ServletContext;
-
-// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.WritableName;
/** Utility to create Hadoop {@link Configuration}s that include Nutch-specific
* resources. */
public class NutchConfiguration {
-
- private final static String KEY = NutchConfiguration.class.getName();
+ public static final String UUID_KEY = "nutch.conf.uuid";
private NutchConfiguration() {} // singleton
-
- // for back-compatibility, add old aliases for these Writable classes
- // this may be removed after the 0.8 release
- static {
- WritableName.addName(org.apache.nutch.parse.ParseData.class, "ParseData");
- WritableName.addName(org.apache.nutch.parse.ParseText.class, "ParseText");
- WritableName.addName(org.apache.nutch.protocol.Content.class, "Content");
+
+ /*
+ * Configuration.hashCode() doesn't return values that
+ * correspond to a unique set of parameters. This is a workaround
+ * so that we can track instances of Configuration created by Nutch.
+ */
+ private static void setUUID(Configuration conf) {
+ UUID uuid = UUID.randomUUID();
+ conf.set(UUID_KEY, uuid.toString());
}
-
- /** Create a {@link Configuration} for Nutch. */
- public static Configuration create() {
- Configuration conf = new Configuration();
- addNutchResources(conf, false);
- return conf;
+
+ /**
+ * Retrieve a Nutch UUID of this configuration object, or null
+ * if the configuration was created elsewhere.
+ * @param conf configuration instance
+ * @return uuid or null
+ */
+ public static String getUUID(Configuration conf) {
+ return conf.get(UUID_KEY);
}
- /**
- * Create a {@link Configuration for Nutch invoked with the command
- * line crawl command, i.e. bin/nutch crawl ...
+ /** Create a {@link Configuration} for Nutch. This will load the standard
+ * Nutch resources, <code>nutch-default.xml</code> and
+ * <code>nutch-site.xml</code> overrides.
*/
- public static Configuration createCrawlConfiguration() {
+ public static Configuration create() {
Configuration conf = new Configuration();
- addNutchResources(conf, true);
+ setUUID(conf);
+ addNutchResources(conf);
return conf;
}
-
- /**
- * Create a {@link Configuration} for Nutch front-end.
- *
- * If a {@link Configuration} is found in the
- * {@link javax.servlet.ServletContext} it is simply returned, otherwise,
- * a new {@link Configuration} is created using the {@link #create()} method,
- * and then all the init parameters found in the
- * {@link javax.servlet.ServletContext} are added to the {@link Configuration}
- * (the created {@link Configuration} is then saved into the
- * {@link javax.servlet.ServletContext}).
- *
- * @param application is the ServletContext whose init parameters
- * must override those of Nutch.
+
+ /** Create a {@link Configuration} from supplied properties.
+ * @param addNutchResources if true, then first <code>nutch-default.xml</code>,
+ * and then <code>nutch-site.xml</code> will be loaded prior to applying the
+ * properties. Otherwise these resources won't be used.
+ * @param nutchProperties a set of properties to define (or override)
*/
- public static Configuration get(ServletContext application) {
- Configuration conf = (Configuration) application.getAttribute(KEY);
- if (conf == null) {
- conf = create();
- Enumeration e = application.getInitParameterNames();
- while (e.hasMoreElements()) {
- String name = (String) e.nextElement();
- conf.set(name, application.getInitParameter(name));
- }
- application.setAttribute(KEY, conf);
+ public static Configuration create(boolean addNutchResources, Properties nutchProperties) {
+ Configuration conf = new Configuration();
+ setUUID(conf);
+ if (addNutchResources) {
+ addNutchResources(conf);
+ }
+ for (Entry<Object, Object> e : nutchProperties.entrySet()) {
+ conf.set(e.getKey().toString(), e.getValue().toString());
}
return conf;
}
-
+
/**
* Add the standard Nutch resources to {@link Configuration}.
*
* @param conf Configuration object to which
* configuration is to be added.
- * @param crawlConfiguration Whether configuration for command line
- * crawl using 'bin/nutch crawl' command
- * should be added.
*/
- private static Configuration addNutchResources(Configuration conf,
- boolean crawlConfiguration) {
+ private static Configuration addNutchResources(Configuration conf) {
conf.addResource("nutch-default.xml");
- if (crawlConfiguration) {
- conf.addResource("crawl-tool.xml");
- }
conf.addResource("nutch-site.xml");
return conf;
}
Modified: nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java?rev=964063&r1=964062&r2=964063&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java Wed Jul 14 14:40:01 2010
@@ -29,7 +29,9 @@ import java.util.Properties;
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/**
* Unit tests for the plugin system
@@ -95,6 +97,22 @@ public class TestPluginSystem extends Te
}
}
+ public void testRepositoryCache() {
+ Configuration config = NutchConfiguration.create();
+ PluginRepository repo = PluginRepository.get(config);
+ JobConf job = new NutchJob(config);
+ PluginRepository repo1 = PluginRepository.get(job);
+ assertTrue(repo == repo1);
+ // now construct a config without UUID
+ config = new Configuration();
+ config.addResource("nutch-default.xml");
+ config.addResource("nutch-site.xml");
+ repo = PluginRepository.get(config);
+ job = new NutchJob(config);
+ repo1 = PluginRepository.get(job);
+ assertTrue(repo1 != repo);
+ }
+
/**
*
*/