You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2022/01/08 04:09:29 UTC
[nutch] branch master updated: NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720)
This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new e76d69f NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720)
e76d69f is described below
commit e76d69fe13902fd2f3a98660dd2bac52c2ea568c
Author: Lewis John McGibbney <le...@gmail.com>
AuthorDate: Fri Jan 7 20:07:54 2022 -0800
NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720)
* NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers
Co-authored-by: Hiran Chaudhuri <hi...@mail.de>
---
build.xml | 1 +
src/java/org/apache/nutch/crawl/CrawlDbReader.java | 43 ++--
src/java/org/apache/nutch/parse/ParserChecker.java | 5 +
.../apache/nutch/plugin/PluginManifestParser.java | 66 +++---
.../org/apache/nutch/plugin/PluginRepository.java | 244 +++++++++++++++------
.../nutch/plugin/URLStreamHandlerFactory.java | 115 ++++++++++
.../apache/nutch/util/CrawlCompletionStats.java | 40 ++--
src/java/org/apache/nutch/util/NutchJob.java | 12 +-
src/java/org/apache/nutch/util/NutchTool.java | 9 +
.../org/apache/nutch/util/SitemapProcessor.java | 10 +-
.../apache/nutch/util/domain/DomainStatistics.java | 20 +-
.../apache/nutch/any23/Any23IndexingFilter.java | 2 +-
.../org/apache/nutch/any23/Any23ParseFilter.java | 2 +-
src/plugin/build.xml | 2 +
.../nutch/indexwriter/csv/CSVIndexWriter.java | 2 +-
.../indexwriter/rabbit/RabbitIndexWriter.java | 2 +-
src/plugin/protocol-foo/build.xml | 22 ++
src/plugin/protocol-foo/ivy.xml | 41 ++++
src/plugin/protocol-foo/plugin.xml | 48 ++++
.../java/org/apache/nutch/protocol/foo/Foo.java | 141 ++++++++++++
.../org/apache/nutch/protocol/foo/Handler.java | 28 +++
21 files changed, 696 insertions(+), 159 deletions(-)
diff --git a/build.xml b/build.xml
index ecef1e7..2c0eef0 100644
--- a/build.xml
+++ b/build.xml
@@ -1272,6 +1272,7 @@
<source path="${plugins.dir}/parsefilter-regex/src/test/" />
<source path="${plugins.dir}/protocol-file/src/java/" />
<source path="${plugins.dir}/protocol-file/src/test/" />
+ <source path="${plugins.dir}/protocol-foo/src/java/" />
<source path="${plugins.dir}/protocol-ftp/src/java/" />
<source path="${plugins.dir}/protocol-htmlunit/src/java/" />
<source path="${plugins.dir}/protocol-http/src/java/" />
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 2a20a56..f31210a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -16,11 +16,12 @@
*/
package org.apache.nutch.crawl;
+import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
-import java.io.Closeable;
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
@@ -32,16 +33,11 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
+import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import java.util.TreeMap;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.tdunning.math.stats.MergingDigest;
-import com.tdunning.math.stats.TDigest;
+import org.apache.commons.jexl3.JexlScript;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -55,18 +51,18 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
@@ -74,7 +70,8 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
-import org.apache.commons.jexl3.JexlScript;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
@@ -84,6 +81,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
+import com.tdunning.math.stats.MergingDigest;
+import com.tdunning.math.stats.TDigest;
/**
* Read utility for the CrawlDB.
@@ -375,10 +374,14 @@ public class CrawlDbReader extends AbstractChecker implements Closeable {
context.write(new Text("fit"), fetchInterval);
if (sort) {
- URL u = new URL(key.toString());
- String host = u.getHost();
- context.write(new Text("status " + value.getStatus() + " " + host),
- COUNT_1);
+ try {
+ URL u = new URL(key.toString());
+ String host = u.getHost();
+ context.write(new Text("status " + value.getStatus() + " " + host),
+ COUNT_1);
+ } catch (MalformedURLException e) {
+ LOG.error("Failed to get host from URL {}: {}", key.toString(), e.getMessage());
+ }
}
}
}
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 7b0e76a..6c82a51 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -28,6 +28,7 @@ import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.scoring.ScoringFilters;
@@ -106,6 +107,10 @@ public class ParserChecker extends AbstractChecker {
System.exit(-1);
}
+ // initialize plugins early to register URL stream handlers to support
+ // custom protocol implementations
+ PluginRepository.get(getConf());
+
int numConsumed;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-normalize")) {
diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index d7280ad..4c845b4 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -29,9 +29,9 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
-import org.slf4j.Logger;
-
import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -39,8 +39,9 @@ import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
- * The <code>PluginManifestParser</code> parser just parse the manifest file in
- * all plugin directories.
+ * The <code>PluginManifestParser</code> provides a mechanism for
+ * parsing Nutch plugin manifest files (<code>plugin.xml</code>) contained
+ * in a {@code String[]} of plugin directories.
*
* @author joa23
*/
@@ -49,17 +50,15 @@ public class PluginManifestParser {
private static final String ATTR_CLASS = "class";
private static final String ATTR_ID = "id";
- public static final Logger LOG = PluginRepository.LOG;
+ protected static final Logger LOG = LoggerFactory.getLogger(PluginManifestParser.class);
- private static final boolean WINDOWS = System.getProperty("os.name")
- .startsWith("Windows");
+ private static final boolean WINDOWS = System.getProperty("os.name").startsWith("Windows");
private Configuration conf;
private PluginRepository pluginRepository;
- public PluginManifestParser(Configuration conf,
- PluginRepository pluginRepository) {
+ public PluginManifestParser(Configuration conf, PluginRepository pluginRepository) {
this.conf = conf;
this.pluginRepository = pluginRepository;
}
@@ -83,18 +82,17 @@ public class PluginManifestParser {
if (directory == null) {
continue;
}
- LOG.info("Plugins: looking in: " + directory.getAbsolutePath());
+ LOG.info("Plugins: looking in: {}", directory.getAbsolutePath());
for (File oneSubFolder : directory.listFiles()) {
if (oneSubFolder.isDirectory()) {
String manifestPath = oneSubFolder.getAbsolutePath() + File.separator
- + "plugin.xml";
+ + "plugin.xml";
try {
- LOG.debug("parsing: " + manifestPath);
+ LOG.debug("Parsing: {}", manifestPath);
PluginDescriptor p = parseManifestFile(manifestPath);
map.put(p.getPluginId(), p);
} catch (Exception e) {
- LOG.warn("Error while loading plugin `" + manifestPath + "` "
- + e.toString());
+ LOG.warn("Error while loading plugin {}: {}", manifestPath, e.toString());
}
}
}
@@ -113,13 +111,13 @@ public class PluginManifestParser {
if (!directory.isAbsolute()) {
URL url = PluginManifestParser.class.getClassLoader().getResource(name);
if (url == null && directory.exists() && directory.isDirectory()
- && directory.listFiles().length > 0) {
+ && directory.listFiles().length > 0) {
return directory; // relative path that is not in the classpath
} else if (url == null) {
- LOG.warn("Plugins: directory not found: " + name);
+ LOG.warn("Plugins: directory not found: {}", name);
return null;
} else if (!"file".equals(url.getProtocol())) {
- LOG.warn("Plugins: not a file: url. Can't load plugins from: " + url);
+ LOG.warn("Plugins: not a file: url. Can't load plugins from: {}", url);
return null;
}
String path = url.getPath();
@@ -131,7 +129,7 @@ public class PluginManifestParser {
}
directory = new File(path);
} else if (!directory.exists()) {
- LOG.warn("Plugins: directory not found: " + name);
+ LOG.warn("Plugins: directory not found: {}", name);
return null;
}
return directory;
@@ -145,8 +143,8 @@ public class PluginManifestParser {
* @throws MalformedURLException
*/
private PluginDescriptor parseManifestFile(String pManifestPath)
- throws MalformedURLException, SAXException, IOException,
- ParserConfigurationException {
+ throws MalformedURLException, SAXException, IOException,
+ ParserConfigurationException {
Document document = parseXML(new File(pManifestPath).toURI().toURL());
String pPath = new File(pManifestPath).getParent();
return parsePlugin(document, pPath);
@@ -160,8 +158,8 @@ public class PluginManifestParser {
* @throws ParserConfigurationException
* @throws DocumentException
*/
- private Document parseXML(URL url) throws SAXException, IOException,
- ParserConfigurationException {
+ private Document parseXML(URL url)
+ throws SAXException, IOException, ParserConfigurationException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
return builder.parse(url.openStream());
@@ -172,7 +170,7 @@ public class PluginManifestParser {
* @throws MalformedURLException
*/
private PluginDescriptor parsePlugin(Document pDocument, String pPath)
- throws MalformedURLException {
+ throws MalformedURLException {
Element rootElement = pDocument.getDocumentElement();
String id = rootElement.getAttribute(ATTR_ID);
String name = rootElement.getAttribute(ATTR_NAME);
@@ -183,9 +181,9 @@ public class PluginManifestParser {
pluginClazz = rootElement.getAttribute(ATTR_CLASS);
}
PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name,
- providerName, pluginClazz, pPath, this.conf);
- LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version
- + " provider=" + providerName + "class=" + pluginClazz);
+ providerName, pluginClazz, pPath, this.conf);
+ LOG.debug("plugin: id={} name={} version={} provider={} class={}",
+ id, name, version, providerName, pluginClazz);
parseExtension(rootElement, pluginDescriptor);
parseExtensionPoints(rootElement, pluginDescriptor);
parseLibraries(rootElement, pluginDescriptor);
@@ -199,7 +197,7 @@ public class PluginManifestParser {
* @throws MalformedURLException
*/
private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor)
- throws MalformedURLException {
+ throws MalformedURLException {
NodeList nodelist = pRootElement.getElementsByTagName("requires");
if (nodelist.getLength() > 0) {
@@ -222,8 +220,8 @@ public class PluginManifestParser {
* @param pDescriptor
* @throws MalformedURLException
*/
- private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor)
- throws MalformedURLException {
+ private void parseLibraries(Element pRootElement,
+ PluginDescriptor pDescriptor) throws MalformedURLException {
NodeList nodelist = pRootElement.getElementsByTagName("runtime");
if (nodelist.getLength() > 0) {
@@ -248,7 +246,7 @@ public class PluginManifestParser {
* @param pluginDescriptor
*/
private void parseExtensionPoints(Element pRootElement,
- PluginDescriptor pPluginDescriptor) {
+ PluginDescriptor pPluginDescriptor) {
NodeList list = pRootElement.getElementsByTagName("extension-point");
if (list != null) {
for (int i = 0; i < list.getLength(); i++) {
@@ -267,7 +265,7 @@ public class PluginManifestParser {
* @param pluginDescriptor
*/
private void parseExtension(Element pRootElement,
- PluginDescriptor pPluginDescriptor) {
+ PluginDescriptor pPluginDescriptor) {
NodeList extensions = pRootElement.getElementsByTagName("extension");
if (extensions != null) {
for (int i = 0; i < extensions.getLength(); i++) {
@@ -286,14 +284,14 @@ public class PluginManifestParser {
String extensionClass = oneImplementation.getAttribute(ATTR_CLASS);
LOG.debug("impl: point=" + pointId + " class=" + extensionClass);
Extension extension = new Extension(pPluginDescriptor, pointId, id,
- extensionClass, this.conf, this.pluginRepository);
+ extensionClass, this.conf, this.pluginRepository);
NodeList parameters = oneImplementation
- .getElementsByTagName("parameter");
+ .getElementsByTagName("parameter");
if (parameters != null) {
for (int k = 0; k < parameters.getLength(); k++) {
Element param = (Element) parameters.item(k);
extension.addAttribute(param.getAttribute(ATTR_NAME),
- param.getAttribute("value"));
+ param.getAttribute("value"));
}
}
pPluginDescriptor.addExtension(extension);
diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java
index 44df3a2..726da45 100644
--- a/src/java/org/apache/nutch/plugin/PluginRepository.java
+++ b/src/java/org/apache/nutch/plugin/PluginRepository.java
@@ -21,30 +21,39 @@ import java.lang.reflect.Array;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
+import java.net.URLStreamHandler;
+import java.net.URLStreamHandlerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.WeakHashMap;
import java.util.List;
import java.util.Map;
+import java.util.WeakHashMap;
import java.util.regex.Pattern;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * The plugin repositority is a registry of all plugins.
+ * <p>The plugin repository is a registry of all plugins.</p>
*
- * At system boot up a repositority is built by parsing the mainifest files of
+ * <p>At system boot up a repository is built by parsing the manifest files of
* all plugins. Plugins that require other plugins which do not exist are not
* registed. For each plugin a plugin descriptor instance will be created. The
* descriptor represents all meta information about a plugin. So a plugin
* instance will be created later when it is required, this allow lazy plugin
- * loading.
+ * loading.</p>
+ *
+ * <p>As protocol-plugins need to be registered with the JVM as well, this class
+ * also acts as an {@link java.net.URLStreamHandlerFactory} that registers with
+ * the JVM and supports all the new protocols as if they were native. Details of
+ * how the JVM creates URLs can be seen in the API documentation for the
+ * <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URL.html#%3Cinit%3E(java.lang.String,java.lang.String,int,java.lang.String)">URL constructor</a>.</p>
*/
-public class PluginRepository {
+public class PluginRepository implements URLStreamHandlerFactory {
private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<>();
private boolean auto;
@@ -59,8 +68,7 @@ public class PluginRepository {
private Configuration conf;
- protected static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
+ protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* @param conf a populated {@link Configuration}
@@ -73,26 +81,29 @@ public class PluginRepository {
this.auto = conf.getBoolean("plugin.auto-activation", true);
String[] pluginFolders = conf.getStrings("plugin.folders");
PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
- this);
+ this);
Map<String, PluginDescriptor> allPlugins = manifestParser
- .parsePluginFolder(pluginFolders);
+ .parsePluginFolder(pluginFolders);
if (allPlugins.isEmpty()) {
LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"",
- conf.get("plugin.folders"));
+ conf.get("plugin.folders"));
}
Pattern excludes = Pattern.compile(conf.get("plugin.excludes", ""));
Pattern includes = Pattern.compile(conf.get("plugin.includes", ""));
Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes,
- allPlugins);
+ allPlugins);
fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins,
- this.auto ? allPlugins : filteredPlugins);
+ this.auto ? allPlugins : filteredPlugins);
installExtensionPoints(fRegisteredPlugins);
try {
installExtensions(fRegisteredPlugins);
} catch (PluginRuntimeException e) {
- LOG.error(e.toString());
+ LOG.error("Could not install extensions.", e.toString());
throw new RuntimeException(e.getMessage());
}
+
+ registerURLStreamHandlerFactory();
+
displayStatus();
}
@@ -122,7 +133,7 @@ public class PluginRepository {
for (PluginDescriptor plugin : plugins) {
for (ExtensionPoint point : plugin.getExtenstionPoints()) {
String xpId = point.getId();
- LOG.debug("Adding extension point " + xpId);
+ LOG.debug("Adding extension point {}", xpId);
fExtensionPoints.put(xpId, point);
}
}
@@ -132,16 +143,15 @@ public class PluginRepository {
* @param pRegisteredPlugins
*/
private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
- throws PluginRuntimeException {
+ throws PluginRuntimeException {
for (PluginDescriptor descriptor : pRegisteredPlugins) {
for (Extension extension : descriptor.getExtensions()) {
String xpId = extension.getTargetPoint();
ExtensionPoint point = getExtensionPoint(xpId);
if (point == null) {
- throw new PluginRuntimeException("Plugin ("
- + descriptor.getPluginId() + "), " + "extension point: " + xpId
- + " does not exist.");
+ throw new PluginRuntimeException("Plugin (" + descriptor.getPluginId()
+ + "), " + "extension point: " + xpId + " does not exist.");
}
point.addExtension(extension);
}
@@ -149,10 +159,10 @@ public class PluginRepository {
}
private void getPluginCheckedDependencies(PluginDescriptor plugin,
- Map<String, PluginDescriptor> plugins,
- Map<String, PluginDescriptor> dependencies,
- Map<String, PluginDescriptor> branch) throws MissingDependencyException,
- CircularDependencyException {
+ Map<String, PluginDescriptor> plugins,
+ Map<String, PluginDescriptor> dependencies,
+ Map<String, PluginDescriptor> branch)
+ throws MissingDependencyException, CircularDependencyException {
if (dependencies == null) {
dependencies = new HashMap<>();
@@ -166,24 +176,24 @@ public class PluginRepository {
for (String id : plugin.getDependencies()) {
PluginDescriptor dependency = plugins.get(id);
if (dependency == null) {
- throw new MissingDependencyException("Missing dependency " + id
- + " for plugin " + plugin.getPluginId());
+ throw new MissingDependencyException(
+ "Missing dependency " + id + " for plugin " + plugin.getPluginId());
}
if (branch.containsKey(id)) {
throw new CircularDependencyException("Circular dependency detected "
- + id + " for plugin " + plugin.getPluginId());
+ + id + " for plugin " + plugin.getPluginId());
}
dependencies.put(id, dependency);
getPluginCheckedDependencies(plugins.get(id), plugins, dependencies,
- branch);
+ branch);
}
branch.remove(plugin.getPluginId());
}
private Map<String, PluginDescriptor> getPluginCheckedDependencies(
- PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
- throws MissingDependencyException, CircularDependencyException {
+ PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
+ throws MissingDependencyException, CircularDependencyException {
Map<String, PluginDescriptor> dependencies = new HashMap<>();
Map<String, PluginDescriptor> branch = new HashMap<>();
getPluginCheckedDependencies(plugin, plugins, dependencies, branch);
@@ -198,7 +208,8 @@ public class PluginRepository {
* @return List
*/
private List<PluginDescriptor> getDependencyCheckedPlugins(
- Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> all) {
+ Map<String, PluginDescriptor> filtered,
+ Map<String, PluginDescriptor> all) {
if (filtered == null) {
return null;
}
@@ -209,7 +220,7 @@ public class PluginRepository {
checked.putAll(getPluginCheckedDependencies(plugin, all));
checked.put(plugin.getPluginId(), plugin);
} catch (MissingDependencyException mde) {
- // Logger exception and ignore plugin
+ // Log exception and ignore plugin
LOG.warn(mde.getMessage());
} catch (CircularDependencyException cde) {
// Simply ignore this plugin
@@ -225,8 +236,8 @@ public class PluginRepository {
* @return PluginDescriptor[]
*/
public PluginDescriptor[] getPluginDescriptors() {
- return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins
- .size()]);
+ return fRegisteredPlugins
+ .toArray(new PluginDescriptor[fRegisteredPlugins.size()]);
}
/**
@@ -255,14 +266,14 @@ public class PluginRepository {
}
/**
- * Returns a instance of a plugin. Plugin instances are cached. So a plugin
+ * <p>Returns an instance of a plugin. Plugin instances are cached. So a plugin
* exist only as one instance. This allow a central management of plugin own
- * resources.
+ * resources.</p>
*
- * After creating the plugin instance the startUp() method is invoked. The
+ * <p>After creating the plugin instance the startUp() method is invoked. The
* plugin use a own classloader that is used as well by all instance of
* extensions of the same plugin. This class loader use all exported libraries
- * from the dependend plugins and all plugin libraries.
+ * from the dependent plugins and all plugin libraries.</p>
*
* @param pDescriptor a {@link PluginDescriptor} for which to retrieve a
* {@link Plugin} instance
@@ -270,7 +281,7 @@ public class PluginRepository {
* @throws PluginRuntimeException if there is a fatal runtime plugin error
*/
public Plugin getPluginInstance(PluginDescriptor pDescriptor)
- throws PluginRuntimeException {
+ throws PluginRuntimeException {
if (fActivatedPlugins.containsKey(pDescriptor.getPluginId()))
return fActivatedPlugins.get(pDescriptor.getPluginId());
try {
@@ -280,11 +291,11 @@ public class PluginRepository {
// Suggested by Stefan Groschupf <sg...@media-style.com>
synchronized (pDescriptor) {
Class<?> pluginClass = getCachedClass(pDescriptor,
- pDescriptor.getPluginClass());
- Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] {
- PluginDescriptor.class, Configuration.class });
- Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
- pDescriptor, this.conf });
+ pDescriptor.getPluginClass());
+ Constructor<?> constructor = pluginClass.getConstructor(
+ new Class<?>[] { PluginDescriptor.class, Configuration.class });
+ Plugin plugin = (Plugin) constructor
+ .newInstance(new Object[] { pDescriptor, this.conf });
plugin.startUp();
fActivatedPlugins.put(pDescriptor.getPluginId(), plugin);
return plugin;
@@ -302,11 +313,13 @@ public class PluginRepository {
}
}
- /*
- * (non-Javadoc)
- *
+ /**
+ * Attempts to shut down all activated plugins.
+ * @deprecated
+ * @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a>
* @see java.lang.Object#finalize()
*/
+ @Deprecated
public void finalize() throws Throwable {
shutDownActivatedPlugins();
}
@@ -323,7 +336,7 @@ public class PluginRepository {
}
public Class getCachedClass(PluginDescriptor pDescriptor, String className)
- throws ClassNotFoundException {
+ throws ClassNotFoundException {
Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
if (descMap == null) {
descMap = new HashMap<>();
@@ -339,14 +352,14 @@ public class PluginRepository {
}
private void displayStatus() {
- LOG.info("Plugin Auto-activation mode: [" + this.auto + "]");
+ LOG.info("Plugin Auto-activation mode: [{}]", this.auto);
LOG.info("Registered Plugins:");
if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) {
LOG.info("\tNONE");
} else {
for (PluginDescriptor plugin : fRegisteredPlugins) {
- LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")");
+ LOG.info("\t{} ({})", plugin.getName(), plugin.getPluginId());
}
}
@@ -355,7 +368,7 @@ public class PluginRepository {
LOG.info("\tNONE");
} else {
for (ExtensionPoint ep : fExtensionPoints.values()) {
- LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")");
+ LOG.info("\t ({})", ep.getName(), ep.getId());
}
}
}
@@ -372,7 +385,7 @@ public class PluginRepository {
* @return map of plugins matching the configuration
*/
private Map<String, PluginDescriptor> filter(Pattern excludes,
- Pattern includes, Map<String, PluginDescriptor> plugins) {
+ Pattern includes, Map<String, PluginDescriptor> plugins) {
Map<String, PluginDescriptor> map = new HashMap<>();
@@ -391,11 +404,11 @@ public class PluginRepository {
}
if (!includes.matcher(id).matches()) {
- LOG.debug("not including: " + id);
+ LOG.debug("not including: {}", id);
continue;
}
if (excludes.matcher(id).matches()) {
- LOG.debug("excluding: " + id);
+ LOG.debug("excluding: {}", id);
continue;
}
map.put(plugin.getPluginId(), plugin);
@@ -419,7 +432,7 @@ public class PluginRepository {
* @return array of plugin instances
*/
public synchronized Object[] getOrderedPlugins(Class<?> clazz,
- String xPointId, String orderProperty) {
+ String xPointId, String orderProperty) {
Object[] filters;
ObjectCache objectCache = ObjectCache.get(conf);
filters = (Object[]) objectCache.getObject(clazz.getName());
@@ -434,8 +447,8 @@ public class PluginRepository {
}
try {
- ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
- xPointId);
+ ExtensionPoint point = PluginRepository.get(conf)
+ .getExtensionPoint(xPointId);
if (point == null)
throw new RuntimeException(xPointId + " not found.");
Extension[] extensions = point.getExtensions();
@@ -453,9 +466,9 @@ public class PluginRepository {
for (String orderedFilter : orderOfFilters) {
Object f = filterMap.get(orderedFilter);
if (f == null) {
- LOG.error(clazz.getSimpleName() + " : " + orderedFilter
- + " declared in configuration property " + orderProperty
- + " but not found in an active plugin - ignoring.");
+ LOG.error("{} : {} declared in configuration property {} "
+ + "but not found in an active plugin - ignoring.",
+ clazz.getSimpleName(), orderedFilter, orderProperty);
continue;
}
sorted.add(f);
@@ -464,8 +477,8 @@ public class PluginRepository {
for (int i = 0; i < sorted.size(); i++) {
filter[i] = sorted.get(i);
if (LOG.isTraceEnabled()) {
- LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
- + filter[i].getClass());
+ LOG.trace("{} : filters[{}] = {}", clazz.getSimpleName() , i,
+ filter[i].getClass());
}
}
objectCache.setObject(clazz.getName(), filter);
@@ -490,8 +503,8 @@ public class PluginRepository {
*/
public static void main(String[] args) throws Exception {
if (args.length < 2) {
- System.err
- .println("Usage: PluginRepository pluginId className [arg1 arg2 ...]");
+ System.err.println(
+ "Usage: PluginRepository pluginId className [arg1 arg2 ...]");
return;
}
Configuration conf = NutchConfiguration.create();
@@ -508,8 +521,8 @@ public class PluginRepository {
try {
clazz = Class.forName(args[1], true, cl);
} catch (Exception e) {
- System.err.println("Could not load the class '" + args[1] + ": "
- + e.getMessage());
+ System.err.println(
+ "Could not load the class '" + args[1] + ": " + e.getMessage());
return;
}
Method m = null;
@@ -517,11 +530,108 @@ public class PluginRepository {
m = clazz.getMethod("main", new Class<?>[] { args.getClass() });
} catch (Exception e) {
System.err.println("Could not find the 'main(String[])' method in class "
- + args[1] + ": " + e.getMessage());
+ + args[1] + ": " + e.getMessage());
return;
}
String[] subargs = new String[args.length - 2];
System.arraycopy(args, 2, subargs, 0, subargs.length);
m.invoke(null, new Object[] { subargs });
}
+
+  /**
+   * Registers this PluginRepository with the singleton
+   * {@link org.apache.nutch.plugin.URLStreamHandlerFactory}, so that the
+   * factory can ask this repository's protocol plugins for a
+   * {@link java.net.URLStreamHandler} when a URL with an uncommon protocol is
+   * created.
+   */
+  private void registerURLStreamHandlerFactory() {
+    // fully qualified class name: presumably to avoid confusion with
+    // java.net.URLStreamHandlerFactory, which has the same simple name
+    org.apache.nutch.plugin.URLStreamHandlerFactory factory =
+        org.apache.nutch.plugin.URLStreamHandlerFactory.getInstance();
+    factory.registerPluginRepository(this);
+  }
+
+  /**
+   * <p>Invoked whenever a {@link java.net.URL} needs to be instantiated. Tries to find a
+   * suitable extension and allows it to provide a {@link java.net.URLStreamHandler}.</p>
+   * This is done in several attempts:
+   * <ul>
+   * <li>Find a protocol plugin that implements the desired protocol. If found,
+   * instantiate it so eventually the plugin can install a {@link java.net.URLStreamHandler}
+   * through a static hook.</li>
+   * <li>If the plugin specifies a {@link java.net.URLStreamHandler} in its
+   * <code>plugin.xml</code> manifest, return an instance of this
+   * {@link java.net.URLStreamHandler}. Example:
+   *
+   * <pre>{@code
+   *  ...
+   *  <implementation id="org.apache.nutch.protocol.foo.Foo" class="org.apache.nutch.protocol.foo.Foo">
+   *    <parameter name="protocolName" value="foo"/>
+   *    <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/>
+   *  </implementation>
+   *  ...
+   * }</pre>
+   * </li>
+   * <li>If all else fails, return null. This will fall back to the JVM's method
+   * of evaluating the system property <code>java.protocol.handler.pkgs</code>.</li>
+   * </ul>
+   * Only the first extension whose <code>protocolName</code> equals the
+   * requested protocol is considered. Extensions without a
+   * <code>protocolName</code> attribute are skipped.
+   *
+   * @param protocol the protocol (URL scheme) a handler is requested for
+   * @return the URLStreamHandler found, or null.
+   * @see java.net.URL
+   * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2429">NUTCH-2429</a>
+   */
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("Creating URLStreamHandler for protocol: {}", protocol);
+
+    if (fExtensionPoints == null) {
+      return null;
+    }
+
+    ExtensionPoint ep = fExtensionPoints
+        .get("org.apache.nutch.protocol.Protocol");
+    if (ep == null) {
+      LOG.debug("No protocol extensions registered?");
+      return null;
+    }
+
+    for (Extension extension : ep.getExtensions()) {
+      String p = extension.getAttribute("protocolName");
+      // getAttribute() may return null (see the null check on
+      // "urlStreamHandler" below), so guard against extensions that do not
+      // declare a protocolName instead of failing with a NullPointerException
+      if (p == null || !p.equals(protocol)) {
+        continue;
+      }
+      LOG.debug("Suitable protocolName attribute located: {}", p);
+
+      // instantiate the plugin. This allows it to execute a static hook,
+      // if present. Extensions and PluginInstances are cached already, so we
+      // should not create too many instances
+      Object extinst = null;
+      try {
+        extinst = extension.getExtensionInstance();
+        LOG.debug("Located extension instance class: {}", extinst.getClass().getName());
+      } catch (Exception e) {
+        LOG.warn("Could not find {}", extension.getId(), e);
+      }
+
+      // return the handler here, if possible
+      String handlerClass = extension.getAttribute("urlStreamHandler");
+      LOG.debug("Located URLStreamHandler: {}", handlerClass);
+      if (handlerClass == null) {
+        LOG.debug("suitable protocol extension found that did not declare a handler");
+        return null;
+      }
+
+      // prefer the extension's classloader, which can see the plugin's
+      // classes; fall back to the nutch classloader if the extension could
+      // not be instantiated
+      ClassLoader cl = extinst != null
+          ? extinst.getClass().getClassLoader()
+          : this.getClass().getClassLoader();
+
+      try {
+        // instantiate the handler and return it
+        Class<?> clazz = cl.loadClass(handlerClass);
+        return (URLStreamHandler) clazz.getDeclaredConstructor().newInstance();
+      } catch (Exception e) {
+        LOG.error("Could not instantiate protocol {} handler class {} defined by extension {}",
+            protocol, handlerClass, extension.getId(), e);
+        return null;
+      }
+    }
+
+    LOG.debug("No suitable protocol extensions registered");
+    return null;
+  }
}
diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
new file mode 100644
index 0000000..a64454c
--- /dev/null
+++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.lang.ref.WeakReference;
+import java.net.URL;
+import java.net.URLStreamHandler;
+import java.util.ArrayList;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This URLStreamHandlerFactory knows about all the plugins
+ * in use and thus can create the correct URLStreamHandler
+ * even if it comes from a plugin classpath.
+ * As the JVM allows only one instance of URLStreamHandlerFactory
+ * to be registered, this class implements a singleton pattern. The
+ * singleton registers itself with {@link URL} when the class is initialized.
+ * @author Hiran Chaudhuri
+ *
+ */
+public class URLStreamHandlerFactory
+    implements java.net.URLStreamHandlerFactory {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(URLStreamHandlerFactory.class);
+
+  /** The singleton instance. */
+  private static final URLStreamHandlerFactory instance = new URLStreamHandlerFactory();
+
+  /** Here we register all PluginRepositories.
+   * In this class we do not know why several instances of PluginRepository
+   * are kept, nor do we know how long they will be used. To prevent
+   * a memory leak, this class must not keep references to PluginRepository
+   * but use WeakReference which allows PluginRepository to still be
+   * garbage collected. The price is we need to clean the list for
+   * outdated references which is done in the {@link #removeInvalidRefs()} method.
+   * <p>
+   * NOTE(review): the list is not synchronized. If repositories can be
+   * registered while another thread creates URLs, access needs guarding -
+   * confirm against the callers.
+   */
+  private final ArrayList<WeakReference<PluginRepository>> prs;
+
+  static {
+    try {
+      URL.setURLStreamHandlerFactory(instance);
+      LOG.debug("Registered URLStreamHandlerFactory with the JVM.");
+    } catch (Error e) {
+      // URL.setURLStreamHandlerFactory() throws an Error if a factory has
+      // already been set for this JVM. Letting it escape would fail the
+      // initialization of this class and thereby of every PluginRepository.
+      LOG.warn("Could not register URLStreamHandlerFactory with the JVM, "
+          + "protocol plugins will not be able to provide URLStreamHandlers",
+          e);
+    }
+  }
+
+  private URLStreamHandlerFactory() {
+    prs = new ArrayList<>();
+  }
+
+  /**
+   * Get the singleton instance of this class.
+   * @return a {@link org.apache.nutch.plugin.URLStreamHandlerFactory} instance
+   */
+  public static URLStreamHandlerFactory getInstance() {
+    return instance;
+  }
+
+  /** Use this method once a new PluginRepository was created to register it.
+   *
+   * @param pr The PluginRepository to be registered.
+   */
+  public void registerPluginRepository(PluginRepository pr) {
+    prs.add(new WeakReference<PluginRepository>(pr));
+
+    removeInvalidRefs();
+  }
+
+  /**
+   * Asks the first PluginRepository that is still alive to create the
+   * URLStreamHandler for the given protocol.
+   *
+   * @param protocol the protocol (URL scheme) a handler is requested for
+   * @return the handler provided by the PluginRepository, or null if no
+   *         PluginRepository is registered or it does not provide a handler
+   */
+  @Override
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("Creating URLStreamHandler for protocol: {}", protocol);
+
+    removeInvalidRefs();
+
+    // find the 'correct' PluginRepository. For now we simply take the first.
+    // then ask it to return the URLStreamHandler
+    for (WeakReference<PluginRepository> ref : prs) {
+      PluginRepository pr = ref.get();
+      if (pr != null) {
+        // found PluginRepository. Let's get the URLStreamHandler...
+        return pr.createURLStreamHandler(protocol);
+      }
+    }
+    return null;
+  }
+
+  /** Maintains the list of PluginRepositories by
+   * removing the references whose referents have been
+   * garbage collected meanwhile.
+   */
+  private void removeInvalidRefs() {
+    LOG.debug("removeInvalidRefs()");
+    int before = prs.size();
+    prs.removeIf(ref -> ref.get() == null);
+    LOG.debug("Removed the following invalid references: '{}' Remaining: '{}'",
+        before - prs.size(), prs.size());
+  }
+}
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 8a23fbf..621484c 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -16,40 +16,37 @@
*/
package org.apache.nutch.util;
-import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.MissingOptionException;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.TimingUtil;
-import org.apache.nutch.util.URLUtil;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.MissingOptionException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Extracts some simple crawl completion stats from the crawldb
@@ -204,7 +201,14 @@ public class CrawlCompletionStats extends Configured implements Tool {
public void map(Text urlText, CrawlDatum datum, Context context)
throws IOException, InterruptedException {
- URL url = new URL(urlText.toString());
+ URL url;
+ try {
+ url = new URL(urlText.toString());
+ } catch (MalformedURLException e) {
+ LOG.error("Failed to get host or domain from URL {}: {}",
+ urlText, e.getMessage());
+ return;
+ }
String out = "";
switch (mode) {
case MODE_HOST:
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 04b38df..3e852eb 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -19,12 +19,13 @@ package org.apache.nutch.util;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.nutch.plugin.PluginRepository;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/** A {@link Job} for Nutch jobs. */
public class NutchJob extends Job {
@@ -35,6 +36,11 @@ public class NutchJob extends Job {
@SuppressWarnings("deprecation")
public NutchJob(Configuration conf, String jobName) throws IOException {
super(conf, jobName);
+ if (conf != null) {
+ // initialize plugins early to register URL stream handlers to support
+ // custom protocol implementations
+ PluginRepository.get(conf);
+ }
}
public static Job getInstance(Configuration conf) throws IOException {
diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java
index 244ae99..a6a1cee 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.plugin.PluginRepository;
public abstract class NutchTool extends Configured {
@@ -53,6 +54,14 @@ public abstract class NutchTool extends Configured {
public NutchTool(){
super(null);
}
+
+  /**
+   * Stores the configuration and, if it is not null, obtains the
+   * {@link PluginRepository} for it. Presumably this initializes the plugins
+   * early so that URL stream handlers of custom protocol plugins are
+   * registered before URLs are created (NutchJob does the same) - confirm.
+   *
+   * @param conf the configuration to use, may be null
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) {
+      // nothing to initialize the plugin system with
+      return;
+    }
+    PluginRepository.get(conf);
+  }
/**
* Get relative progress of the tool. Progress is represented as a
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index b191f23..1a1955e 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -41,7 +41,6 @@ import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.net.URLFilters;
@@ -51,7 +50,6 @@ import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -82,13 +80,11 @@ import crawlercommons.sitemaps.SiteMapURL;
* the sitemaps into the CrawlDb.</li>
* </ol>
*
- * <p>
- * For more details see:
- * https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature
- * </p>
+ * @see
+ * <a href="https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature">SitemapFeature</a>
*/
public class SitemapProcessor extends Configured implements Tool {
- public static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
+ private static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
public static final String CURRENT_NAME = "current";
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 24e7a1c..0d789ed 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -16,31 +16,32 @@
*/
package org.apache.nutch.util.domain;
-import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Extracts some very basic statistics about domains from the crawldb
@@ -175,7 +176,14 @@ public class DomainStatistics extends Configured implements Tool {
|| datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
try {
- URL url = new URL(urlText.toString());
+ URL url;
+ try {
+ url = new URL(urlText.toString());
+ } catch (MalformedURLException e) {
+ LOG.error("Failed to get host or domain from URL {}: {}",
+ urlText, e.getMessage());
+ return;
+ }
String out = null;
switch (mode) {
case MODE_HOST:
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index e56aaa6..c0f1d6f 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory;
public class Any23IndexingFilter implements IndexingFilter {
/** Logging instance */
- public static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class);
+ private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class);
public static final String STRUCTURED_DATA = "structured_data";
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index d9f0896..af7f135 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -58,7 +58,7 @@ import org.w3c.dom.DocumentFragment;
public class Any23ParseFilter implements HtmlParseFilter {
/** Logging instance */
- public static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);
+ private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);
private Configuration conf = null;
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 95d7a16..7378096 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -71,6 +71,7 @@
<ant dir="parsefilter-naivebayes" target="deploy"/>
<ant dir="parsefilter-regex" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
+ <ant dir="protocol-foo" target="deploy" />
<ant dir="protocol-ftp" target="deploy"/>
<ant dir="protocol-htmlunit" target="deploy" />
<ant dir="protocol-http" target="deploy"/>
@@ -219,6 +220,7 @@
<ant dir="parsefilter-naivebayes" target="clean" />
<ant dir="parsefilter-regex" target="clean"/>
<ant dir="protocol-file" target="clean"/>
+ <ant dir="protocol-foo" target="clean" />
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-htmlunit" target="clean" />
<ant dir="protocol-http" target="clean"/>
diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 6989feb..58e8993 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -52,7 +52,7 @@ import org.slf4j.LoggerFactory;
*/
public class CSVIndexWriter implements IndexWriter {
- public static final Logger LOG = LoggerFactory
+ private static final Logger LOG = LoggerFactory
.getLogger(CSVIndexWriter.class);
private Configuration config;
diff --git a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
index f7a47e2..37acf12 100644
--- a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
+++ b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
@@ -40,7 +40,7 @@ import java.util.regex.Pattern;
public class RabbitIndexWriter implements IndexWriter {
- public static final Logger LOG = LoggerFactory
+ private static final Logger LOG = LoggerFactory
.getLogger(RabbitIndexWriter.class);
private String uri;
diff --git a/src/plugin/protocol-foo/build.xml b/src/plugin/protocol-foo/build.xml
new file mode 100755
index 0000000..240f448
--- /dev/null
+++ b/src/plugin/protocol-foo/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-foo" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/protocol-foo/ivy.xml b/src/plugin/protocol-foo/ivy.xml
new file mode 100755
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/protocol-foo/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/protocol-foo/plugin.xml b/src/plugin/protocol-foo/plugin.xml
new file mode 100755
index 0000000..850afe3
--- /dev/null
+++ b/src/plugin/protocol-foo/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+ <!-- Example protocol plugin for the made-up "foo" URL scheme. -->
+ <plugin
+ id="protocol-foo"
+ name="Foo Protocol Example Plug-in"
+ version="1.0.0"
+ provider-name="Hiran Chaudhuri">
+
+ <runtime>
+ <library name="protocol-foo.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <!-- Registers the implementation at the Protocol extension point, which is
+ the extension point PluginRepository.createURLStreamHandler() searches. -->
+ <extension id="org.apache.nutch.protocol.foo"
+ name="FooProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.foo.Foo"
+ class="org.apache.nutch.protocol.foo.Foo">
+ <!-- protocolName is compared with the protocol of the URL being created. -->
+ <parameter name="protocolName" value="foo"/>
+ <!-- urlStreamHandler names the java.net.URLStreamHandler class that
+ PluginRepository loads and instantiates for this protocol. -->
+ <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/>
+ </implementation>
+
+ </extension>
+
+ </plugin>
diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
new file mode 100755
index 0000000..0f56f23
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * Dummy {@link Protocol} implementation for the made-up <code>foo</code> URL
+ * scheme. It does not access any remote resource but serves a small,
+ * hard-coded tree of documents below <code>foo://example.com</code>.
+ */
+public class Foo implements Protocol {
+  protected static final Logger LOG = LoggerFactory.getLogger(Foo.class);
+
+  /** Content served for every text file of the dummy tree. */
+  private static final String TEXT_FILE_CONTENT = "In publishing and graphic design, lorem ipsum is a filler text or greeking commonly used to demonstrate the textual elements of a graphic document or visual presentation. Replacing meaningful content with placeholder text allows designers to design the form of the content before the content itself has been produced.";
+
+  private Configuration conf;
+
+  @Override
+  public Configuration getConf() {
+    LOG.debug("getConf()");
+    return this.conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * This is a dummy implementation only. So what we will do is return this
+   * structure:
+   *
+   * <pre>
+   * foo://example.com - will contain one directory and one file
+   * foo://example.com/a/ - directory, will contain two files
+   * foo://example.com/a/aa.txt - text file
+   * foo://example.com/a/ab.txt - text file
+   * foo://example.com/a.txt - text file
+   * </pre>
+   *
+   * Any other URL yields empty content with status
+   * {@link ProtocolStatus#STATUS_GONE}.
+   *
+   * @param url the URL to fetch
+   * @param datum the crawl datum of the URL (only logged)
+   * @return the content and status for the URL. If the URL is malformed, the
+   *         content is null and the status is
+   *         {@link ProtocolStatus#STATUS_GONE}.
+   */
+  @Override
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    LOG.debug("getProtocolOutput({}, {})", url, datum);
+
+    try {
+      String urlstr = String.valueOf(url);
+      URL u = new URL(urlstr);
+      URL base = new URL(u, ".");
+      // all default values are set for URLs that do not exist
+      byte[] bytes = new byte[0];
+      String contentType = "foo/something";
+      ProtocolStatus status = ProtocolStatus.STATUS_GONE;
+
+      switch (urlstr) {
+      case "foo://example.com":
+      case "foo://example.com/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        bytes = indexPage("/",
+            "<a href='a/'>a/</a>\t" + time + "\t-\n", // add directory
+            "<a href='a.txt'>a.txt</a>\t" + time + "\t" + 0 + "\n"); // add file
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        bytes = indexPage("/a/",
+            "<a href='aa.txt'>aa.txt</a>\t" + time + "\t" + 0 + "\n", // add file
+            "<a href='ab.txt'>ab.txt</a>\t" + time + "\t" + 0 + "\n"); // add file
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a.txt":
+      case "foo://example.com/a/aa.txt":
+      case "foo://example.com/a/ab.txt": {
+        contentType = "text/plain";
+        bytes = TEXT_FILE_CONTENT
+            .getBytes(java.nio.charset.StandardCharsets.UTF_8);
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      default:
+        LOG.warn("Unknown url '{}'. This dummy implementation only supports 'foo://example.com'", url);
+        break;
+      }
+
+      Metadata metadata = new Metadata();
+      Content content = new Content(String.valueOf(url), String.valueOf(base),
+          bytes, contentType, metadata, getConf());
+
+      return new ProtocolOutput(content, status);
+    } catch (MalformedURLException mue) {
+      // one log event carrying both the URL and the stack trace
+      LOG.error("Could not retrieve {}", url, mue);
+      // claim STATUS_GONE to tell nutch to never ever re-request this URL
+      return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE);
+    }
+  }
+
+  /**
+   * Renders a minimal HTML directory listing.
+   *
+   * @param dir the directory path shown in the title and heading
+   * @param entries the preformatted listing lines, each ending with a newline
+   * @return the UTF-8 encoded HTML page
+   */
+  private static byte[] indexPage(String dir, String... entries) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("<html><head>");
+    sb.append("<title>Index of ").append(dir).append("</title></head>\n");
+    sb.append("<body><h1>Index of ").append(dir).append("</h1><pre>\n");
+    for (String entry : entries) {
+      sb.append(entry);
+    }
+    // close the elements in the reverse order they were opened
+    sb.append("</pre></body></html>");
+    // explicit charset: do not depend on the platform default encoding
+    return sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+  }
+
+  /**
+   * The dummy protocol has no robots.txt, everything is allowed.
+   *
+   * @return always {@link RobotRulesParser#EMPTY_RULES}
+   */
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    LOG.debug("getRobotRules({}, {}, {})", url, datum, robotsTxtContent);
+    return RobotRulesParser.EMPTY_RULES;
+  }
+}
diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
new file mode 100644
index 0000000..27f1837
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * {@link URLStreamHandler} for the <code>foo</code> protocol, referenced by the
+ * <code>urlStreamHandler</code> parameter in the plugin's
+ * <code>plugin.xml</code>. Opening a connection is not supported.
+ */
+public class Handler extends URLStreamHandler {
+
+  /**
+   * Not supported by this handler.
+   *
+   * @param u the URL to connect to (ignored)
+   * @return never returns normally
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  protected URLConnection openConnection(URL u) {
+    throw new UnsupportedOperationException("not yet implemented");
+  }
+}