You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:25 UTC
[09/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/plugin.xml b/nutch-plugins/parsefilter-regex/plugin.xml
new file mode 100644
index 0000000..0725492
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parsefilter-regex"
+ name="Regex Parse Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parsefilter-regex.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.htmlparsefilter.regex"
+ name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="RegexParseFilter"
+ class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
+ <parameter name="file" value="regex-parsefilter.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/pom.xml b/nutch-plugins/parsefilter-regex/pom.xml
new file mode 100644
index 0000000..19b6452
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>parsefilter-regex</artifactId>
+ <packaging>jar</packaging>
+
+ <name>parsefilter-regex</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
new file mode 100644
index 0000000..0752c91
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -0,0 +1,262 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.regex;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.FileReader;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.w3c.dom.*;
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or
+ * extracted text, a configurable field is set to true.
+ *
+ * <p>Each rule is one line of the form
+ * <code>&lt;name&gt;\t&lt;source&gt;\t&lt;regex&gt;</code>, where source is
+ * <code>html</code> or <code>text</code>. Rules come from the first of these
+ * that is defined: the <code>parsefilter.regex.rules</code> property, the file
+ * passed to the constructor, the <code>file</code> attribute of the plugin
+ * descriptor, the <code>parsefilter.regex.file</code> property. Loaded rules
+ * are kept in a static map and shared by all instances.</p>
+ */
+public class RegexParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class);
+
+  /** Id of this plugin as declared in its plugin.xml. */
+  private static final String PLUGIN_NAME = "parsefilter-regex";
+
+  /** Rules keyed by the parse metadata field they set; shared by all instances. */
+  private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>();
+
+  /** Rule file given to the constructor, or null if none was given. */
+  private String regexFile = null;
+
+  private Configuration conf;
+
+  public RegexParseFilter() {}
+
+  /**
+   * @param regexFile rule file that overrides the one named by the plugin
+   *          descriptor or the configuration
+   */
+  public RegexParseFilter(String regexFile) {
+    this.regexFile = regexFile;
+  }
+
+  /**
+   * Evaluates every rule against the raw content or the extracted text and
+   * records the outcome as "true" or "false" in the parse metadata under the
+   * rule's field name.
+   *
+   * @return the given parse result, untouched if it holds no parse for the
+   *         URL of the content
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+    if (parse == null) {
+      // nothing was parsed for this URL, so there is nothing to annotate
+      return parseResult;
+    }
+
+    // Decode with a fixed charset so the outcome does not depend on the
+    // default charset of the JVM that happens to run the job.
+    String html = new String(content.getContent(), StandardCharsets.UTF_8);
+    String text = parse.getText();
+
+    for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
+      String field = entry.getKey();
+      RegexRule regexRule = entry.getValue();
+
+      String source = null;
+      if (regexRule.source.equalsIgnoreCase("html")) {
+        source = html;
+      } else if (regexRule.source.equalsIgnoreCase("text")) {
+        source = text;
+      }
+
+      if (source == null) {
+        LOG.error("source for regex rule: " + field + " misconfigured");
+      }
+
+      // matches() reports false for a null source, so a misconfigured rule
+      // still gets an explicit "false" value
+      String outcome = matches(source, regexRule.regex) ? "true" : "false";
+      parse.getData().getParseMeta().set(field, outcome);
+    }
+
+    return parseResult;
+  }
+
+  /**
+   * Stores the configuration and loads the rules, unless rules were already
+   * loaded by an earlier call.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String attributeFile = findFileAttribute(conf);
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin " + PLUGIN_NAME
+          + " as " + attributeFile);
+    } else {
+      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + PLUGIN_NAME);
+    }
+
+    // constructor file and attribute "file" take precedence if defined
+    String file = conf.get("parsefilter.regex.file");
+    if (regexFile != null) {
+      file = regexFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    String stringRules = conf.get("parsefilter.regex.rules");
+    if (stringRules == null && file == null) {
+      // there is nothing to open; a null file name must not reach the readers
+      LOG.warn("No rules or rule file configured for plugin " + PLUGIN_NAME);
+      return;
+    }
+
+    try (Reader reader = openRules(conf, stringRules, file)) {
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error("Unable to read rules for plugin " + PLUGIN_NAME, e);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Returns the non-blank "file" attribute of this plugin's HtmlParseFilter
+   * extension, or null if the extension or the attribute is missing.
+   */
+  private static String findFileAttribute(Configuration conf) {
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+        HtmlParseFilter.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(PLUGIN_NAME)) {
+        String attribute = extension.getAttribute("file");
+        return StringUtils.isBlank(attribute) ? null : attribute;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Opens the rule source. Inline rules take precedence over files; a file is
+   * looked up as a configuration resource first and opened as a plain path
+   * only if that lookup yields nothing. The caller closes the reader.
+   */
+  private static Reader openRules(Configuration conf, String stringRules, String file) throws IOException {
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    Reader reader = conf.getConfResourceAsReader(file);
+    if (reader == null) {
+      reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
+    }
+    return reader;
+  }
+
+  private boolean matches(String value, Pattern pattern) {
+    if (value == null) {
+      return false;
+    }
+    Matcher matcher = pattern.matcher(value);
+    return matcher.find();
+  }
+
+  /**
+   * Parses tab separated rules into the shared rule map; does nothing if the
+   * map already holds rules. The method locks on the class because the map is
+   * static: a per-instance lock would not keep other instances out.
+   */
+  private static synchronized void readConfiguration(Reader configReader) throws IOException {
+    if (!rules.isEmpty()) {
+      return;
+    }
+
+    BufferedReader reader = new BufferedReader(configReader);
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isBlank(line)) {
+        continue;
+      }
+      // trim before testing for '#' so that indented comments are skipped too
+      line = line.trim();
+      if (line.startsWith("#")) {
+        continue;
+      }
+
+      String[] parts = line.split("\t");
+      if (parts.length < 3) {
+        // report and skip instead of failing on parts[1] or parts[2]
+        LOG.warn("Skipping malformed regex rule (expected <name>\\t<source>\\t<regex>): " + line);
+        continue;
+      }
+
+      String field = parts[0].trim();
+      String source = parts[1].trim();
+      String regex = parts[2].trim();
+
+      rules.put(field, new RegexRule(source, regex));
+    }
+  }
+
+  /** A compiled pattern together with the input ("html" or "text") it is applied to. */
+  private static class RegexRule {
+    final String source;
+    final Pattern regex;
+
+    RegexRule(String source, String regex) {
+      this.source = source;
+      this.regex = Pattern.compile(regex);
+    }
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
new file mode 100644
index 0000000..f8f46ee
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or
+ * extracted text, a configurable field is set to true.
+ */
+package org.apache.nutch.parsefilter.regex;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
new file mode 100644
index 0000000..9bd7149
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.regex;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import junit.framework.TestCase;
+
+/**
+ * Runs {@link RegexParseFilter} with the rules of the sample file
+ * regex-parsefilter.txt and checks the parse metadata fields it sets.
+ */
+public class TestRegexParseFilter extends TestCase {
+
+  private final static String SAMPLE_DIR = System.getProperty("test.data", ".");
+  private final static String FILE_SEP = System.getProperty("file.separator");
+  private final static String PAGE_URL = "http://nutch.apache.org/";
+
+  /** Markup and text that both rules match must set both fields to "true". */
+  public void testPositiveFilter() throws Exception {
+    Metadata meta = runFilter(
+        "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>",
+        "nutch this is the extracted text blablabla");
+
+    assertEquals("true", meta.get("first"));
+    assertEquals("true", meta.get("second"));
+  }
+
+  /** Markup and text that neither rule matches must set both fields to "false". */
+  public void testNegativeFilter() throws Exception {
+    Metadata meta = runFilter(
+        "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>",
+        "nutch this is the extracted text bla");
+
+    assertEquals("false", meta.get("first"));
+    assertEquals("false", meta.get("second"));
+  }
+
+  /**
+   * Configures a filter from the sample rule file, applies it to a single
+   * document and returns the parse metadata of that document.
+   *
+   * @param html raw page content, handed to the filter as UTF-8 bytes
+   * @param text extracted text of the page
+   */
+  private static Metadata runFilter(String html, String text) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    RegexParseFilter filter = new RegexParseFilter(SAMPLE_DIR + FILE_SEP + "regex-parsefilter.txt");
+    filter.setConf(conf);
+
+    Content content = new Content(PAGE_URL, PAGE_URL, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
+    Parse parse = new ParseImpl(text, new ParseData());
+
+    ParseResult result = ParseResult.createParseResult(PAGE_URL, parse);
+    filter.filter(content, result, null, null);
+
+    return parse.getData().getParseMeta();
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
new file mode 100644
index 0000000..9d15cd8
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
@@ -0,0 +1,10 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to true if the source matches the regex,
+# and to false otherwise. The source can either be html or text. If source is
+# html, the regex is applied to the raw HTML content. If source is text, the
+# regex is applied to the extracted text.
+#
+# format: <name>\t<source>\t<regex>\n
+first html h1
+second text blablabla
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin.dtd
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin.dtd b/nutch-plugins/plugin.dtd
new file mode 100644
index 0000000..9b67da7
--- /dev/null
+++ b/nutch-plugins/plugin.dtd
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ! Licensed to the Apache Software Foundation (ASF) under one or more
+ ! contributor license agreements. See the NOTICE file distributed with
+ ! this work for additional information regarding copyright ownership.
+ ! The ASF licenses this file to You under the Apache License, Version 2.0
+ ! (the "License"); you may not use this file except in compliance with
+ ! the License. You may obtain a copy of the License at
+ !
+ ! http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an "AS IS" BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !
+ !
+ ! Document : plugin.dtd
+ ! Created on : 14 avril 2006, 22:14
+ ! Author : Chris Mattmann, Jerome Charron
+ ! Description: Nutch plug-in manifest DTD
+ !
+ ! PUBLIC ID : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN
+ ! SYSTEM ID : http://lucene.apache.org/nutch/plugin.dtd
+-->
+
+
+
+<!--
+ ! The <plugin> element defines the body of the manifest.
+ ! It optionally contains definitions for the plug-in runtime,
+ ! definitions of other plug-ins required by this one,
+ ! declarations of any new extension points being introduced by the plug-in,
+ ! as well as configuration of functional extensions
+ ! (configured into extension points defined by other plug-ins,
+ ! or introduced by this plug-in).
+ !-->
+<!ELEMENT plugin (runtime?, requires?, extension-point*, extension*)>
+
+<!-- A user displayable name for the plug-in -->
+<!ATTLIST plugin name CDATA #REQUIRED>
+
+<!--
+ ! A unique identifier for the plug-in.
+ ! To minimize potential for naming collisions,
+ ! the identifier should be derived from the internet domain id
+ ! of the supplying provider (reversing the domain name tokens and
+ ! appending additional name tokens separated by dot [.]).
+ ! For example, provider nutch.org could define plug-in identifier
+ ! org.nutch.myplugin
+ !-->
+<!ATTLIST plugin id CDATA #REQUIRED>
+
+<!--
+ ! The plug-in version number.
+ ! NOTE : Version number compatibility is not yet implemented.
+ !-->
+<!ATTLIST plugin version CDATA #REQUIRED>
+
+<!-- The user-displayable name of the provider supplying the plug-in. -->
+<!ATTLIST plugin provider-name CDATA #IMPLIED>
+
+<!--
+ ! The name of the plug-in class for this plug-in.
+ ! The class must be a subclass of org.apache.nutch.plugin.Plugin
+ !-->
+<!ATTLIST plugin class CDATA #IMPLIED>
+
+
+<!--
+ ! The <requires> section of the manifest declares
+ ! any dependencies on other plug-ins.
+ !-->
+<!ELEMENT requires (import+)>
+
+
+<!-- Each dependency is specified using an <import> element. -->
+<!ELEMENT import EMPTY>
+
+<!-- The identifier of the required plug-in. -->
+<!ATTLIST import plugin CDATA #REQUIRED>
+
+
+<!--
+ ! The <runtime> section of the manifest contains a definition of one or more
+ ! libraries that make up the plug-in runtime.
+ ! The referenced libraries are used by the plugin execution mechanisms
+ ! (the plug-in class loader) to load and execute the correct code required by
+ ! the plug-in.
+ !-->
+<!ELEMENT runtime (library+)>
+
+
+<!--
+ !The <library> elements collectively define the plug-in runtime.
+ ! At least one <library> must be specified.
+ !-->
+<!ELEMENT library (export*)>
+
+<!--
+ ! A string reference to a library file or directory containing classes
+ ! (relative to the plug-in install directory).
+ ! Directory references must contain trailing file separator.
+ !-->
+<!ATTLIST library name CDATA #REQUIRED>
+
+
+<!--
+ ! Each <library> element can specify which portion
+ ! of the library should be exported.
+ ! The export rules are specified as a set of export masks.
+ ! By default (no export rules specified),
+ ! the library is considered to be private.
+ ! Each export mask is specified using the name attribute.
+ !-->
+<!ELEMENT export EMPTY>
+
+<!--
+ ! The export mask can have the following values:
+ ! * - indicates all contents of library are exported (public)
+ ! package.name.* - indicates all classes in the specified package
+ ! are exported. The matching rules are the same as in the
+ ! Java import statement.
+ ! package.name.ClassName - fully qualified java class name
+ !
+ ! NOTE : export mask is not yet implemented in Nutch.
+ !-->
+<!ATTLIST export name CDATA #REQUIRED>
+
+
+<!--
+ ! Nutch's architecture is based on the notion of configurable extension points.
+ ! Nutch itself predefines a set of extension points that cover the task of
+ ! extending it (for example, adding parser, indexing filter, ...).
+ ! In addition to the predefined extension points, each supplied plug-in can
+ ! declare additional extension points. By declaring an extension point the
+ ! plug-in is essentially advertising the ability to configure the plug-in
+ ! function with externally supplied extensions.
+ !-->
+<!ELEMENT extension-point EMPTY>
+
+<!-- A user-displayable name for the extension point. -->
+<!ATTLIST extension-point name CDATA #REQUIRED>
+
+<!-- A simple id, unique within this plug-in -->
+<!ATTLIST extension-point id CDATA #REQUIRED>
+
+
+<!--
+ ! Actual extensions are configured into extension points
+ ! (predefined, or newly declared in this plug-in) in the <extension> section.
+ !
+ ! The configuration information is specified by at least one implementation
+ ! with some parameters.
+ !-->
+<!ELEMENT extension (implementation+)>
+
+<!--
+ ! A reference to an extension point being configured.
+ ! The extension point can be one defined in this plug-in or another plug-in.
+ !-->
+<!ATTLIST extension point CDATA #REQUIRED>
+
+<!--
+ ! Optional identifier for this extension point configuration instance.
+ ! This is used by extension points that need to uniquely identify
+ ! (rather than just enumerate) the specific configured extensions.
+ ! The identifier is specified as a simple token unique within the definition
+ ! of the declaring plug-in. When used globally, the extension identifier
+ ! is qualified by the plug-in identifier.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension id CDATA #IMPLIED>
+
+<!--
+ ! A user-displayable name for the extension.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension name CDATA #IMPLIED>
+
+
+<!--
+ ! Defines a specific implementation for the extension.
+ ! This implementation can define some special name/value parameters
+ ! used at runtime.
+ !-->
+<!ELEMENT implementation (parameter*)>
+
+<!-- A unique identifier for this implementation -->
+<!ATTLIST implementation id CDATA #REQUIRED>
+
+<!-- The fully-qualified Java Class that implements this extension-point -->
+<!ATTLIST implementation class CDATA #REQUIRED>
+
+
+<!-- Defines a name/value parameter -->
+<!ELEMENT parameter EMPTY>
+
+<!-- The parameter's name (should be unique for an extension) -->
+<!ATTLIST parameter name CDATA #REQUIRED>
+
+<!-- The parameter's value -->
+<!ATTLIST parameter value CDATA #REQUIRED>
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin/pom.xml b/nutch-plugins/plugin/pom.xml
new file mode 100644
index 0000000..2ac06ee
--- /dev/null
+++ b/nutch-plugins/plugin/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>plugin</artifactId>
+ <packaging>jar</packaging>
+
+ <name>plugin</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
new file mode 100644
index 0000000..e07f487
--- /dev/null
+++ b/nutch-plugins/pom.xml
@@ -0,0 +1,164 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-parent</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>nutch-plugins</artifactId>
+ <packaging>pom</packaging>
+
+ <name>nutch-plugins</name>
+ <url>http://nutch.apache.org</url>
+
+ <modules>
+ <!--<module>indexer-solr</module>-->
+ <module>creativecommons</module>
+ <module>feed</module>
+ <module>headings</module>
+ <module>index-anchor</module>
+ <module>index-basic</module>
+ <module>index-geoip</module>
+ <module>index-links</module>
+ <module>index-metadata</module>
+ <module>index-more</module>
+ <module>index-replace</module>
+ <module>index-static</module>
+ <module>indexer-cloudsearch</module>
+ <module>indexer-dummy</module>
+ <module>indexer-elastic</module>
+ <module>indexer-solr</module>
+ <module>language-identifier</module>
+ <module>lib-htmlunit</module>
+ <module>lib-http</module>
+ <module>lib-nekohtml</module>
+ <module>lib-regex-filter</module>
+ <module>lib-selenium</module>
+ <module>lib-xml</module>
+ <module>microformats-reltag</module>
+ <module>mimetype-filter</module>
+ <module>nutch-extensionpoints</module>
+ <module>parse-ext</module>
+ <module>parse-html</module>
+ <module>parse-js</module>
+ <module>parse-metatags</module>
+ <module>parse-replace</module>
+ <module>parse-swf</module>
+ <module>parse-tika</module>
+ <module>parse-zip</module>
+ <module>parsefilter-naivebayes</module>
+ <module>parsefilter-regex</module>
+ <module>plugin</module>
+ <module>protocol-file</module>
+ <module>protocol-ftp</module>
+ <module>protocol-htmlunit</module>
+ <module>protocol-http</module>
+ <module>protocol-httpclient</module>
+ <module>protocol-interactiveselenium</module>
+ <module>protocol-selenium</module>
+ <module>scoring-depth</module>
+ <module>scoring-link</module>
+ <module>scoring-opic</module>
+ <module>scoring-similarity</module>
+ <module>subcollection</module>
+ <module>tld</module>
+ <module>urlfilter-automaton</module>
+ <module>urlfilter-domain</module>
+ <module>urlfilter-domainblacklist</module>
+ <module>urlfilter-ignoreexempt</module>
+ <module>urlfilter-prefix</module>
+ <module>urlfilter-regex</module>
+ <module>urlfilter-suffix</module>
+ <module>urlfilter-validator</module>
+ <module>urlmeta</module>
+ <module>urlnormalizer-ajax</module>
+ <module>urlnormalizer-basic</module>
+ <module>urlnormalizer-host</module>
+ <module>urlnormalizer-pass</module>
+ <module>urlnormalizer-protocol</module>
+ <module>urlnormalizer-querystring</module>
+ <module>urlnormalizer-regex</module>
+ <module>urlnormalizer-slash</module>
+ </modules>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <!-- Note : an additional level is for the child modules (defined ahead in hierarchy)-->
+ <dir.root>..${file.separator}..${file.separator}</dir.root>
+ <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-core</artifactId>
+ <version>${project.parent.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-core</artifactId>
+ <version>${project.parent.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ </dependencies>
+ <build>
+ <finalName>${project.artifactId}</finalName>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
+ </plugins>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/build.xml b/nutch-plugins/protocol-file/build.xml
new file mode 100644
index 0000000..121b1fe
--- /dev/null
+++ b/nutch-plugins/protocol-file/build.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-file" default="jar-core">
+
+ <!-- Ant build for the protocol-file plugin. Everything except the test-data
+ staging below comes from the shared ../build-plugin.xml (presumably
+ including the default "jar-core" target - confirm there). -->
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <!-- These tasks are declared outside any target, so they run whenever this
+ build file is processed: *.txt files from "sample" are copied to
+ ${build.test}/data.
+ NOTE(review): this commit adds the test .txt files under
+ src/test/resources; confirm the "sample" directory still exists. -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.txt"/>
+ </fileset>
+ </copy>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/ivy.xml b/nutch-plugins/protocol-file/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/protocol-file/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <!-- Ivy descriptor for the protocol-file plugin; the module name is taken
+ from the Ant project name. -->
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- NOTE(review): the include path contains a doubled slash and climbs
+ three directory levels; verify it still resolves from
+ nutch-plugins/protocol-file/. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <!-- This plugin declares no additional Ivy dependencies. -->
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/plugin.xml b/nutch-plugins/protocol-file/plugin.xml
new file mode 100644
index 0000000..1647ce4
--- /dev/null
+++ b/nutch-plugins/protocol-file/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="protocol-file"
+ name="File Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <!-- The plugin code ships in protocol-file.jar; all of its names are
+ exported. -->
+ <runtime>
+ <library name="protocol-file.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <!-- Registers org.apache.nutch.protocol.file.File at the
+ org.apache.nutch.protocol.Protocol extension point, with the
+ protocolName parameter set to "file". -->
+ <extension id="org.apache.nutch.protocol.file"
+ name="FileProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.file.File"
+ class="org.apache.nutch.protocol.file.File">
+ <parameter name="protocolName" value="file"/>
+ </implementation>
+
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/pom.xml b/nutch-plugins/protocol-file/pom.xml
new file mode 100644
index 0000000..2ab2f75
--- /dev/null
+++ b/nutch-plugins/protocol-file/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <!-- Maven module for the protocol-file plugin. It declares no dependencies
+ or build section of its own; presumably both are inherited from the
+ nutch-plugins parent POM - confirm in ../pom.xml. -->
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>protocol-file</artifactId>
+ <packaging>jar</packaging>
+
+ <name>protocol-file</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <!-- Source files are read as UTF-8. -->
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
new file mode 100644
index 0000000..2712218
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
@@ -0,0 +1,228 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.util.NutchConfiguration;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for file: scheme. It creates
+ * {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and
+ * {@code file.crawl.parent} in nutch-default.xml defined under
+ * "file properties" section.
+ *
+ * @author John Xing
+ */
+public class File implements Protocol {
+
+ public static final Logger LOG = LoggerFactory.getLogger(File.class);
+
+ static final int MAX_REDIRECTS = 5;
+
+ int maxContentLength;
+ boolean crawlParents;
+
+ /**
+ * if true return a redirect for symbolic links and do not resolve the links
+ * internally
+ */
+ boolean symlinksAsRedirects = true;
+
+ private Configuration conf;
+
+ public File() {
+ }
+
+  /**
+   * Stores the given {@link Configuration} and caches the file protocol
+   * settings read from it:
+   * <ul>
+   * <li>{@code file.content.limit} (default 64 * 1024),</li>
+   * <li>{@code file.crawl.parent} (default {@code true}),</li>
+   * <li>{@code file.crawl.redirect_noncanonical} (default {@code true}).</li>
+   * </ul>
+   *
+   * @param conf
+   *          configuration to keep and to read the settings from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    final int contentLimit = conf.getInt("file.content.limit", 64 * 1024);
+    final boolean followParents = conf.getBoolean("file.crawl.parent", true);
+    final boolean redirectNonCanonical = conf.getBoolean(
+        "file.crawl.redirect_noncanonical", true);
+
+    this.maxContentLength = contentLimit;
+    this.crawlParents = followParents;
+    this.symlinksAsRedirects = redirectNonCanonical;
+  }
+
+ /**
+ * Get the {@link Configuration} object
+ *
+ * @return the configuration last passed to
+ * {@link #setConf(Configuration)}, or {@code null} if none has been
+ * set yet
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Set the length beyond which file content is truncated. This replaces the
+ * value read from {@code file.content.limit} by
+ * {@link #setConf(Configuration)}.
+ *
+ * @param maxContentLength
+ * maximum number of content bytes to keep; {@link FileResponse}
+ * only truncates when this value is not negative
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+  /**
+   * Creates a {@link FileResponse} object corresponding to the url and returns
+   * a {@link ProtocolOutput} object as per the content received.
+   * <p>
+   * The HTTP-like response code is mapped as follows: 200 to a plain success
+   * output, 304 to {@link ProtocolStatus#STATUS_NOTMODIFIED}, 401 to
+   * {@link ProtocolStatus#ACCESS_DENIED}, 404 to
+   * {@link ProtocolStatus#STATUS_NOTFOUND}. Any other 3xx code is a redirect
+   * to the {@code Location} header: it is either reported as
+   * {@link ProtocolStatus#MOVED} (when symbolic links are treated as
+   * redirects) or followed internally, up to {@link #MAX_REDIRECTS} times.
+   * Every other code, and any exception, yields an output without content
+   * whose status wraps the exception.
+   *
+   * @param url
+   *          Text containing the url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
+   *
+   * @return {@link ProtocolOutput} object for the content of the file indicated
+   *         by url
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    String urlString = url.toString();
+    try {
+      URL u = new URL(urlString);
+
+      int redirects = 0;
+
+      while (true) {
+        // make a request
+        FileResponse response = new FileResponse(u, datum, this, getConf());
+
+        int code = response.getCode();
+
+        if (code == 200) { // got a good response
+          return new ProtocolOutput(response.toContent()); // return it
+
+        } else if (code == 304) { // got not modified
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTMODIFIED);
+
+        } else if (code == 401) { // access denied / no read permissions
+          return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+              ProtocolStatus.ACCESS_DENIED));
+
+        } else if (code == 404) { // no such file
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTFOUND);
+
+        } else if (code >= 300 && code < 400) { // handle redirect
+          u = new URL(response.getHeader("Location"));
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("redirect to " + u);
+          }
+          if (symlinksAsRedirects) {
+            // let the caller decide whether to follow the redirect
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.MOVED, u));
+          } else if (redirects == MAX_REDIRECTS) {
+            LOG.trace("Too many redirects: {}", url);
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.REDIR_EXCEEDED, u));
+          }
+          // resolve the redirect internally with the next loop iteration
+          redirects++;
+
+        } else { // convert to exception
+          throw new FileError(code);
+        }
+      }
+    } catch (Exception e) {
+      // report through the logger instead of dumping the trace on stderr
+      LOG.warn("Failed to fetch " + urlString, e);
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /**
+   * Quick way for running this class. Useful for debugging.
+   * <p>
+   * Fetches the given url and prints its status and main headers on stderr;
+   * with {@code -dumpContent} the fetched bytes are written to stdout. The
+   * usage message is printed and the JVM exits with -1 when no url is given,
+   * when an option is missing its value, or when an unknown argument precedes
+   * the url.
+   *
+   * @param args
+   *          {@code [-logLevel level] [-maxContentLength L] [-dumpContent] url}
+   * @throws Exception
+   *           if the option value or the fetch setup fails unexpectedly
+   */
+  public static void main(String[] args) throws Exception {
+    int maxContentLength = Integer.MIN_VALUE;
+    boolean dumpContent = false;
+    String urlString = null;
+
+    String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-logLevel")) {
+        // The option is still accepted, but its value is not applied: the
+        // logger level is not set from here.
+        if (++i >= args.length) {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+      } else if (args[i].equals("-maxContentLength")) {
+        if (++i >= args.length) {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+        maxContentLength = Integer.parseInt(args[i]);
+      } else if (args[i].equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        urlString = args[i];
+      }
+    }
+
+    // covers both "no arguments" and "options only, no url"
+    if (urlString == null) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    File file = new File();
+    file.setConf(NutchConfiguration.create());
+
+    if (maxContentLength != Integer.MIN_VALUE) { // set maxContentLength
+      file.setMaxContentLength(maxContentLength);
+    }
+
+    ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
+        new CrawlDatum());
+    Content content = output.getContent();
+
+    if (content == null) {
+      // getProtocolOutput() returns no content when the fetch failed with an
+      // exception; report the status instead of failing on a null content
+      System.err.println("URL: " + urlString);
+      System.err.println("Status: " + output.getStatus());
+      System.exit(-1);
+    }
+
+    System.err.println("URL: " + content.getUrl());
+    System.err.println("Status: " + output.getStatus());
+    System.err.println("Content-Type: " + content.getContentType());
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
+    String redirectLocation = content.getMetadata().get("Location");
+    if (redirectLocation != null) {
+      System.err.println("Location: " + redirectLocation);
+    }
+
+    if (dumpContent) {
+      // write the raw bytes: decoding them with the platform charset first
+      // could corrupt the dump
+      byte[] bytes = content.getContent();
+      System.out.write(bytes, 0, bytes.length);
+      System.out.flush();
+    }
+  }
+
+ /**
+ * No robots parsing is done for file protocol. So this returns a set of empty
+ * rules which will allow every url.
+ *
+ * @param url
+ * the url in question (not inspected)
+ * @param datum
+ * the CrawlDatum of the url (not inspected)
+ * @return always {@link RobotRulesParser#EMPTY_RULES}
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return RobotRulesParser.EMPTY_RULES;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
new file mode 100644
index 0000000..4fef340
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+/**
+ * Thrown for File error codes, i.e. when a file response carries a code that
+ * the caller cannot map to a regular protocol status.
+ */
+public class FileError extends FileException {
+
+  /** The response code that caused this error. */
+  private final int code;
+
+  /**
+   * Creates an error for the given response code. The exception message is
+   * {@code "File Error: " + code}.
+   *
+   * @param code
+   *          the offending response code
+   */
+  public FileError(int code) {
+    super("File Error: " + code);
+    this.code = code;
+  }
+
+  /**
+   * Returns the response code this error was created with.
+   *
+   * @return the response code passed to the constructor
+   */
+  public int getCode() {
+    return this.code;
+  }
+
+  /**
+   * Returns the response code this error was created with. This overload used
+   * to echo its own argument back; it now returns the stored code.
+   *
+   * @param code
+   *          ignored
+   * @return the response code passed to the constructor
+   * @deprecated the argument has no meaning, use {@link #getCode()} instead
+   */
+  @Deprecated
+  public int getCode(int code) {
+    return this.code;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
new file mode 100644
index 0000000..f0467de
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+/**
+ * Base exception of the file protocol plugin. It is a
+ * {@link ProtocolException} whose constructors all delegate to the
+ * corresponding superclass constructor.
+ */
+public class FileException extends ProtocolException {
+
+ /** Creates an exception without message or cause. */
+ public FileException() {
+ super();
+ }
+
+ /**
+ * @param message
+ * description of the failure
+ */
+ public FileException(String message) {
+ super(message);
+ }
+
+ /**
+ * @param message
+ * description of the failure
+ * @param cause
+ * the underlying exception
+ */
+ public FileException(String message, Throwable cause) {
+ super(message, cause);
+ }
+
+ /**
+ * @param cause
+ * the underlying exception
+ */
+ public FileException(Throwable cause) {
+ super(cause);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
new file mode 100644
index 0000000..b6e74ff
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -0,0 +1,317 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// JDK imports
+import java.net.URL;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
+// Tika imports
+import org.apache.tika.Tika;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/************************************
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ *
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ *
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably okay for now. Could be buggy here. How about special files on
+ * windows?
+ *
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ *
+ * (4) No fancy POSIX file attributes yet. May never need?
+ *
+ * @author John Xing
+ ***********************************/
+public class FileResponse {
+
+ private String orig;
+ private String base;
+ private byte[] content;
+ private static final byte[] EMPTY_CONTENT = new byte[0];
+ private int code;
+ private Metadata headers = new Metadata();
+
+ private final File file;
+ private Configuration conf;
+
+ private MimeUtil MIME;
+ private Tika tika;
+
+ /**
+ * Returns the response code.
+ *
+ * @return the HTTP-like code chosen while building this response; this class
+ * sets 200, 300, 304, 401, 404 or 500
+ */
+ public int getCode() {
+ return code;
+ }
+
+ /**
+ * Returns the value of a named header.
+ *
+ * @param name
+ * header name, e.g. {@link Response#CONTENT_TYPE}
+ * @return whatever the header {@link Metadata} holds for that name
+ */
+ public String getHeader(String name) {
+ return headers.get(name);
+ }
+
+ /**
+ * Returns the captured content.
+ *
+ * @return the bytes read from the file or the generated directory listing;
+ * {@code null} for responses built without content (codes 300, 304,
+ * 401, 404 and 500)
+ */
+ public byte[] getContent() {
+ return content;
+ }
+
+ public Content toContent() {
+ return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
+ getHeader(Response.CONTENT_TYPE), headers, this.conf);
+ }
+
+  /**
+   * Builds an HTTP-like response for a {@code file:} url.
+   * <p>
+   * The url path is URL-decoded as UTF-8 and mapped to a local
+   * {@link java.io.File}. The response code is then chosen as follows:
+   * <ul>
+   * <li>404 if the file does not exist,</li>
+   * <li>401 if it cannot be read,</li>
+   * <li>300, with the canonical location in the {@link Response#LOCATION}
+   * header, if the path differs from its canonical form (symbolic link or
+   * relative path),</li>
+   * <li>304 if the file was not modified after
+   * {@code datum.getModifiedTime()},</li>
+   * <li>200, with content and headers, for directories and regular files,</li>
+   * <li>500 for anything else.</li>
+   * </ul>
+   *
+   * @param url
+   *          the {@code file:} url to fetch
+   * @param datum
+   *          crawl datum of the url; its modified time drives the 304 check
+   * @param file
+   *          the protocol instance providing the content limit and the
+   *          parent-crawling setting
+   * @param conf
+   *          configuration handed on to the created {@link Content}
+   * @throws FileException
+   *           if the url is not a {@code file:} url or the file is too large
+   * @throws IOException
+   *           if the file system cannot be accessed
+   */
+  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
+      throws FileException, IOException {
+
+    this.orig = url.toString();
+    this.base = url.toString();
+    this.file = file;
+    this.conf = conf;
+
+    MIME = new MimeUtil(conf);
+    tika = new Tika();
+
+    if (!"file".equals(url.getProtocol()))
+      throw new FileException("Not a file url:" + url);
+
+    if (File.LOG.isTraceEnabled()) {
+      File.LOG.trace("fetching " + url);
+    }
+
+    // Compare the strings by value: '!=' only compares object identity and
+    // says nothing about whether path and file actually differ.
+    String urlPath = url.getPath();
+    String urlFile = url.getFile();
+    boolean samePathAndFile = (urlPath == null) ? (urlFile == null) : urlPath
+        .equals(urlFile);
+    if (!samePathAndFile) {
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("url.getPath() != url.getFile(): " + url);
+      }
+    }
+
+    String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+    try {
+      // specify the encoding via the config later?
+      path = java.net.URLDecoder.decode(path, "UTF-8");
+    } catch (UnsupportedEncodingException ex) {
+      // cannot happen: every Java platform is required to support UTF-8
+    }
+
+    this.content = null;
+
+    // url.toURI() is only in j2se 1.5.0
+    // java.io.File f = new java.io.File(url.toURI());
+    java.io.File f = new java.io.File(path);
+
+    if (!f.exists()) {
+      this.code = 404; // http Not Found
+      return;
+    }
+
+    if (!f.canRead()) {
+      this.code = 401; // http Unauthorized
+      return;
+    }
+
+    // symbolic link or relative path on unix
+    // fix me: what's the consequence on windows platform
+    // where case is insensitive
+    if (!f.equals(f.getCanonicalFile())) {
+      // Go through File.toURI() so that characters which are illegal in
+      // URLs get escaped automatically before the Location header is set.
+      headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
+          .toString());
+
+      this.code = 300; // http redirect
+      return;
+    }
+    if (f.lastModified() <= datum.getModifiedTime()) {
+      this.code = 304;
+      this.headers.set("Last-Modified",
+          HttpDateFormat.toString(f.lastModified()));
+      return;
+    }
+
+    if (f.isDirectory()) {
+      getDirAsHttpResponse(f);
+    } else if (f.isFile()) {
+      getFileAsHttpResponse(f);
+    } else {
+      this.code = 500; // http Internal Server Error
+    }
+
+  }
+
+  /**
+   * Reads a regular file into {@link #content} and sets the headers
+   * {@code Content-Length} (the full file size, even when the content is
+   * truncated), {@code Last-Modified} and {@code Content-Type} (as detected by
+   * Tika, empty if unknown). The response code is set to 200.
+   * <p>
+   * At most {@code maxContentLength} bytes are read when that limit is not
+   * negative.
+   *
+   * @param f
+   *          the regular file to read
+   * @throws FileException
+   *           if the file is larger than {@link Integer#MAX_VALUE} bytes
+   * @throws IOException
+   *           if the file cannot be opened or read
+   */
+  private void getFileAsHttpResponse(java.io.File f) throws FileException,
+      IOException {
+
+    // ignore file of size larger than
+    // Integer.MAX_VALUE = 2^31-1 = 2147483647
+    long size = f.length();
+    if (size > Integer.MAX_VALUE) {
+      throw new FileException("file is too large, size: " + size);
+    }
+
+    // capture content
+    int len = (int) size;
+
+    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
+      len = this.file.maxContentLength;
+
+    this.content = new byte[len];
+
+    int offset = 0;
+    java.io.InputStream is = new java.io.FileInputStream(f);
+    try {
+      int n;
+      while (offset < len
+          && (n = is.read(this.content, offset, len - offset)) >= 0) {
+        offset += n;
+      }
+    } finally {
+      // release the file handle even if read() fails
+      is.close();
+    }
+    if (offset < len) { // keep whatever already have, but issue a warning
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("not enough bytes read from file: " + f.getPath());
+      }
+    }
+
+    // set headers
+    headers.set(Response.CONTENT_LENGTH, Long.toString(size));
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    String mimeType = tika.detect(f);
+
+    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * Renders a directory as an html listing, stores it in {@link #content} and
+   * sets the headers {@code Content-Length}, {@code Content-Type}
+   * ({@code text/html}) and {@code Last-Modified}. The response code is set
+   * to 200.
+   * <p>
+   * A link to the parent directory is included only when parent crawling is
+   * enabled and the directory is not {@code "/"}.
+   *
+   * @param f
+   *          the directory to list
+   * @throws IOException
+   *           if the directory entries cannot be listed
+   */
+  private void getDirAsHttpResponse(java.io.File f) throws IOException {
+
+    String path = f.toString();
+
+    // listFiles() signals an I/O error by returning null; fail with a clear
+    // message rather than with a NullPointerException further down
+    java.io.File[] entries = f.listFiles();
+    if (entries == null) {
+      throw new IOException("Cannot list directory: " + path);
+    }
+
+    boolean includeParent = this.file.crawlParents && !"/".equals(path);
+    this.content = list2html(entries, path, includeParent);
+
+    // set headers
+    headers.set(Response.CONTENT_LENGTH,
+        Integer.toString(this.content.length));
+    headers.set(Response.CONTENT_TYPE, "text/html");
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * Generates an html page ("Index of ...") from a directory listing. Each
+   * directory and each regular file becomes one tab-separated line holding a
+   * link, the last-modified time and, for regular files, the size; entries
+   * that are neither are left out.
+   *
+   * @param list
+   *          the directory entries to render, in the order given
+   * @param path
+   *          the directory path shown in the title and the heading
+   * @param includeDotDot
+   *          whether to start the listing with a link to {@code ../}
+   * @return the page as bytes in the platform default charset
+   */
+  private byte[] list2html(java.io.File[] list, String path,
+      boolean includeDotDot) {
+
+    // NOTE(review): path and entry names are inserted without html or url
+    // escaping, and the bytes depend on the platform charset - confirm
+    // whether that is acceptable for the file names being crawled.
+    StringBuilder html = new StringBuilder("<html><head>");
+    html.append("<title>Index of ").append(path).append("</title></head>\n");
+    html.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");
+
+    if (includeDotDot) {
+      html.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    // fix me: we might want to sort list here! but not now.
+
+    for (java.io.File entry : list) {
+      String name = entry.getName();
+      String time = HttpDateFormat.toString(entry.lastModified());
+      if (entry.isDirectory()) {
+        // listFiles() does not report "." and "..", so no filtering is needed
+        html.append("<a href='").append(name).append("/").append("'>")
+            .append(name).append("/</a>\t");
+        html.append(time).append("\t-\n");
+      } else if (entry.isFile()) {
+        html.append("<a href='").append(name).append("'>").append(name)
+            .append("</a>\t");
+        html.append(time).append("\t").append(entry.length()).append("\n");
+      }
+      // anything else (neither directory nor regular file) is ignored
+    }
+
+    html.append("</pre></body></html>\n");
+
+    return html.toString().getBytes();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
new file mode 100644
index 0000000..221c79c
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving local file resources.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
new file mode 100644
index 0000000..5f95377
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for the {@code file:} protocol plugin ({@link File}).
+ *
+ * @author mattmann
+ * @version $Revision$
+ */
+public class TestProtocolFile {
+
+  /** Names of the sample files fetched by the test. */
+  private static final String[] testTextFiles = new String[] {
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
+      "testprotocolfile_%28encoded%29.txt" };
+
+  private static final CrawlDatum datum = new CrawlDatum();
+
+  /** Content type expected for every sample file. */
+  private static final String expectedMimeType = "text/plain";
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  /** Directory holding the sample files; defaults to the working directory. */
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+  }
+
+  /** Runs the content type check on every sample file. */
+  @Test
+  public void testSetContentType() throws ProtocolException {
+    for (int i = 0; i < testTextFiles.length; i++) {
+      setContentType(testTextFiles[i]);
+    }
+  }
+
+  /**
+   * Fetches one sample file through the protocol registered for its url and
+   * checks that the fetch succeeds and that the
+   * <code>Response.CONTENT_TYPE</code> metadata field as well as the content
+   * type of the content equal the expected mime type.
+   *
+   * @param testTextFile
+   *          name of the sample file below the sample directory
+   * @since NUTCH-384
+   */
+  public void setContentType(String testTextFile) throws ProtocolException {
+    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
+    Assert.assertNotNull(urlString);
+
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
+        datum);
+    Assert.assertNotNull(output);
+
+    ProtocolStatus status = output.getStatus();
+    String failure = "Status code: [" + status.getCode()
+        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
+        + status.getArgs() + "]";
+    Assert.assertEquals(failure, ProtocolStatus.SUCCESS, status.getCode());
+
+    Assert.assertNotNull(output.getContent());
+    Assert.assertNotNull(output.getContent().getContentType());
+    Assert.assertEquals(expectedMimeType, output.getContent().getContentType());
+    Assert.assertNotNull(output.getContent().getMetadata());
+    Assert.assertEquals(expectedMimeType, output.getContent().getMetadata()
+        .get(Response.CONTENT_TYPE));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
@@ -0,0 +1 @@
+Protocol File Test
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
@@ -0,0 +1 @@
+Protocol File Test
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/build.xml b/nutch-plugins/protocol-ftp/build.xml
new file mode 100644
index 0000000..79314d4
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-ftp" default="jar-core">
+ <!-- No targets are defined in this file; the default "jar-core" is expected from the import below. -->
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/ivy.xml b/nutch-plugins/protocol-ftp/ivy.xml
new file mode 100644
index 0000000..214c445
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+ <!-- NOTE(review): the include climbs three levels from nutch-plugins/protocol-ftp; confirm it still reaches ivy/. -->
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/plugin.xml b/nutch-plugins/protocol-ftp/plugin.xml
new file mode 100644
index 0000000..1421e37
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="protocol-ftp"
+ name="Ftp Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="protocol-ftp.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-net-1.2.2.jar"/> <!-- keep in sync with the commons-net rev in ivy.xml -->
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.protocol.ftp"
+ name="FtpProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.ftp.Ftp"
+ class="org.apache.nutch.protocol.ftp.Ftp">
+ <parameter name="protocolName" value="ftp"/>
+ </implementation>
+
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/pom.xml b/nutch-plugins/protocol-ftp/pom.xml
new file mode 100644
index 0000000..fe9a61b
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>protocol-ftp</artifactId>
+ <packaging>jar</packaging>
+
+ <name>protocol-ftp</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <!-- NOTE(review): ivy.xml declares commons-net 1.2.2, none is declared here; confirm the parent POM provides it. -->
+</project>