You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:25 UTC

[09/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/plugin.xml b/nutch-plugins/parsefilter-regex/plugin.xml
new file mode 100644
index 0000000..0725492
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-regex"
+   name="Regex Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.regex"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RegexParseFilter" 
+                      class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
+          <parameter name="file" value="regex-parsefilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/pom.xml b/nutch-plugins/parsefilter-regex/pom.xml
new file mode 100644
index 0000000..19b6452
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
new file mode 100644
index 0000000..0752c91
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.regex;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.FileReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.w3c.dom.*;
+
+/**
+ * RegexParseFilter. For every configured rule, a parse metadata field is set
+ * to "true" if the rule's regular expression matches its source (either the
+ * raw HTML or the extracted text) and to "false" otherwise.
+ */
+public class RegexParseFilter implements HtmlParseFilter {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class);
+  private static String attributeFile = null;
+  private String regexFile = null;
+  
+  private Configuration conf;
+  private DocumentFragment doc;
+  
+  private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>();
+  
+  /**
+   * Creates a filter without an explicit rule file; the rule source is then
+   * resolved in {@link #setConf(Configuration)}.
+   */
+  public RegexParseFilter() {}
+  
+  /**
+   * Creates a filter that reads its rules from the given file.
+   *
+   * @param regexFile name of the rule file; in {@link #setConf(Configuration)}
+   *          it is preferred over the plugin.xml attribute and the
+   *          <code>parsefilter.regex.file</code> property
+   */
+  public RegexParseFilter(String regexFile) {
+    this.regexFile = regexFile;
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+    String html = new String(content.getContent());
+    String text = parse.getText();
+    
+    for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
+      String field = entry.getKey();
+      RegexRule regexRule = entry.getValue();
+      
+      String source = null;
+      if (regexRule.source.equalsIgnoreCase("html")) {
+        source = html;
+      }
+      if (regexRule.source.equalsIgnoreCase("text")) {
+        source = text;
+      }
+      
+      if (source == null) {
+        LOG.error("source for regex rule: " + field + " misconfigured");
+      }
+      
+      if (matches(source, regexRule.regex)) {
+        parse.getData().getParseMeta().set(field, "true");
+      } else {
+        parse.getData().getParseMeta().set(field, "false");
+      }
+    }
+    
+    return parseResult;
+  }
+
+  /**
+   * Stores the configuration and loads the regex rules.
+   * <p>
+   * Rules given inline through the <code>parsefilter.regex.rules</code>
+   * property take precedence over any file. Otherwise the rule file is, in
+   * order of preference, the file passed to the constructor, the
+   * <code>file</code> attribute declared for this plugin in plugin.xml, or the
+   * <code>parsefilter.regex.file</code> property. The file is first looked up
+   * as a configuration resource and, failing that, opened as a plain file.
+   * <p>
+   * If no rule source is configured at all, an error is logged and no rules
+   * are loaded. I/O failures while reading are logged as well.
+   *
+   * @param conf the configuration to store and read the settings from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // look up the "file" attribute declared for this plugin in plugin.xml
+    String pluginName = "parsefilter-regex";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      HtmlParseFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // the constructor-supplied file and the attribute "file" take precedence
+    // over the configured property if defined
+    String file = conf.get("parsefilter.regex.file");
+    String stringRules = conf.get("parsefilter.regex.rules");
+    if (regexFile != null) {
+      file = regexFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    if (stringRules == null && file == null) {
+      // without this guard the null file name would surface as a
+      // NullPointerException further down
+      LOG.error("No rules configured for plugin " + pluginName
+        + ": neither inline rules nor a rule file are defined");
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+    finally {
+      // release the underlying file or resource stream once the rules are read
+      if (reader != null) {
+        try {
+          reader.close();
+        }
+        catch (IOException e) {
+          LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e));
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the configuration most recently handed to
+   * {@link #setConf(Configuration)}, or <code>null</code> if none was set.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+  
+  /**
+   * Tells whether the pattern occurs anywhere in the given value.
+   *
+   * @param value the string to search; may be <code>null</code>
+   * @param pattern the compiled pattern to look for
+   * @return <code>true</code> if the pattern is found at least once,
+   *         <code>false</code> if it is not found or the value is
+   *         <code>null</code>
+   */
+  private boolean matches(String value, Pattern pattern) {
+    if (value == null) {
+      return false;
+    }
+
+    return pattern.matcher(value).find();
+  }
+  
+  /**
+   * Reads rule definitions from the given reader into the shared rule map.
+   * <p>
+   * Each rule occupies one line of the form
+   * <code>&lt;name&gt;\t&lt;source&gt;\t&lt;regex&gt;</code>. Blank lines and
+   * lines whose first non-whitespace character is <code>#</code> are ignored.
+   * Lines with fewer than three tab-separated columns are logged and skipped.
+   * Nothing is read if the rule map already contains rules.
+   *
+   * @param configReader the source of the rule definitions; it is not closed
+   *          by this method
+   * @throws IOException if reading from the reader fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    // the rule map is static and shared by all instances, so lock on the map
+    // itself rather than on this instance
+    synchronized (rules) {
+      if (!rules.isEmpty()) {
+        return;
+      }
+
+      String line;
+      BufferedReader reader = new BufferedReader(configReader);
+      while ((line = reader.readLine()) != null) {
+        if (StringUtils.isBlank(line)) {
+          continue;
+        }
+
+        // trim before the comment check so indented comments are recognised
+        line = line.trim();
+        if (line.startsWith("#")) {
+          continue;
+        }
+
+        String[] parts = line.split("\t");
+        if (parts.length < 3) {
+          LOG.error("Skipping malformed regex rule, expected <name>\\t<source>\\t<regex>: "
+            + line);
+          continue;
+        }
+
+        String field = parts[0].trim();
+        String source = parts[1].trim();
+        String regex = parts[2].trim();
+
+        rules.put(field, new RegexRule(source, regex));
+      }
+    }
+  }
+  
+  /**
+   * A single rule: the name of the source it applies to and its compiled
+   * pattern.
+   */
+  private static class RegexRule {
+    /** Source name as read from the rule definition. */
+    final String source;
+    /** Pattern compiled from the rule's regular expression. */
+    final Pattern regex;
+
+    /**
+     * @param source name of the source the rule applies to
+     * @param regex regular expression, compiled on construction
+     */
+    public RegexRule(String source, String regex) {
+      this.source = source;
+      this.regex = Pattern.compile(regex);
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
new file mode 100644
index 0000000..f8f46ee
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * RegexParseFilter. For every configured rule, a parse metadata field is set
+ * to "true" if the rule's regular expression matches its source (either the
+ * raw HTML or the extracted text) and to "false" otherwise.
+ */
+package org.apache.nutch.parsefilter.regex;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
new file mode 100644
index 0000000..9bd7149
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.regex;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import junit.framework.TestCase;
+
+/**
+ * Tests for {@link RegexParseFilter} using the sample rule file
+ * <code>regex-parsefilter.txt</code>, which defines the rules
+ * <code>first</code> (html, regex <code>h1</code>) and <code>second</code>
+ * (text, regex <code>blablabla</code>).
+ */
+public class TestRegexParseFilter extends TestCase {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+  private final static String URL = "http://nutch.apache.org/";
+
+  /**
+   * Runs the filter over a page with the given raw HTML and extracted text.
+   *
+   * @param html raw page content, encoded as UTF-8 for the filter
+   * @param text extracted text handed to the filter as the parse text
+   * @return the parse metadata after filtering
+   */
+  private Metadata applyFilter(String html, String text) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
+    RegexParseFilter filter = new RegexParseFilter(file);
+    filter.setConf(conf);
+
+    Content content = new Content(URL, URL, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
+    Parse parse = new ParseImpl(text, new ParseData());
+
+    ParseResult result = ParseResult.createParseResult(URL, parse);
+    filter.filter(content, result, null, null);
+
+    return parse.getData().getParseMeta();
+  }
+
+  /** Both rules match: the HTML contains "h1" and the text contains "blablabla". */
+  public void testPositiveFilter() throws Exception {
+    Metadata meta = applyFilter(
+      "<html><body><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>",
+      "nutch this is the extracted text blablabla");
+
+    assertEquals("true", meta.get("first"));
+    assertEquals("true", meta.get("second"));
+  }
+
+  /** Neither rule matches: the HTML has no "h1" and the text has no "blablabla". */
+  public void testNegativeFilter() throws Exception {
+    Metadata meta = applyFilter(
+      "<html><body><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>",
+      "nutch this is the extracted text no bla");
+
+    assertEquals("false", meta.get("first"));
+    assertEquals("false", meta.get("second"));
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
new file mode 100644
index 0000000..9d15cd8
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
@@ -0,0 +1,10 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to true if the source matches the regex
+# and to false otherwise. The source can either be html or text. If source is
+# html, the regex is applied to the raw HTML content. If source is text, the
+# regex is applied to the extracted text.
+#
+# format: <name>\t<source>\t<regex>\n
+first	html	h1
+second	text	blablabla

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin.dtd
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin.dtd b/nutch-plugins/plugin.dtd
new file mode 100644
index 0000000..9b67da7
--- /dev/null
+++ b/nutch-plugins/plugin.dtd
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ! Licensed to the Apache Software Foundation (ASF) under one or more
+ ! contributor license agreements.  See the NOTICE file distributed with
+ ! this work for additional information regarding copyright ownership.
+ ! The ASF licenses this file to You under the Apache License, Version 2.0
+ ! (the "License"); you may not use this file except in compliance with
+ ! the License.  You may obtain a copy of the License at
+ !
+ !     http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an "AS IS" BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !
+ !
+ !  Document   : plugin.dtd
+ !  Created on : 14 avril 2006, 22:14
+ !  Author     : Chris Mattmann, Jerome Charron
+ !  Description: Nutch plug-in manifest DTD
+ !
+ !  PUBLIC ID  : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN
+ !  SYSTEM ID  : http://lucene.apache.org/nutch/plugin.dtd
+-->
+
+
+
+<!--
+ ! The <plugin> element defines the body of the manifest.
+ ! It optionally contains definitions for the plug-in runtime,
+ ! definitions of other plug-ins required by this one,
+ ! declarations of any new extension points being introduced by the plug-in,
+ ! as well as configuration of functional extensions
+ ! (configured into extension points defined by other plug-ins,
+ ! or introduced by this plug-in).
+ !-->
+<!ELEMENT plugin (runtime?, requires?, extension-point*, extension*)>
+
+<!-- A user displayable name for the plug-in -->
+<!ATTLIST plugin name CDATA #REQUIRED>
+
+<!-- 
+ ! A unique identifier for the plug-in.
+ ! To minimize potential for naming collisions,
+ ! the identifier should be derived from the internet domain id
+ ! of the supplying provider (reversing the domain name tokens and
+ ! appending additional name tokens separated by dot [.]).
+ ! For example, provider nutch.org could define plug-in identifier
+ ! org.nutch.myplugin
+ !-->
+<!ATTLIST plugin id CDATA #REQUIRED>
+
+<!--
+ ! The plug-in version number.
+ ! NOTE : Version number compatibility is not yet implemented.
+ !-->
+<!ATTLIST plugin version CDATA #REQUIRED>
+
+<!-- The user-displayable name of the provider supplying the plug-in. -->
+<!ATTLIST plugin provider-name CDATA #IMPLIED>
+
+<!--
+ ! The name of the plug-in class for this plug-in.
+ ! The class must be a subclass of org.apache.nutch.plugin.Plugin
+ !-->
+<!ATTLIST plugin class CDATA #IMPLIED>
+
+
+<!-- 
+ ! The <requires> section of the manifest declares
+ ! any dependencies on other plug-ins.
+ !-->
+<!ELEMENT requires (import+)>
+
+
+<!-- Each dependency is specified using an <import> element. -->
+<!ELEMENT import EMPTY>
+
+<!-- The identifier of the required plug-in. -->
+<!ATTLIST import plugin CDATA #REQUIRED>
+
+
+<!--
+ ! The <runtime> section of the manifest contains a definition of one or more
+ ! libraries that make up the plug-in runtime.
+ ! The referenced libraries are used by the plugin execution mechanisms
+ ! (the plug-in class loader) to load and execute the correct code required by
+ ! the plug-in.
+ !-->
+<!ELEMENT runtime (library+)>
+
+
+<!--
+ ! The <library> elements collectively define the plug-in runtime.
+ ! At least one <library> must be specified.
+ !-->
+<!ELEMENT library (export*)>
+
+<!--
+ ! A string reference to a library file or directory containing classes
+ ! (relative to the plug-in install directory).
+ ! Directory references must contain trailing file separator.
+ !-->
+<!ATTLIST library name CDATA #REQUIRED>
+
+
+<!--
+ ! Each <library> element can specify which portion
+ ! of the library should be exported.
+ ! The export rules are specified as a set of export masks.
+ ! By default (no export rules specified),
+ ! the library is considered to be private.
+ ! Each export mask is specified using the name attribute.
+ !-->
+<!ELEMENT export EMPTY>
+
+<!--
+ ! The export mask can have the following values:
+ !   * - indicates all contents of library are exported (public)
+ !   package.name.* - indicates all classes in the specified package
+ !                    are exported. The matching rules are the same as in the
+ !                    Java import statement.
+ !   package.name.ClassName - fully qualified java class name
+ !
+ ! NOTE : export mask is not yet implemented in Nutch.
+ !-->
+<!ATTLIST export name CDATA #REQUIRED>
+
+
+<!--
+ ! Nutch's architecture is based on the notion of configurable extension points.
+ ! Nutch itself predefines a set of extension points that cover the task of
+ ! extending it (for example, adding parser, indexing filter, ...).
+ ! In addition to the predefined extension points, each supplied plug-in can
+ ! declare additional extension points. By declaring an extension point the
+ ! plug-in is essentially advertising the ability to configure the plug-in
+ ! function with externally supplied extensions.
+ !-->
+<!ELEMENT extension-point EMPTY>
+
+<!-- A user-displayable name for the extension point. -->
+<!ATTLIST extension-point name CDATA #REQUIRED>
+
+<!-- A simple id, unique within this plug-in -->
+<!ATTLIST extension-point id CDATA #REQUIRED>
+
+
+<!--
+ ! Actual extensions are configured into extension points
+ ! (predefined, or newly declared in this plug-in) in the <extension> section.
+ !
+ ! The configuration information is specified by at least one implementation
+ ! with some parameters.
+ !-->
+<!ELEMENT extension (implementation+)>
+
+<!-- 
+ ! A reference to an extension point being configured.
+ ! The extension point can be one defined in this plug-in or another plug-in.
+ !-->
+<!ATTLIST extension point CDATA #REQUIRED>
+
+<!--
+ ! Optional identifier for this extension point configuration instance.
+ ! This is used by extension points that need to uniquely identify
+ ! (rather than just enumerate) the specific configured extensions.
+ ! The identifier is specified as a simple token unique within the definition
+ ! of the declaring plug-in. When used globally, the extension identifier
+ ! is qualified by the plug-in identifier.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension id CDATA #IMPLIED>
+
+<!--
+ ! A user-displayable name for the extension.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension name CDATA #IMPLIED>
+
+
+<!--
+ ! Defines a specific implementation for the extension.
+ ! This implementation can define some special name/value parameters
+ ! used at runtime.
+ !-->
+<!ELEMENT implementation (parameter*)>
+
+<!-- A unique identifier for this implementation -->
+<!ATTLIST implementation id CDATA #REQUIRED>
+
+<!-- The fully-qualified Java Class that implements this extension-point -->
+<!ATTLIST implementation class CDATA #REQUIRED>
+
+
+<!-- Defines a name/value parameter -->
+<!ELEMENT parameter EMPTY>
+
+<!-- The parameter's name (should be unique for an extension) -->
+<!ATTLIST parameter name CDATA #REQUIRED>
+
+<!-- The parameter's value -->
+<!ATTLIST parameter value CDATA #REQUIRED> 
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin/pom.xml b/nutch-plugins/plugin/pom.xml
new file mode 100644
index 0000000..2ac06ee
--- /dev/null
+++ b/nutch-plugins/plugin/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>plugin</artifactId>
+    <packaging>jar</packaging>
+
+    <name>plugin</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
new file mode 100644
index 0000000..e07f487
--- /dev/null
+++ b/nutch-plugins/pom.xml
@@ -0,0 +1,164 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-parent</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-plugins</artifactId>
+    <packaging>pom</packaging>
+
+    <name>nutch-plugins</name>
+    <url>http://nutch.apache.org</url>
+
+    <modules>
+        <!--<module>indexer-solr</module>-->
+        <module>creativecommons</module>
+        <module>feed</module>
+        <module>headings</module>
+        <module>index-anchor</module>
+        <module>index-basic</module>
+        <module>index-geoip</module>
+        <module>index-links</module>
+        <module>index-metadata</module>
+        <module>index-more</module>
+        <module>index-replace</module>
+        <module>index-static</module>
+        <module>indexer-cloudsearch</module>
+        <module>indexer-dummy</module>
+        <module>indexer-elastic</module>
+        <module>indexer-solr</module>
+        <module>language-identifier</module>
+        <module>lib-htmlunit</module>
+        <module>lib-http</module>
+        <module>lib-nekohtml</module>
+        <module>lib-regex-filter</module>
+        <module>lib-selenium</module>
+        <module>lib-xml</module>
+        <module>microformats-reltag</module>
+        <module>mimetype-filter</module>
+        <module>nutch-extensionpoints</module>
+        <module>parse-ext</module>
+        <module>parse-html</module>
+        <module>parse-js</module>
+        <module>parse-metatags</module>
+        <module>parse-replace</module>
+        <module>parse-swf</module>
+        <module>parse-tika</module>
+        <module>parse-zip</module>
+        <module>parsefilter-naivebayes</module>
+        <module>parsefilter-regex</module>
+        <module>plugin</module>
+        <module>protocol-file</module>
+        <module>protocol-ftp</module>
+        <module>protocol-htmlunit</module>
+        <module>protocol-http</module>
+        <module>protocol-httpclient</module>
+        <module>protocol-interactiveselenium</module>
+        <module>protocol-selenium</module>
+        <module>scoring-depth</module>
+        <module>scoring-link</module>
+        <module>scoring-opic</module>
+        <module>scoring-similarity</module>
+        <module>subcollection</module>
+        <module>tld</module>
+        <module>urlfilter-automaton</module>
+        <module>urlfilter-domain</module>
+        <module>urlfilter-domainblacklist</module>
+        <module>urlfilter-ignoreexempt</module>
+        <module>urlfilter-prefix</module>
+        <module>urlfilter-regex</module>
+        <module>urlfilter-suffix</module>
+        <module>urlfilter-validator</module>
+        <module>urlmeta</module>
+        <module>urlnormalizer-ajax</module>
+        <module>urlnormalizer-basic</module>
+        <module>urlnormalizer-host</module>
+        <module>urlnormalizer-pass</module>
+        <module>urlnormalizer-protocol</module>
+        <module>urlnormalizer-querystring</module>
+        <module>urlnormalizer-regex</module>
+        <module>urlnormalizer-slash</module>
+    </modules>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <!-- Note : an additional level is for the child modules (defined ahead in hierarchy)-->
+        <dir.root>..${file.separator}..${file.separator}</dir.root>
+        <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>nutch-core</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>nutch-core</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
+        </dependency>
+    </dependencies>
+    <build>
+        <finalName>${project.artifactId}</finalName>
+        <plugins>
+            <plugin>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.0.1</version>
+                <executions>
+                    <execution>
+                        <id>copy-resources</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy-resources</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${libs.dir}</outputDirectory>
+                            <resources>
+                                <resource>
+                                    <directory>${project.build.directory}</directory>
+                                    <include>${build.finalName}.jar</include>
+                                </resource>
+                                <resource>
+                                    <directory>${project.basedir}</directory>
+                                    <include>plugin.xml</include>
+                                </resource>
+                            </resources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.19.1</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-failsafe-plugin</artifactId>
+                <version>2.19.1</version>
+            </plugin>
+        </plugins>
+    </build>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/build.xml b/nutch-plugins/protocol-file/build.xml
new file mode 100644
index 0000000..121b1fe
--- /dev/null
+++ b/nutch-plugins/protocol-file/build.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-file" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  
+ <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.txt"/>
+    </fileset>
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/ivy.xml b/nutch-plugins/protocol-file/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/protocol-file/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/plugin.xml b/nutch-plugins/protocol-file/plugin.xml
new file mode 100644
index 0000000..1647ce4
--- /dev/null
+++ b/nutch-plugins/protocol-file/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-file"
+   name="File Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="protocol-file.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.file"
+              name="FileProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.file.File"
+                      class="org.apache.nutch.protocol.file.File">
+        <parameter name="protocolName" value="file"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/pom.xml b/nutch-plugins/protocol-file/pom.xml
new file mode 100644
index 0000000..2ab2f75
--- /dev/null
+++ b/nutch-plugins/protocol-file/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-file</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-file</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
new file mode 100644
index 0000000..2712218
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
@@ -0,0 +1,228 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.util.NutchConfiguration;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * Protocol plugin used for the {@code file:} scheme. For every request it
+ * creates a {@link FileResponse} object and converts the response code of
+ * that object into a {@link ProtocolOutput}.
+ * <p>
+ * Configurable parameters are {@code file.content.limit},
+ * {@code file.crawl.parent} and {@code file.crawl.redirect_noncanonical}
+ * (see the "file properties" section of nutch-default.xml).
+ * 
+ * @author John Xing
+ */
+public class File implements Protocol {
+
+  public static final Logger LOG = LoggerFactory.getLogger(File.class);
+
+  /** Maximum number of redirects followed internally before giving up. */
+  static final int MAX_REDIRECTS = 5;
+
+  /** Content is truncated to this many bytes; a negative value disables it. */
+  int maxContentLength;
+
+  /** Whether directory listings contain a link to the parent directory. */
+  boolean crawlParents;
+
+  /**
+   * if true return a redirect for symbolic links and do not resolve the links
+   * internally
+   */
+  boolean symlinksAsRedirects = true;
+
+  private Configuration conf;
+
+  public File() {
+  }
+
+  /**
+   * Set the {@link Configuration} object and read the plugin settings from
+   * it: {@code file.content.limit} (default 64 kB), {@code file.crawl.parent}
+   * (default true) and {@code file.crawl.redirect_noncanonical} (default
+   * true).
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+    this.symlinksAsRedirects = conf.getBoolean(
+        "file.crawl.redirect_noncanonical", true);
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Set the length at which content is truncated.
+   */
+  public void setMaxContentLength(int maxContentLength) {
+    this.maxContentLength = maxContentLength;
+  }
+
+  /**
+   * Creates a {@link FileResponse} object corresponding to the url and return a
+   * {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url
+   *          Text containing the url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the content of the file indicated
+   *         by url; if anything goes wrong the output carries no content and
+   *         a {@link ProtocolStatus} built from the exception
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    String urlString = url.toString();
+    try {
+      URL u = new URL(urlString);
+
+      int redirects = 0;
+
+      while (true) {
+        // make a request
+        FileResponse response = new FileResponse(u, datum, this, getConf());
+
+        int code = response.getCode();
+
+        if (code == 200) { // got a good response
+          return new ProtocolOutput(response.toContent()); // return it
+
+        } else if (code == 304) { // got not modified
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTMODIFIED);
+
+        } else if (code == 401) { // access denied / no read permissions
+          return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+              ProtocolStatus.ACCESS_DENIED));
+
+        } else if (code == 404) { // no such file
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTFOUND);
+
+        } else if (code >= 300 && code < 400) { // handle redirect
+          u = new URL(response.getHeader("Location"));
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("redirect to " + u);
+          }
+          if (symlinksAsRedirects) {
+            // hand the redirect back to the caller instead of following it
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.MOVED, u));
+          } else if (redirects == MAX_REDIRECTS) {
+            LOG.trace("Too many redirects: {}", url);
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.REDIR_EXCEEDED, u));
+          }
+          redirects++;
+
+        } else { // convert to exception
+          throw new FileError(code);
+        }
+      }
+    } catch (Exception e) {
+      // report through the plugin logger rather than dumping to stderr
+      LOG.warn("Failed to fetch " + urlString, e);
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /**
+   * Quick way for running this class. Useful for debugging.
+   * <p>
+   * The url must be the last argument. {@code -logLevel} is accepted together
+   * with its value, but the value is currently not applied.
+   */
+  public static void main(String[] args) throws Exception {
+    int maxContentLength = Integer.MIN_VALUE;
+    boolean dumpContent = false;
+    String urlString = null;
+
+    String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      String arg = args[i];
+      if (arg.equals("-logLevel") || arg.equals("-maxContentLength")) {
+        // both options need a value; bail out if it is missing
+        if (i + 1 >= args.length) {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+        String value = args[++i];
+        if (arg.equals("-maxContentLength")) {
+          maxContentLength = Integer.parseInt(value);
+        }
+      } else if (arg.equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        urlString = arg;
+      }
+    }
+
+    // only options were given, there is nothing to fetch
+    if (urlString == null) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    File file = new File();
+    file.setConf(NutchConfiguration.create());
+
+    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+      file.setMaxContentLength(maxContentLength);
+
+    ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
+        new CrawlDatum());
+    Content content = output.getContent();
+
+    // a failed fetch yields no content; report the status only
+    if (content == null) {
+      System.err.println("Status: " + output.getStatus());
+      return;
+    }
+
+    System.err.println("URL: " + content.getUrl());
+    System.err.println("Status: " + output.getStatus());
+    System.err.println("Content-Type: " + content.getContentType());
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
+    String redirectLocation = content.getMetadata().get("Location");
+    if (redirectLocation != null) {
+      System.err.println("Location: " + redirectLocation);
+    }
+
+    if (dumpContent) {
+      System.out.print(new String(content.getContent()));
+    }
+  }
+
+  /**
+   * No robots parsing is done for file protocol. So this returns a set of empty
+   * rules which will allow every url.
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return RobotRulesParser.EMPTY_RULES;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
new file mode 100644
index 0000000..4fef340
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+/**
+ * Thrown for File error codes. The numeric code is part of the exception
+ * message and can be read back through {@link #getCode()}.
+ */
+public class FileError extends FileException {
+
+  /** Code this error was created with. */
+  private final int code;
+
+  /**
+   * Creates an error for the given code.
+   * 
+   * @param code
+   *          the error code to report
+   */
+  public FileError(int code) {
+    super("File Error: " + code);
+    this.code = code;
+  }
+
+  /**
+   * Returns the code this error was created with.
+   * 
+   * @return the code passed to the constructor
+   */
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * Legacy accessor kept so that existing callers still compile. It used to
+   * return its own argument because the parameter shadowed the field; it now
+   * returns the stored code.
+   * 
+   * @param code
+   *          ignored
+   * @return the code passed to the constructor
+   * @deprecated use {@link #getCode()} instead
+   */
+  @Deprecated
+  public int getCode(int code) {
+    return this.code;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
new file mode 100644
index 0000000..f0467de
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+/**
+ * Base exception of the file protocol plugin. It adds nothing to
+ * {@link ProtocolException}; every constructor simply forwards its arguments
+ * to the matching constructor of the super class.
+ */
+public class FileException extends ProtocolException {
+
+  /**
+   * Creates an exception with both a detail message and a cause.
+   * 
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the underlying cause
+   */
+  public FileException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates an exception that wraps a cause.
+   * 
+   * @param cause
+   *          the underlying cause
+   */
+  public FileException(Throwable cause) {
+    super(cause);
+  }
+
+  /**
+   * Creates an exception with a detail message.
+   * 
+   * @param message
+   *          the detail message
+   */
+  public FileException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception without detail message or cause. */
+  public FileException() {
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
new file mode 100644
index 0000000..b6e74ff
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -0,0 +1,317 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// JDK imports
+import java.net.URL;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
+// Tika imports
+import org.apache.tika.Tika;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ * 
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably okay for now. Could be buggy here. How about special files on
+ * windows?
+ * 
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ * 
+ * (4) No fancy POSIX file attributes yet. May never need?
+ * 
+ * @author John Xing
+ */
+public class FileResponse {
+
+  private static final byte[] EMPTY_CONTENT = new byte[0];
+
+  /** String form of the requested url, used as url and base of the content. */
+  private final String orig;
+  private final String base;
+
+  /** Bytes of the file or directory listing; stays null for non-200 codes. */
+  private byte[] content;
+
+  /** Http-like response code (200, 300, 304, 401, 404 or 500). */
+  private int code;
+  private final Metadata headers = new Metadata();
+
+  /** Owning protocol plugin, consulted for its content limit and flags. */
+  private final File file;
+  private final Configuration conf;
+
+  /** Used to detect the mime type of regular files. */
+  private final Tika tika;
+
+  /** Returns the response code. */
+  public int getCode() {
+    return code;
+  }
+
+  /** Returns the value of a named header. */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /** Returns the raw content, or null if no content was captured. */
+  public byte[] getContent() {
+    return content;
+  }
+
+  /**
+   * Wraps this response into a {@link Content} object. Missing content is
+   * replaced by an empty byte array.
+   */
+  public Content toContent() {
+    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
+        getHeader(Response.CONTENT_TYPE), headers, this.conf);
+  }
+
+  /**
+   * Resolves the url against the local file system and fills in response
+   * code, headers and content accordingly.
+   * 
+   * @param url
+   *          the url to fetch; its protocol must be {@code file}
+   * @param datum
+   *          crawl datum of the url; its modified time is compared against
+   *          the file's to answer with 304
+   * @param file
+   *          the protocol plugin issuing the request
+   * @param conf
+   *          configuration handed on to the created {@link Content}
+   * @throws FileException
+   *           if the url is not a file url or the file is too large
+   * @throws IOException
+   *           if the file or directory cannot be read
+   */
+  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
+      throws FileException, IOException {
+
+    this.orig = url.toString();
+    this.base = url.toString();
+    this.file = file;
+    this.conf = conf;
+    this.tika = new Tika();
+
+    if (!"file".equals(url.getProtocol()))
+      throw new FileException("Not a file url:" + url);
+
+    if (File.LOG.isTraceEnabled()) {
+      File.LOG.trace("fetching " + url);
+    }
+
+    // compare the strings by value: getFile() is the path plus any query
+    if (!url.getPath().equals(url.getFile())) {
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("url.getPath() != url.getFile(): " + url);
+      }
+    }
+
+    String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+    try {
+      // specify the encoding via the config later?
+      path = java.net.URLDecoder.decode(path, "UTF-8");
+    } catch (UnsupportedEncodingException ignored) {
+      // cannot happen: every Java platform is required to support UTF-8
+    }
+
+    this.content = null;
+
+    // url.toURI() is only in j2se 1.5.0
+    // java.io.File f = new java.io.File(url.toURI());
+    java.io.File f = new java.io.File(path);
+
+    if (!f.exists()) {
+      this.code = 404; // http Not Found
+      return;
+    }
+
+    if (!f.canRead()) {
+      this.code = 401; // http Unauthorized
+      return;
+    }
+
+    // symbolic link or relative path on unix
+    // fix me: what's the consequence on windows platform
+    // where case is insensitive
+    if (!f.equals(f.getCanonicalFile())) {
+      // Going through toURI() automatically escapes characters that are
+      // illegal in URLs, which is the recommended way of turning an abstract
+      // pathname into a URL.
+      headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
+          .toString());
+
+      this.code = 300; // http redirect
+      return;
+    }
+
+    if (f.lastModified() <= datum.getModifiedTime()) {
+      this.code = 304;
+      this.headers.set("Last-Modified",
+          HttpDateFormat.toString(f.lastModified()));
+      return;
+    }
+
+    if (f.isDirectory()) {
+      getDirAsHttpResponse(f);
+    } else if (f.isFile()) {
+      getFileAsHttpResponse(f);
+    } else {
+      this.code = 500; // http Internal Server Error
+    }
+  }
+
+  /**
+   * Reads a regular file into {@link #content} (truncated to the plugin's
+   * content limit) and sets headers and response code 200.
+   * 
+   * @param f
+   *          the file to read
+   * @throws FileException
+   *           if the file is larger than {@link Integer#MAX_VALUE} bytes
+   * @throws IOException
+   *           if reading the file or detecting its type fails
+   */
+  private void getFileAsHttpResponse(java.io.File f) throws FileException,
+      IOException {
+
+    // ignore file of size larger than
+    // Integer.MAX_VALUE = 2^31-1 = 2147483647
+    long size = f.length();
+    if (size > Integer.MAX_VALUE) {
+      throw new FileException("file is too large, size: " + size);
+      // or we can do this?
+      // this.code = 400; // http Bad request
+      // return;
+    }
+
+    // capture content
+    int len = (int) size;
+
+    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
+      len = this.file.maxContentLength;
+
+    byte[] buffer = new byte[len];
+    int offset = 0;
+
+    java.io.InputStream is = new java.io.FileInputStream(f);
+    try {
+      int n;
+      while (offset < len
+          && (n = is.read(buffer, offset, len - offset)) >= 0) {
+        offset += n;
+      }
+    } finally {
+      // release the file handle even if reading failed
+      is.close();
+    }
+
+    if (offset < len) { // keep whatever already have, but issue a warning
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("not enough bytes read from file: " + f.getPath());
+      }
+      // drop the unread tail so the content does not end in zero padding
+      buffer = java.util.Arrays.copyOf(buffer, offset);
+    }
+    this.content = buffer;
+
+    // set headers; Content-Length reports the file size, not the number of
+    // (possibly truncated) bytes captured
+    headers.set(Response.CONTENT_LENGTH, Long.toString(size));
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    String mimeType = tika.detect(f);
+
+    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * Renders a directory listing as html into {@link #content} and sets
+   * headers and response code 200.
+   * 
+   * @param f
+   *          the directory to list
+   * @throws IOException
+   *           if the directory cannot be listed
+   */
+  private void getDirAsHttpResponse(java.io.File f) throws IOException {
+
+    String path = f.toString();
+
+    // listFiles() returns null (not an empty array) when listing fails
+    java.io.File[] entries = f.listFiles();
+    if (entries == null) {
+      throw new IOException("Cannot list directory: " + path);
+    }
+
+    // link to the parent only if enabled and we are not at the root already
+    boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
+    this.content = list2html(entries, path, includeDotDot);
+
+    // set headers
+    headers.set(Response.CONTENT_LENGTH,
+        Integer.toString(this.content.length));
+    headers.set(Response.CONTENT_TYPE, "text/html");
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * generate html page from dir list
+   * 
+   * @param list
+   *          the directory entries to render
+   * @param path
+   *          path of the directory, shown in title and heading
+   * @param includeDotDot
+   *          whether to emit a link to the parent directory
+   * @return the html page as bytes in the platform default charset
+   */
+  private byte[] list2html(java.io.File[] list, String path,
+      boolean includeDotDot) {
+
+    StringBuilder x = new StringBuilder("<html><head>");
+    x.append("<title>Index of " + path + "</title></head>\n");
+    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
+
+    if (includeDotDot) {
+      x.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    // fix me: we might want to sort list here! but not now.
+    // NOTE(review): names are written unescaped into href and text, so names
+    // containing quotes, '<', '&' or '%' yield broken links - confirm whether
+    // escaping is wanted before changing the generated page.
+
+    for (java.io.File f : list) {
+      String name = f.getName();
+      String time = HttpDateFormat.toString(f.lastModified());
+      if (f.isDirectory()) {
+        // java 1.4.2 api says dir itself and parent dir are not listed
+        // so there is no need to skip "." and ".." here.
+        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
+        x.append(time + "\t-\n");
+      } else if (f.isFile()) {
+        x.append("<a href='" + name + "'>" + name + "</a>\t");
+        x.append(time + "\t" + f.length() + "\n");
+      }
+      // anything that is neither directory nor regular file is ignored
+    }
+
+    x.append("</pre></body></html>\n");
+
+    return x.toString().getBytes();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
new file mode 100644
index 0000000..221c79c
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving local file resources.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
new file mode 100644
index 0000000..5f95377
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for the {@link File} protocol: fetches sample files (plain name,
+ * literal parentheses, percent-encoded parentheses) from the directory given
+ * by the <code>test.data</code> system property (default <code>.</code>) and
+ * checks that each is reported as <code>text/plain</code>.
+ *
+ * @author mattmann
+ * @version $Revision$
+ */
+public class TestProtocolFile {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private static final String[] testTextFiles = new String[] {
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
+      "testprotocolfile_%28encoded%29.txt" };
+
+  private static final CrawlDatum datum = new CrawlDatum();
+
+  private static final String expectedMimeType = "text/plain";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+  }
+
+  @Test
+  public void testSetContentType() throws ProtocolException {
+    for (String testTextFile : testTextFiles) {
+      setContentType(testTextFile);
+    }
+  }
+
+  /**
+   * Fetches <code>testTextFile</code> from the sample directory via a
+   * <code>file:</code> URL; asserts a SUCCESS status and that the content type
+   * and the <code>Response.CONTENT_TYPE</code> metadata are text/plain.
+   * @since NUTCH-384
+   */
+  public void setContentType(String testTextFile) throws ProtocolException {
+    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
+    Assert.assertNotNull(urlString); // concatenation result, never null
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
+        datum);
+    Assert.assertNotNull(output);
+    Assert.assertEquals("Status code: [" + output.getStatus().getCode()
+        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
+        + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output
+        .getStatus().getCode());
+    Assert.assertNotNull(output.getContent());
+    Assert.assertNotNull(output.getContent().getContentType());
+    Assert.assertEquals(expectedMimeType, output.getContent().getContentType());
+    Assert.assertNotNull(output.getContent().getMetadata());
+    Assert.assertEquals(expectedMimeType, output.getContent().getMetadata()
+        .get(Response.CONTENT_TYPE));
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
@@ -0,0 +1 @@
+Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
@@ -0,0 +1 @@
+Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/build.xml b/nutch-plugins/protocol-ftp/build.xml
new file mode 100644
index 0000000..79314d4
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-ftp" default="jar-core">
+  <!-- Defines no targets of its own; the default "jar-core" target must come from the import below. -->
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/ivy.xml b/nutch-plugins/protocol-ftp/ivy.xml
new file mode 100644
index 0000000..214c445
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+  <!-- NOTE(review): from nutch-plugins/protocol-ftp, three levels up lies outside the repository root; confirm this path. -->
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- No name given: the artifact is named after this module. -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/plugin.xml b/nutch-plugins/protocol-ftp/plugin.xml
new file mode 100644
index 0000000..1421e37
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-ftp"
+   name="Ftp Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- Registers org.apache.nutch.protocol.ftp.Ftp at the Protocol extension point with protocolName "ftp". -->
+   <runtime>
+      <library name="protocol-ftp.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-net-1.2.2.jar"/> <!-- same version as rev in this plugin's ivy.xml -->
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.ftp"
+              name="FtpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.ftp.Ftp"
+                      class="org.apache.nutch.protocol.ftp.Ftp">
+        <parameter name="protocolName" value="ftp"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/pom.xml b/nutch-plugins/protocol-ftp/pom.xml
new file mode 100644
index 0000000..fe9a61b
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <!-- groupId and version are not set here; they are inherited from the nutch-plugins parent below. -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-ftp</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-ftp</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <!-- NOTE(review): no dependencies declared although this plugin's ivy.xml lists commons-net 1.2.2; presumably supplied by the parent POM, confirm. -->
+</project>