You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:50 UTC

[06/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/plugin.dtd
----------------------------------------------------------------------
diff --git a/src/plugin/plugin.dtd b/src/plugin/plugin.dtd
deleted file mode 100644
index 9b67da7..0000000
--- a/src/plugin/plugin.dtd
+++ /dev/null
@@ -1,206 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- ! Licensed to the Apache Software Foundation (ASF) under one or more
- ! contributor license agreements.  See the NOTICE file distributed with
- ! this work for additional information regarding copyright ownership.
- ! The ASF licenses this file to You under the Apache License, Version 2.0
- ! (the "License"); you may not use this file except in compliance with
- ! the License.  You may obtain a copy of the License at
- !
- !     http://www.apache.org/licenses/LICENSE-2.0
- !
- ! Unless required by applicable law or agreed to in writing, software
- ! distributed under the License is distributed on an "AS IS" BASIS,
- ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ! See the License for the specific language governing permissions and
- ! limitations under the License.
- !
- !
- !  Document   : plugin.dtd
- !  Created on : 14 avril 2006, 22:14
- !  Author     : Chris Mattmann, Jerome Charron
- !  Description: Nutch plug-in manifest DTD
- !
- !  PUBLIC ID  : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN
- !  SYSTEM ID  : http://lucene.apache.org/nutch/plugin.dtd
--->
-
-
-
-<!--
- ! The <plugin> element defines the body of the manifest.
- ! It optionally contains definitions for the plug-in runtime,
- ! definitions of other plug-ins required by this one,
- ! declarations of any new extension points being introduced by the plug-in,
- ! as well as configuration of functional extensions
- ! (configured into extension points defined by other plug-ins,
- ! or introduced by this plug-in).
- !-->
-<!ELEMENT plugin (runtime?, requires?, extension-point*, extension*)>
-
-<!-- A user displayable name for the plug-in -->
-<!ATTLIST plugin name CDATA #REQUIRED>
-
-<!-- 
- ! A unique identifier for the plug-in.
- ! To minimize potential for naming collisions,
- ! the identifier should be derived from the internet domain id
- ! of the supplying provider (reversing the domain name tokens and
- ! appending additional name tokens separated by dot [.]).
- ! For example, provider nutch.org could define plug-in identifier
- ! org.nutch.myplugin
- !-->
-<!ATTLIST plugin id CDATA #REQUIRED>
-
-<!--
- ! The plug-in version number.
- ! NOTE : Version number compatibility is not yet implemented.
- !-->
-<!ATTLIST plugin version CDATA #REQUIRED>
-
-<!-- The user-displayable name of the provider supplying the plug-in. -->
-<!ATTLIST plugin provider-name CDATA #IMPLIED>
-
-<!--
- ! The name of the plug-in class for this plug-in.
- ! The class must be a subclass of org.apache.nutch.plugin.Plugin
- !-->
-<!ATTLIST plugin class CDATA #IMPLIED>
-
-
-<!-- 
- ! The <requires> section of the manifest declares
- ! any dependencies on other plug-ins.
- !-->
-<!ELEMENT requires (import+)>
-
-
-<!-- Each dependency is specified using an <import> element. -->
-<!ELEMENT import EMPTY>
-
-<!-- The identifier of the required plug-in. -->
-<!ATTLIST import plugin CDATA #REQUIRED>
-
-
-<!--
- ! The <runtime> section of the manifest contains a definition of one or more
- ! libraries that make up the plug-in runtime.
- ! The referenced libraries are used by the plugin execution mechanisms
- ! (the plug-in class loader) to load and execute the correct code required by
- ! the plug-in.
- !-->
-<!ELEMENT runtime (library+)>
-
-
-<!--
- ! The <library> elements collectively define the plug-in runtime.
- ! At least one <library> must be specified.
- !-->
-<!ELEMENT library (export*)>
-
-<!--
- ! A string reference to a library file or directory containing classes
- ! (relative to the plug-in install directory).
- ! Directory references must contain trailing file separator.
- !-->
-<!ATTLIST library name CDATA #REQUIRED>
-
-
-<!--
- ! Each <library> element can specify which portion
- ! of the library should be exported.
- ! The export rules are specified as a set of export masks.
- ! By default (no export rules specified),
- ! the library is considered to be private.
- ! Each export mask is specified using the name attribute.
- !-->
-<!ELEMENT export EMPTY>
-
-<!--
- ! The export mask can have the following values:
- !   * - indicates all contents of library are exported (public)
- !   package.name.* - indicates all classes in the specified package
- !                    are exported. The matching rules are the same as in the
- !                    Java import statement.
- !   package.name.ClassName - fully qualified java class name
- !
- ! NOTE : export mask is not yet implemented in Nutch.
- !-->
-<!ATTLIST export name CDATA #REQUIRED>
-
-
-<!--
- ! Nutch's architecture is based on the notion of configurable extension points.
- ! Nutch itself predefines a set of extension points that cover the task of
- ! extending it (for example, adding parser, indexing filter, ...).
- ! In addition to the predefined extension points, each supplied plug-in can
- ! declare additional extension points. By declaring an extension point the
- ! plug-in is essentially advertising the ability to configure the plug-in
- ! function with externally supplied extensions.
- !-->
-<!ELEMENT extension-point EMPTY>
-
-<!-- A user-displayable name for the extension point. -->
-<!ATTLIST extension-point name CDATA #REQUIRED>
-
-<!-- A simple id, unique within this plug-in -->
-<!ATTLIST extension-point id CDATA #REQUIRED>
-
-
-<!--
- ! Actual extensions are configured into extension points
- ! (predefined, or newly declared in this plug-in) in the <extension> section.
- !
- ! The configuration information is specified by at least one implementation
- ! with some parameters.
- !-->
-<!ELEMENT extension (implementation+)>
-
-<!-- 
- ! A reference to an extension point being configured.
- ! The extension point can be one defined in this plug-in or another plug-in.
- !-->
-<!ATTLIST extension point CDATA #REQUIRED>
-
-<!--
- ! Optional identifier for this extension point configuration instance.
- ! This is used by extension points that need to uniquely identify
- ! (rather than just enumerate) the specific configured extensions.
- ! The identifier is specified as a simple token unique within the definition
- ! of the declaring plug-in. When used globally, the extension identifier
- ! is qualified by the plug-in identifier.
- ! FIXME : Seems it is never read in the code.
- !-->
-<!ATTLIST extension id CDATA #IMPLIED>
-
-<!--
- ! A user-displayable name for the extension.
- ! FIXME : Seems it is never read in the code.
- !-->
-<!ATTLIST extension name CDATA #IMPLIED>
-
-
-<!--
- ! Defines a specific implementation for the extension.
- ! This implementation can define some special name/value parameters
- ! used at runtime.
- !-->
-<!ELEMENT implementation (parameter*)>
-
-<!-- A unique identifier for this implementation -->
-<!ATTLIST implementation id CDATA #REQUIRED>
-
-<!-- The fully-qualified Java Class that implements this extension-point -->
-<!ATTLIST implementation class CDATA #REQUIRED>
-
-
-<!-- Defines a name/value parameter -->
-<!ELEMENT parameter EMPTY>
-
-<!-- The parameter's name (should be unique for an extension) -->
-<!ATTLIST parameter name CDATA #REQUIRED>
-
-<!-- The parameter's value -->
-<!ATTLIST parameter value CDATA #REQUIRED> 
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/build.xml b/src/plugin/protocol-file/build.xml
deleted file mode 100644
index 121b1fe..0000000
--- a/src/plugin/protocol-file/build.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-file" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-  
- <!-- for junit test -->
-  <mkdir dir="${build.test}/data"/>
-  <copy todir="${build.test}/data">
-    <fileset dir="sample">
-      <include name="*.txt"/>
-    </fileset>
-  </copy>
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/ivy.xml b/src/plugin/protocol-file/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/protocol-file/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/plugin.xml b/src/plugin/protocol-file/plugin.xml
deleted file mode 100644
index 1647ce4..0000000
--- a/src/plugin/protocol-file/plugin.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="protocol-file"
-   name="File Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="protocol-file.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.protocol.file"
-              name="FileProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
-      <implementation id="org.apache.nutch.protocol.file.File"
-                      class="org.apache.nutch.protocol.file.File">
-        <parameter name="protocolName" value="file"/>
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/sample/testprotocolfile.txt
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/sample/testprotocolfile.txt b/src/plugin/protocol-file/sample/testprotocolfile.txt
deleted file mode 100644
index fbe8a8a..0000000
--- a/src/plugin/protocol-file/sample/testprotocolfile.txt
+++ /dev/null
@@ -1 +0,0 @@
-Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt b/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
deleted file mode 100644
index fbe8a8a..0000000
--- a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
+++ /dev/null
@@ -1 +0,0 @@
-Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
deleted file mode 100644
index 2712218..0000000
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ /dev/null
@@ -1,228 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.file;
-
-import java.net.URL;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRulesParser;
-import org.apache.nutch.util.NutchConfiguration;
-
-import crawlercommons.robots.BaseRobotRules;
-
-/**
- * Protocol plugin for the {@code file:} scheme. For every url it creates a
- * {@link FileResponse} and maps the http-like response code of that object to
- * a {@link ProtocolOutput}. Configurable parameters are
- * {@code file.content.limit} and {@code file.crawl.parent} in
- * nutch-default.xml defined under "file properties" section, plus
- * {@code file.crawl.redirect_noncanonical} (all read in
- * {@link #setConf(Configuration)}).
- * 
- * @author John Xing
- */
-public class File implements Protocol {
-
-  public static final Logger LOG = LoggerFactory.getLogger(File.class);
-
-  /** Number of redirects followed internally before giving up. */
-  static final int MAX_REDIRECTS = 5;
-
-  /**
-   * Upper bound (in bytes) for the content read from a file; a negative value
-   * disables the limit (see {@link FileResponse}).
-   */
-  int maxContentLength;
-
-  /** Value of {@code file.crawl.parent}; consumed by {@link FileResponse}. */
-  boolean crawlParents;
-
-  /**
-   * if true return a redirect for symbolic links and do not resolve the links
-   * internally
-   */
-  boolean symlinksAsRedirects = true;
-
-  private Configuration conf;
-
-  public File() {
-  }
-
-  /**
-   * Set the {@link Configuration} object and read the plugin settings
-   * {@code file.content.limit} (default 64 kB), {@code file.crawl.parent}
-   * (default true) and {@code file.crawl.redirect_noncanonical} (default
-   * true) from it.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
-    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
-    this.symlinksAsRedirects = conf.getBoolean(
-        "file.crawl.redirect_noncanonical", true);
-  }
-
-  /**
-   * Get the {@link Configuration} object
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Set the length after which content is truncated.
-   */
-  public void setMaxContentLength(int maxContentLength) {
-    this.maxContentLength = maxContentLength;
-  }
-
-  /**
-   * Creates a {@link FileResponse} object corresponding to the url and return a
-   * {@link ProtocolOutput} object as per the content received
-   * 
-   * @param url
-   *          Text containing the url
-   * @param datum
-   *          The CrawlDatum object corresponding to the url
-   * 
-   * @return {@link ProtocolOutput} object for the content of the file indicated
-   *         by url; if any exception occurs the output carries no content and
-   *         a {@link ProtocolStatus} built from that exception
-   */
-  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
-    String urlString = url.toString();
-    try {
-      URL u = new URL(urlString);
-
-      int redirects = 0;
-
-      while (true) {
-        // make a request
-        FileResponse response = new FileResponse(u, datum, this, getConf());
-
-        int code = response.getCode();
-
-        if (code == 200) { // got a good response
-          return new ProtocolOutput(response.toContent()); // return it
-
-        } else if (code == 304) { // got not modified
-          return new ProtocolOutput(response.toContent(),
-              ProtocolStatus.STATUS_NOTMODIFIED);
-
-        } else if (code == 401) { // access denied / no read permissions
-          return new ProtocolOutput(response.toContent(), new ProtocolStatus(
-              ProtocolStatus.ACCESS_DENIED));
-
-        } else if (code == 404) { // no such file
-          return new ProtocolOutput(response.toContent(),
-              ProtocolStatus.STATUS_NOTFOUND);
-
-        } else if (code >= 300 && code < 400) { // handle redirect
-          u = new URL(response.getHeader("Location"));
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("redirect to " + u);
-          }
-          if (symlinksAsRedirects) {
-            // hand the redirect back to the caller instead of following it
-            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
-                ProtocolStatus.MOVED, u));
-          } else if (redirects >= MAX_REDIRECTS) {
-            LOG.trace("Too many redirects: {}", url);
-            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
-                ProtocolStatus.REDIR_EXCEEDED, u));
-          }
-          redirects++;
-
-        } else { // convert to exception
-          throw new FileError(code);
-        }
-      }
-    } catch (Exception e) {
-      // report through the plugin logger rather than dumping the stack trace
-      // to stderr, where it bypasses the logging configuration
-      LOG.error("Failed to fetch " + urlString, e);
-      return new ProtocolOutput(null, new ProtocolStatus(e));
-    }
-  }
-
-  /**
-   * Quick way for running this class. Useful for debugging.
-   * <p>
-   * The url must be the last argument. {@code -logLevel} is still accepted so
-   * existing command lines keep working, but its value is not applied.
-   */
-  public static void main(String[] args) throws Exception {
-    int maxContentLength = Integer.MIN_VALUE;
-    boolean dumpContent = false;
-    String urlString = null;
-
-    String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    for (int i = 0; i < args.length; i++) {
-      boolean takesValue = args[i].equals("-logLevel")
-          || args[i].equals("-maxContentLength");
-      if (takesValue && i + 1 >= args.length) {
-        // option given as last argument: its value is missing
-        System.err.println(usage);
-        System.exit(-1);
-      }
-
-      if (args[i].equals("-logLevel")) {
-        i++; // skip the level, it is currently not applied
-      } else if (args[i].equals("-maxContentLength")) {
-        maxContentLength = Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-dumpContent")) {
-        dumpContent = true;
-      } else if (i != args.length - 1) {
-        System.err.println(usage);
-        System.exit(-1);
-      } else
-        urlString = args[i];
-    }
-
-    if (urlString == null) {
-      // only options were given, e.g. "-dumpContent" as the last argument
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    File file = new File();
-    file.setConf(NutchConfiguration.create());
-
-    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
-      file.setMaxContentLength(maxContentLength);
-
-    ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
-        new CrawlDatum());
-    Content content = output.getContent();
-
-    if (content == null) {
-      // getProtocolOutput() returns no content when the fetch threw an
-      // exception; only the status can be reported then
-      System.err.println("Status: " + output.getStatus());
-      return;
-    }
-
-    System.err.println("URL: " + content.getUrl());
-    System.err.println("Status: " + output.getStatus());
-    System.err.println("Content-Type: " + content.getContentType());
-    System.err.println("Content-Length: "
-        + content.getMetadata().get(Response.CONTENT_LENGTH));
-    System.err.println("Last-Modified: "
-        + content.getMetadata().get(Response.LAST_MODIFIED));
-    String redirectLocation = content.getMetadata().get("Location");
-    if (redirectLocation != null) {
-      System.err.println("Location: " + redirectLocation);
-    }
-
-    if (dumpContent) {
-      System.out.print(new String(content.getContent()));
-    }
-  }
-
-  /**
-   * No robots parsing is done for file protocol. So this returns a set of empty
-   * rules which will allow every url.
-   */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return RobotRulesParser.EMPTY_RULES;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
deleted file mode 100644
index 4fef340..0000000
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.file;
-
-/**
- * Thrown for File error codes, i.e. response codes of a
- * {@link FileResponse} that have no dedicated protocol status. The code is
- * part of the exception message and available via {@link #getCode()}.
- */
-public class FileError extends FileException {
-
-  /** The response code this error was created for. */
-  private final int code;
-
-  /**
-   * Creates an error with the message {@code "File Error: " + code}.
-   * 
-   * @param code
-   *          the response code that caused the error
-   */
-  public FileError(int code) {
-    super("File Error: " + code);
-    this.code = code;
-  }
-
-  /**
-   * Returns the response code this error was created for.
-   */
-  public int getCode() {
-    return this.code;
-  }
-
-  /**
-   * Returns the response code this error was created for. The parameter
-   * shadowed the field and used to be echoed back, so the stored code could
-   * never be read; it is ignored now.
-   * 
-   * @param code
-   *          ignored
-   * @deprecated use {@link #getCode()} instead
-   */
-  @Deprecated
-  public int getCode(int code) {
-    return this.code;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
deleted file mode 100644
index f0467de..0000000
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.file;
-
-import org.apache.nutch.protocol.ProtocolException;
-
-/**
- * Exception type of the file protocol plugin. It adds no state of its own:
- * every constructor hands its arguments unchanged to the matching
- * {@link ProtocolException} constructor.
- */
-public class FileException extends ProtocolException {
-
-  /** Creates an exception without message and without cause. */
-  public FileException() {
-    super();
-  }
-
-  /**
-   * @param message
-   *          description of the failure
-   */
-  public FileException(String message) {
-    super(message);
-  }
-
-  /**
-   * @param cause
-   *          the underlying failure
-   */
-  public FileException(Throwable cause) {
-    super(cause);
-  }
-
-  /**
-   * @param message
-   *          description of the failure
-   * @param cause
-   *          the underlying failure
-   */
-  public FileException(String message, Throwable cause) {
-    super(message, cause);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
deleted file mode 100644
index b6e74ff..0000000
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ /dev/null
@@ -1,317 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.file;
-
-// JDK imports
-import java.net.URL;
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-
-// Tika imports
-import org.apache.tika.Tika;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-/************************************
- * FileResponse.java mimics file replies as http response. It tries its best to
- * follow http's way for headers, response codes as well as exceptions.
- * 
- * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
- * scheme. However they are not flexible enough, so not used in this
- * implementation.
- * 
- * (2) java.io.File is used for its abstractness across platforms. Warning:
- * java.io.File API (1.4.2) does not elaborate on how special files, such as
- * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
- * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
- * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
- * probably okay for now. Could be buggy here. How about special files on
- * windows?
- * 
- * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
- * are just treated as individual files.
- * 
- * (4) No fancy POSIX file attributes yet. May never need?
- * 
- * @author John Xing
- ***********************************/
-public class FileResponse {
-
-  private String orig;
-  private String base;
-  private byte[] content;
-  private static final byte[] EMPTY_CONTENT = new byte[0];
-  private int code;
-  private Metadata headers = new Metadata();
-
-  private final File file;
-  private Configuration conf;
-
-  private MimeUtil MIME;
-  private Tika tika;
-
-  /**
-   * Returns the http-like response code that was assigned while this response
-   * was built.
-   */
-  public int getCode() {
-    return this.code;
-  }
-
-  /**
-   * Looks up a header of this response.
-   * 
-   * @param name
-   *          name of the header
-   * @return whatever the header metadata holds under that name
-   */
-  public String getHeader(String name) {
-    return this.headers.get(name);
-  }
-
-  /**
-   * Returns the captured content bytes as stored, without copying. This is
-   * {@code null} when no content was captured.
-   */
-  public byte[] getContent() {
-    return this.content;
-  }
-
-  public Content toContent() {
-    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
-        getHeader(Response.CONTENT_TYPE), headers, this.conf);
-  }
-
-  /**
-   * Builds the response for a {@code file:} url by inspecting the local file
-   * system. The response code is set as follows:
-   * <ul>
-   * <li>404 - the path does not exist</li>
-   * <li>401 - the path cannot be read</li>
-   * <li>300 - the path is not canonical (e.g. symbolic link or relative
-   * path); the canonical location is put into the Location header</li>
-   * <li>304 - the file was not modified after the modified time of the
-   * datum</li>
-   * <li>500 - the path is neither a directory nor a regular file</li>
-   * </ul>
-   * Otherwise the directory listing or the file content is captured.
-   * 
-   * @param url
-   *          the url to fetch, must use the {@code file} protocol
-   * @param datum
-   *          crawl datum of the url, its modified time is compared with the
-   *          last modification of the file
-   * @param file
-   *          the protocol plugin instance providing the settings
-   * @param conf
-   *          configuration passed on to {@link MimeUtil} and {@link Content}
-   * @throws FileException
-   *           if the url does not use the {@code file} protocol
-   * @throws IOException
-   *           if accessing the file system fails
-   */
-  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
-      throws FileException, IOException {
-
-    this.orig = url.toString();
-    this.base = url.toString();
-    this.file = file;
-    this.conf = conf;
-
-    MIME = new MimeUtil(conf);
-    tika = new Tika();
-
-    if (!"file".equals(url.getProtocol()))
-      throw new FileException("Not a file url:" + url);
-
-    if (File.LOG.isTraceEnabled()) {
-      File.LOG.trace("fetching " + url);
-    }
-
-    // compare the values: != on Strings only compares object identity
-    if (!url.getPath().equals(url.getFile())) {
-      if (File.LOG.isWarnEnabled()) {
-        File.LOG.warn("url.getPath() != url.getFile(): " + url);
-      }
-    }
-
-    String path = "".equals(url.getPath()) ? "/" : url.getPath();
-
-    try {
-      // specify the encoding via the config later?
-      path = java.net.URLDecoder.decode(path, "UTF-8");
-    } catch (UnsupportedEncodingException ignored) {
-      // not expected, UTF-8 is a standard charset of the Java platform;
-      // the undecoded path is used should it happen nevertheless
-    }
-
-    this.content = null;
-
-    // url.toURI() is only in j2se 1.5.0
-    // java.io.File f = new java.io.File(url.toURI());
-    java.io.File f = new java.io.File(path);
-
-    if (!f.exists()) {
-      this.code = 404; // http Not Found
-      return;
-    }
-
-    if (!f.canRead()) {
-      this.code = 401; // http Unauthorized
-      return;
-    }
-
-    // symbolic link or relative path on unix
-    // fix me: what's the consequence on windows platform
-    // where case is insensitive
-    if (!f.equals(f.getCanonicalFile())) {
-      // set headers
-      //
-      // we want to automatically escape characters that are illegal in URLs.
-      // It is recommended that new code convert an abstract pathname into a
-      // URL by first converting it into a URI, via the toURI method, and then
-      // converting the URI into a URL via the URI.toURL method.
-      headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
-          .toString());
-
-      this.code = 300; // http redirect
-      return;
-    }
-
-    if (f.lastModified() <= datum.getModifiedTime()) {
-      this.code = 304; // http Not Modified
-      this.headers.set("Last-Modified",
-          HttpDateFormat.toString(f.lastModified()));
-      return;
-    }
-
-    if (f.isDirectory()) {
-      getDirAsHttpResponse(f);
-    } else if (f.isFile()) {
-      getFileAsHttpResponse(f);
-    } else {
-      this.code = 500; // http Internal Server Error
-    }
-
-  }
-
-  /**
-   * Reads a regular file into <code>this.content</code> and fills in the
-   * response headers and status code as if the file had been served over HTTP.
-   * <p>
-   * When <code>file.maxContentLength</code> is non-negative, at most that many
-   * bytes are kept. The Content-Length header always reports the full size of
-   * the file, not the number of bytes kept.
-   * 
-   * @param f
-   *          the file to read
-   * @throws FileException
-   *           if the file is larger than {@link Integer#MAX_VALUE} bytes
-   * @throws IOException
-   *           if the file cannot be opened or read
-   */
-  private void getFileAsHttpResponse(java.io.File f) throws FileException,
-      IOException {
-
-    // a byte[] cannot hold more than Integer.MAX_VALUE = 2^31-1 bytes
-    long size = f.length();
-    if (size > Integer.MAX_VALUE) {
-      throw new FileException("file is too large, size: " + size);
-      // or we can do this?
-      // this.code = 400; // http Bad request
-      // return;
-    }
-
-    // capture content, truncated to the configured limit if there is one
-    int len = (int) size;
-
-    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
-      len = this.file.maxContentLength;
-
-    this.content = new byte[len];
-
-    int offset = 0;
-    java.io.InputStream is = new java.io.FileInputStream(f);
-    try {
-      int n;
-      while (offset < len
-          && (n = is.read(this.content, offset, len - offset)) >= 0) {
-        offset += n;
-      }
-    } finally {
-      // release the file handle even when read() throws
-      is.close();
-    }
-
-    if (offset < len) { // keep whatever already have, but issue a warning
-      if (File.LOG.isWarnEnabled()) {
-        File.LOG.warn("not enough bytes read from file: " + f.getPath());
-      }
-    }
-
-    // set headers
-    headers.set(Response.CONTENT_LENGTH, Long.toString(size));
-    headers.set(Response.LAST_MODIFIED,
-        HttpDateFormat.toString(f.lastModified()));
-
-    String mimeType = tika.detect(f);
-
-    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
-
-    // response code
-    this.code = 200; // http OK
-  }
-
-  /**
-   * Turns a directory into an HTML index page and stores it, together with
-   * the matching headers and a 200 status code, in this response.
-   * <p>
-   * A link to the parent directory is emitted only when
-   * <code>file.crawlParents</code> is set and the directory is not "/".
-   * 
-   * @param f
-   *          the directory to list
-   * @throws IOException
-   *           declared for the caller; nothing in this method body throws it
-   *           directly
-   */
-  private void getDirAsHttpResponse(java.io.File f) throws IOException {
-
-    String path = f.toString();
-    boolean linkToParent = this.file.crawlParents && !"/".equals(path);
-    this.content = list2html(f.listFiles(), path, linkToParent);
-
-    // describe the generated page
-    headers.set(Response.CONTENT_LENGTH,
-        Integer.toString(this.content.length));
-    headers.set(Response.CONTENT_TYPE, "text/html");
-    headers.set(Response.LAST_MODIFIED,
-        HttpDateFormat.toString(f.lastModified()));
-
-    this.code = 200; // http OK
-  }
-
-  /**
-   * Renders a directory listing as a minimal HTML index page.
-   * <p>
-   * Each sub-directory and each regular file becomes one anchor line followed
-   * by its last-modified time and, for files, its length. Entries that are
-   * neither a directory nor a regular file are left out.
-   * 
-   * @param list
-   *          entries of the directory; may be <code>null</code>, which is what
-   *          {@link java.io.File#listFiles()} returns when the directory
-   *          cannot be read, and then an index without entries is produced
-   * @param path
-   *          directory path shown in the page title and heading
-   * @param includeDotDot
-   *          whether to emit a link to the parent directory
-   * @return the page as bytes in the platform default charset
-   */
-  private byte[] list2html(java.io.File[] list, String path,
-      boolean includeDotDot) {
-
-    StringBuilder x = new StringBuilder("<html><head>");
-    x.append("<title>Index of " + path + "</title></head>\n");
-    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
-
-    if (includeDotDot) {
-      x.append("<a href='../'>../</a>\t-\t-\t-\n");
-    }
-
-    // fix me: we might want to sort list here! but not now.
-
-    // listFiles() yields null on an I/O error; treat that as an empty dir
-    // instead of failing with a NullPointerException
-    if (list != null) {
-      // NOTE(review): names are written into the markup without HTML
-      // escaping, exactly as before
-      for (java.io.File f : list) {
-        String name = f.getName();
-        String time = HttpDateFormat.toString(f.lastModified());
-        if (f.isDirectory()) {
-          // "." and ".." are never part of the listing, so no filter needed
-          x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
-          x.append(time + "\t-\n");
-        } else if (f.isFile()) {
-          x.append("<a href='" + name + "'>" + name + "</a>\t");
-          x.append(time + "\t" + f.length() + "\n");
-        }
-        // anything else (neither directory nor regular file) is ignored
-      }
-    }
-
-    x.append("</pre></body></html>\n");
-
-    return x.toString().getBytes();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html
deleted file mode 100644
index 221c79c..0000000
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving local file resources.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
deleted file mode 100644
index 5f95377..0000000
--- a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.file;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Unit tests for the {@link File} protocol plugin.
- * 
- * @author mattmann
- * @version $Revision$
- */
-public class TestProtocolFile {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  /** Sample files, including names with literal and percent-encoded parens. */
-  private static final String[] SAMPLE_FILES = new String[] {
-      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
-      "testprotocolfile_%28encoded%29.txt" };
-
-  private static final CrawlDatum DATUM = new CrawlDatum();
-
-  private static final String EXPECTED_MIME_TYPE = "text/plain";
-
-  private Configuration conf;
-
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-  }
-
-  /** Runs the content type check once for every sample file. */
-  @Test
-  public void testSetContentType() throws ProtocolException {
-    for (int i = 0; i < SAMPLE_FILES.length; i++) {
-      setContentType(SAMPLE_FILES[i]);
-    }
-  }
-
-  /**
-   * Fetches one sample file through the protocol plugin and checks that the
-   * fetch succeeds and that <code>Response.CONTENT_TYPE</code> is reported as
-   * plain text, both on the content and in its metadata.
-   * 
-   * @param testTextFile
-   *          file name relative to the test data directory
-   * @since NUTCH-384
-   */
-  public void setContentType(String testTextFile) throws ProtocolException {
-    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
-    Assert.assertNotNull(urlString);
-
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Text url = new Text(urlString);
-    ProtocolOutput output = protocol.getProtocolOutput(url, DATUM);
-    Assert.assertNotNull(output);
-
-    ProtocolStatus status = output.getStatus();
-    String failure = "Status code: [" + status.getCode()
-        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
-        + status.getArgs() + "]";
-    Assert.assertEquals(failure, ProtocolStatus.SUCCESS, status.getCode());
-
-    Assert.assertNotNull(output.getContent());
-    Assert.assertNotNull(output.getContent().getContentType());
-    Assert.assertEquals(EXPECTED_MIME_TYPE, output.getContent()
-        .getContentType());
-    Assert.assertNotNull(output.getContent().getMetadata());
-    Assert.assertEquals(EXPECTED_MIME_TYPE, output.getContent().getMetadata()
-        .get(Response.CONTENT_TYPE));
-
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/build.xml b/src/plugin/protocol-ftp/build.xml
deleted file mode 100644
index 79314d4..0000000
--- a/src/plugin/protocol-ftp/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-ftp" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml
deleted file mode 100644
index 214c445..0000000
--- a/src/plugin/protocol-ftp/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/plugin.xml b/src/plugin/protocol-ftp/plugin.xml
deleted file mode 100644
index 1421e37..0000000
--- a/src/plugin/protocol-ftp/plugin.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="protocol-ftp"
-   name="Ftp Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="protocol-ftp.jar">
-         <export name="*"/>
-      </library>
-      <library name="commons-net-1.2.0-dev.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.protocol.ftp"
-              name="FtpProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
-      <implementation id="org.apache.nutch.protocol.ftp.Ftp"
-                      class="org.apache.nutch.protocol.ftp.Ftp">
-        <parameter name="protocolName" value="ftp"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
deleted file mode 100644
index da25d87..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ /dev/null
@@ -1,595 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-
-import java.net.InetAddress;
-import java.net.Socket;
-
-import java.util.List;
-//import java.util.LinkedList;
-
-import org.apache.commons.net.MalformedServerReplyException;
-
-import org.apache.commons.net.ftp.FTP;
-import org.apache.commons.net.ftp.FTPCommand;
-import org.apache.commons.net.ftp.FTPFile;
-import org.apache.commons.net.ftp.FTPFileEntryParser;
-import org.apache.commons.net.ftp.FTPReply;
-
-import org.apache.commons.net.ftp.FTPConnectionClosedException;
-
-/***********************************************
- * Client.java encapsulates functionalities necessary for nutch to get dir list
- * and retrieve file from an FTP server. This class takes care of all low level
- * details of interacting with an FTP server and provides a convenient higher
- * level interface.
- * 
- * Modified from FtpClient.java in apache commons-net.
- * 
- * Notes by John Xing: ftp server implementations are hardly uniform and none
- * seems to follow RFCs whole-heartedly. We have no choice, but assume common
- * denominator as following: (1) Use stream mode for data transfer. Block mode
- * will be better for multiple file downloading and partial file downloading.
- * However not every ftpd has block mode support. (2) Use passive mode for data
- * connection. So Nutch will work if we run behind firewall. (3) Data connection
- * is opened/closed per ftp command for the reasons listed in (1). There are ftp
- * servers out there, when partial downloading is enforced by closing data
- * channel socket on our client side, the server side immediately closes control
- * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used
- * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but
- * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single
- * thread? Do not use it at all.
- * 
- * About exceptions: Some specific exceptions are re-thrown as one of
- * FtpException*.java In fact, each function throws FtpException*.java or pass
- * IOException.
- * 
- * @author John Xing
- ***********************************************/
-
-public class Client extends FTP {
-  private int __dataTimeout;
-  private int __passivePort;
-  private String __passiveHost;
-  // private int __fileType, __fileFormat;
-  private boolean __remoteVerificationEnabled;
-  // private FTPFileEntryParser __entryParser;
-  private String __systemName;
-
-  /**
-   * Creates a client with no data timeout (-1), remote verification switched
-   * on, and all per-connection state reset to its defaults.
-   */
-  public Client() {
-    this.__dataTimeout = -1;
-    this.__remoteVerificationEnabled = true;
-    __initDefaults();
-  }
-
-  // Resets the per-connection state: no cached system name and no passive
-  // host/port. File type, file format and entry parser are not tracked here.
-  private void __initDefaults() {
-    this.__systemName = null;
-    this.__passiveHost = null;
-    this.__passivePort = -1;
-  }
-
-  // parse reply for pass()
-  private void __parsePassiveModeReply(String reply)
-      throws MalformedServerReplyException {
-    int i, index, lastIndex;
-    String octet1, octet2;
-    StringBuffer host;
-
-    reply = reply.substring(reply.indexOf('(') + 1, reply.indexOf(')')).trim();
-
-    host = new StringBuffer(24);
-    lastIndex = 0;
-    index = reply.indexOf(',');
-    host.append(reply.substring(lastIndex, index));
-
-    for (i = 0; i < 3; i++) {
-      host.append('.');
-      lastIndex = index + 1;
-      index = reply.indexOf(',', lastIndex);
-      host.append(reply.substring(lastIndex, index));
-    }
-
-    lastIndex = index + 1;
-    index = reply.indexOf(',', lastIndex);
-
-    octet1 = reply.substring(lastIndex, index);
-    octet2 = reply.substring(index + 1);
-
-    // index and lastIndex now used as temporaries
-    try {
-      index = Integer.parseInt(octet1);
-      lastIndex = Integer.parseInt(octet2);
-    } catch (NumberFormatException e) {
-      throw new MalformedServerReplyException(
-          "Could not parse passive host information.\nServer Reply: " + reply);
-    }
-
-    index <<= 8;
-    index |= lastIndex;
-
-    __passiveHost = host.toString();
-    __passivePort = index;
-  }
-
-  /**
-   * Opens a passive mode data connection and sends the command that is going
-   * to use it.
-   * <p>
-   * The socket is closed again on every path that does not hand it to the
-   * caller, including when sending the command or setting the timeout throws.
-   * 
-   * @param command
-   *          the FTP command to send once the data socket is connected
-   * @param arg
-   *          the argument of the command
-   * @return the connected data socket, or <code>null</code> if the server did
-   *         not answer the command with a positive preliminary reply
-   * @throws IOException
-   *           if an I/O error occurs on the control or data connection
-   * @throws FtpExceptionCanNotHaveDataConnection
-   *           if pasv() fails, its reply cannot be parsed, or remote
-   *           verification rejects the data connection
-   */
-  protected Socket __openPassiveDataConnection(int command, String arg)
-      throws IOException, FtpExceptionCanNotHaveDataConnection {
-
-    if (pasv() != FTPReply.ENTERING_PASSIVE_MODE)
-      throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. "
-          + getReplyString());
-
-    try {
-      __parsePassiveModeReply(getReplyStrings()[0]);
-    } catch (MalformedServerReplyException e) {
-      throw new FtpExceptionCanNotHaveDataConnection(e.getMessage());
-    }
-
-    // 20040317, xing: a retry loop for ill-behaved servers that announce the
-    // same passive port twice in a row was tried here and is disabled.
-
-    Socket socket = _socketFactory_.createSocket(__passiveHost, __passivePort);
-    boolean handedOver = false;
-    try {
-      if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) {
-        return null;
-      }
-
-      if (__remoteVerificationEnabled && !verifyRemote(socket)) {
-        InetAddress host1 = socket.getInetAddress();
-        InetAddress host2 = getRemoteAddress();
-
-        // our precaution
-        throw new FtpExceptionCanNotHaveDataConnection(
-            "Host attempting data connection " + host1.getHostAddress()
-                + " is not same as server " + host2.getHostAddress()
-                + " So we intentionally close it for security precaution.");
-      }
-
-      if (__dataTimeout >= 0)
-        socket.setSoTimeout(__dataTimeout);
-
-      handedOver = true;
-      return socket;
-    } finally {
-      if (!handedOver) {
-        try {
-          socket.close();
-        } catch (IOException ignored) {
-          // best effort: do not hide the outcome that led here
-        }
-      }
-    }
-  }
-
-  /**
-   * Sets the read timeout for data connections. A non-negative value is
-   * applied to each data socket right after it has been opened; a negative
-   * value leaves the socket timeout untouched.
-   * 
-   * @param timeout
-   *          the timeout in milliseconds
-   */
-  public void setDataTimeout(int timeout) {
-    this.__dataTimeout = timeout;
-  }
-
-  /**
-   * Resets the connection parameters to their default values and then closes
-   * the connection to the FTP server.
-   * 
-   * @exception IOException
-   *              If an error occurs while disconnecting.
-   */
-  public void disconnect() throws IOException {
-    // Data connections need no handling here: every ftp command that
-    // involves a data connection closes it itself.
-    this.__initDefaults();
-    super.disconnect();
-  }
-
-  /**
-   * Switches the check that the host on the other end of a data connection
-   * is the host the control connection is attached to. The check is enabled
-   * by default and the flag may be changed at any time, connected or not.
-   * 
-   * @param enable
-   *          True to enable verification, false to disable verification.
-   */
-  public void setRemoteVerificationEnabled(boolean enable) {
-    this.__remoteVerificationEnabled = enable;
-  }
-
-  /**
-   * Tells whether the remote host of a data connection is verified against
-   * the host of the control connection. Verification is on by default.
-   * 
-   * @return True if verification is enabled, false if not.
-   */
-  public boolean isRemoteVerificationEnabled() {
-    return this.__remoteVerificationEnabled;
-  }
-
-  /***
-   * Login to the FTP server using the provided username and password.
-   * <p>
-   * 
-   * @param username
-   *          The username to login under.
-   * @param password
-   *          The password to use.
-   * @return True if successfully completed, false if not.
-   * @exception FTPConnectionClosedException
-   *              If the FTP server prematurely closes the connection as a
-   *              result of the client being idle or some other reason causing
-   *              the server to send FTP reply code 421. This exception may be
-   *              caught either as an IOException or independently as itself.
-   * @exception IOException
-   *              If an I/O error occurs while either sending a command to the
-   *              server or receiving a reply from the server.
-   ***/
-  public boolean login(String username, String password) throws IOException {
-    user(username);
-
-    if (FTPReply.isPositiveCompletion(getReplyCode()))
-      return true;
-
-    // If we get here, we either have an error code, or an intermmediate
-    // reply requesting password.
-    if (!FTPReply.isPositiveIntermediate(getReplyCode()))
-      return false;
-
-    return FTPReply.isPositiveCompletion(pass(password));
-  }
-
-  /***
-   * Logout of the FTP server by sending the QUIT command.
-   * <p>
-   * 
-   * @return True if successfully completed, false if not.
-   * @exception FTPConnectionClosedException
-   *              If the FTP server prematurely closes the connection as a
-   *              result of the client being idle or some other reason causing
-   *              the server to send FTP reply code 421. This exception may be
-   *              caught either as an IOException or independently as itself.
-   * @exception IOException
-   *              If an I/O error occurs while either sending a command to the
-   *              server or receiving a reply from the server.
-   ***/
-  public boolean logout() throws IOException {
-    return FTPReply.isPositiveCompletion(quit());
-  }
-
-  /**
-   * Sends LIST for a path over a passive data connection and appends the
-   * parsed entries to <code>entries</code>.
-   * <p>
-   * The data socket is always closed before the final reply is read from the
-   * control connection, also when reading or parsing fails.
-   * 
-   * @param path
-   *          the remote path to list
-   * @param entries
-   *          receives one {@link FTPFile} per line the parser understands;
-   *          lines it cannot parse are skipped
-   * @param limit
-   *          download limit counted in characters of the accepted lines;
-   *          reading stops after the line that takes the total over the
-   *          limit. A negative value means no limit.
-   * @param parser
-   *          reads and parses the lines of the listing
-   * @throws IOException
-   *           if an I/O error occurs on the control or data connection
-   * @throws FtpExceptionCanNotHaveDataConnection
-   *           if no data connection could be opened for LIST
-   * @throws FtpExceptionUnknownForcedDataClose
-   *           if the server answers the closed data connection with a reply
-   *           this client does not accept
-   * @throws FtpExceptionControlClosedByForcedDataClose
-   *           if the server closes the control connection in response
-   */
-  public void retrieveList(String path, List<FTPFile> entries, int limit,
-      FTPFileEntryParser parser) throws IOException,
-      FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose,
-      FtpExceptionControlClosedByForcedDataClose {
-    Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path);
-
-    if (socket == null)
-      throw new FtpExceptionCanNotHaveDataConnection("LIST "
-          + ((path == null) ? "" : path));
-
-    try {
-      BufferedReader reader = new BufferedReader(new InputStreamReader(
-          socket.getInputStream()));
-
-      int count = 0;
-      String line;
-      while ((line = parser.readNextEntry(reader)) != null) {
-        FTPFile ftpFile = parser.parseFTPEntry(line);
-        // skip non-formatted lines
-        if (ftpFile == null) {
-          continue;
-        }
-        entries.add(ftpFile);
-        count += line.length();
-        // impose download limit if limit >= 0, otherwise no limit
-        // here, cut off is up to the line when total bytes is just over limit
-        if (limit >= 0 && count > limit) {
-          break;
-        }
-      }
-    } finally {
-      // always close the data channel, whether the listing was read
-      // completely, cut off at the limit, or reading failed.
-      // different ftp servers respond differently to an early close, see
-      // below.
-      socket.close();
-    }
-
-    try {
-      int reply = getReply();
-      if (!_notBadReply(reply))
-        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
-    } catch (FTPConnectionClosedException e) {
-      // some ftp servers will close control channel if data channel socket
-      // is closed by our end before all data has been read out. Check:
-      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
-      // so must catch FTPConnectionClosedException thrown by getReply() above
-      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
-    }
-
-  }
-
-  /**
-   * Sends RETR for a path over a passive data connection and copies the
-   * received bytes to <code>os</code>.
-   * <p>
-   * The data socket is always closed before the final reply is read from the
-   * control connection, also when reading or writing fails.
-   * 
-   * @param path
-   *          the remote file to retrieve
-   * @param os
-   *          receives the file content; it is flushed but not closed
-   * @param limit
-   *          download limit in bytes; exactly this many bytes are written
-   *          when the file is longer. A negative value means no limit.
-   * @throws IOException
-   *           if an I/O error occurs on the connections or on <code>os</code>
-   * @throws FtpExceptionCanNotHaveDataConnection
-   *           if no data connection could be opened for RETR
-   * @throws FtpExceptionUnknownForcedDataClose
-   *           if the server answers the closed data connection with a reply
-   *           this client does not accept
-   * @throws FtpExceptionControlClosedByForcedDataClose
-   *           if the server closes the control connection in response
-   */
-  public void retrieveFile(String path, OutputStream os, int limit)
-      throws IOException, FtpExceptionCanNotHaveDataConnection,
-      FtpExceptionUnknownForcedDataClose,
-      FtpExceptionControlClosedByForcedDataClose {
-
-    Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path);
-
-    if (socket == null)
-      throw new FtpExceptionCanNotHaveDataConnection("RETR "
-          + ((path == null) ? "" : path));
-
-    try {
-      InputStream input = socket.getInputStream();
-
-      // 20040318, xing, treat everything as BINARY_FILE_TYPE for now
-      // fixme, should we instruct server here for binary file type?
-
-      int len;
-      int count = 0;
-      byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE];
-      while ((len = input.read(buf, 0, buf.length)) != -1) {
-        count += len;
-        // impose download limit if limit >= 0, otherwise no limit
-        // here, cut off is exactly of limit bytes
-        if (limit >= 0 && count > limit) {
-          os.write(buf, 0, len - (count - limit));
-          // the truncated last chunk was previously left unflushed
-          os.flush();
-          break;
-        }
-        os.write(buf, 0, len);
-        os.flush();
-      }
-    } finally {
-      // always close the data channel, whether the file was read completely,
-      // cut off at the limit, or the copy failed.
-      // different ftp servers respond differently to an early close, see
-      // below. ABOR is deliberately not sent.
-      socket.close();
-    }
-
-    try {
-      int reply = getReply();
-      if (!_notBadReply(reply))
-        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
-    } catch (FTPConnectionClosedException e) {
-      // some ftp servers will close control channel if data channel socket
-      // is closed by our end before all data has been read out. Check:
-      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
-      // so must catch FTPConnectionClosedException thrown by getReply() above
-      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
-    }
-
-  }
-
-  /**
-   * Checks the reply received after the data connection has been closed.
-   * <p>
-   * Besides any positive completion reply, three codes that servers are known
-   * to send when the client closes the data channel early are accepted. No
-   * second reply is read for any of them.
-   * 
-   * @param reply
-   *          the reply code read from the control connection
-   * @return <code>true</code> if the reply is acceptable, <code>false</code>
-   *         for any other reply
-   */
-  private boolean _notBadReply(int reply) {
-
-    if (FTPReply.isPositiveCompletion(reply)) {
-      return true;
-    }
-
-    switch (reply) {
-    case 426: // FTPReply.TRANSFER_ABORTED, e.g. foggy FTP server (wu-2.6.2(2))
-    case 450: // FTPReply.FILE_ACTION_NOT_TAKEN, e.g. ProFTPD [ftp.kernel.org]
-    case 451: // FTPReply.ACTION_ABORTED, e.g. ProFTPD [ftp.kernel.org]
-      return true;
-    default:
-      // what other kind of ftp server out there?
-      return false;
-    }
-  }
-
-  /***
-   * Sets the file type to be transferred. This should be one of
-   * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>,
-   * etc. The file type only needs to be set when you want to change the type.
-   * After changing it, the new type stays in effect until you change it again.
-   * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method
-   * is never called.
-   * <p>
-   * 
-   * @param fileType
-   *          The <code> _FILE_TYPE </code> constant indcating the type of file.
-   * @return True if successfully completed, false if not.
-   * @exception FTPConnectionClosedException
-   *              If the FTP server prematurely closes the connection as a
-   *              result of the client being idle or some other reason causing
-   *              the server to send FTP reply code 421. This exception may be
-   *              caught either as an IOException or independently as itself.
-   * @exception IOException
-   *              If an I/O error occurs while either sending a command to the
-   *              server or receiving a reply from the server.
-   ***/
-  public boolean setFileType(int fileType) throws IOException {
-    if (FTPReply.isPositiveCompletion(type(fileType))) {
-      /*
-       * __fileType = fileType; __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;
-       */
-      return true;
-    }
-    return false;
-  }
-
-  /***
-   * Fetches the system type name from the server and returns the string. This
-   * value is cached for the duration of the connection after the first call to
-   * this method. In other words, only the first time that you invoke this
-   * method will it issue a SYST command to the FTP server. FTPClient will
-   * remember the value and return the cached value until a call to disconnect.
-   * <p>
-   * 
-   * @return The system type name obtained from the server. null if the
-   *         information could not be obtained.
-   * @exception FTPConnectionClosedException
-   *              If the FTP server prematurely closes the connection as a
-   *              result of the client being idle or some other reason causing
-   *              the server to send FTP reply code 421. This exception may be
-   *              caught either as an IOException or independently as itself.
-   * @exception IOException
-   *              If an I/O error occurs while either sending a command to the
-   *              server or receiving a reply from the server.
-   ***/
-  public String getSystemName() throws IOException, FtpExceptionBadSystResponse {
-    // if (syst() == FTPReply.NAME_SYSTEM_TYPE)
-    // Technically, we should expect a NAME_SYSTEM_TYPE response, but
-    // in practice FTP servers deviate, so we soften the condition to
-    // a positive completion.
-    if (__systemName == null && FTPReply.isPositiveCompletion(syst())) {
-      __systemName = (getReplyStrings()[0]).substring(4);
-    } else {
-      throw new FtpExceptionBadSystResponse("Bad response of SYST: "
-          + getReplyString());
-    }
-
-    return __systemName;
-  }
-
-  /***
-   * Sends a NOOP command to the FTP server. This is useful for preventing
-   * server timeouts.
-   * <p>
-   * 
-   * @return True if successfully completed, false if not.
-   * @exception FTPConnectionClosedException
-   *              If the FTP server prematurely closes the connection as a
-   *              result of the client being idle or some other reason causing
-   *              the server to send FTP reply code 421. This exception may be
-   *              caught either as an IOException or independently as itself.
-   * @exception IOException
-   *              If an I/O error occurs while either sending a command to the
-   *              server or receiving a reply from the server.
-   ***/
-  public boolean sendNoOp() throws IOException {
-    return FTPReply.isPositiveCompletion(noop());
-  }
-
-  // client.stat(path);
-  // client.sendCommand("STAT");
-  // client.sendCommand("STAT",path);
-  // client.sendCommand("MDTM",path);
-  // client.sendCommand("SIZE",path);
-  // client.sendCommand("HELP","SITE");
-  // client.sendCommand("SYST");
-  // client.setRestartOffset(120);
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
deleted file mode 100644
index 772f3bb..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ /dev/null
@@ -1,267 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.commons.net.ftp.FTPFileEntryParser;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import crawlercommons.robots.BaseRobotRules;
-
-import java.net.URL;
-
-import java.io.IOException;
-
-/**
- * This class is a protocol plugin used for ftp: scheme. It creates
- * {@link FtpResponse} object and gets the content of the url from it.
- * Configurable parameters are {@code ftp.username}, {@code ftp.password},
- * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout},
- * {@code ftp.keep.connection} and {@code ftp.follow.talk}. For details see
- * "FTP properties" section in {@code nutch-default.xml}.
- */
-public class Ftp implements Protocol {
-
-  public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
-
-  /** Value reported by {@link #getBufferSize()}. */
-  private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
-
-  /** Number of redirects followed before a request is given up. */
-  static final int MAX_REDIRECTS = 5;
-
-  // set from ftp.timeout or setTimeout(); presumably milliseconds, since
-  // main() multiplies its -timeout argument by 1000 -- TODO confirm
-  int timeout;
-
-  // set from ftp.content.limit or setMaxContentLength()
-  int maxContentLength;
-
-  // set from ftp.username / ftp.password
-  String userName;
-  String passWord;
-
-  // typical/default server timeout is 120*1000 millisec.
-  // better be conservative here
-  int serverTimeout;
-
-  // when to have client start anew
-  long renewalTime = -1;
-
-  boolean keepConnection;
-
-  boolean followTalk;
-
-  // ftp client
-  Client client = null;
-  // ftp dir list entry parser
-  FTPFileEntryParser parser = null;
-
-  private Configuration conf;
-
-  private FtpRobotRulesParser robots = null;
-
-  // constructor
-  public Ftp() {
-    robots = new FtpRobotRulesParser();
-  }
-
-  /** Set the timeout. */
-  public void setTimeout(int to) {
-    timeout = to;
-  }
-
-  /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {
-    maxContentLength = length;
-  }
-
-  /** Set followTalk */
-  public void setFollowTalk(boolean followTalk) {
-    this.followTalk = followTalk;
-  }
-
-  /** Set keepConnection */
-  public void setKeepConnection(boolean keepConnection) {
-    this.keepConnection = keepConnection;
-  }
-
-  /**
-   * Creates a {@link FtpResponse} object corresponding to the url and returns a
-   * {@link ProtocolOutput} object as per the content received.
-   * <p>
-   * The response code is stored in the datum's metadata under
-   * {@link Nutch#PROTOCOL_STATUS_CODE_KEY}. A code of 200 yields the content;
-   * codes from 300 to 399 are followed through the {@code Location} header, at
-   * most {@link #MAX_REDIRECTS} times. Any exception, including the ones
-   * raised for other codes or for too many redirects, is not propagated: it is
-   * wrapped in a {@link ProtocolStatus} and returned with {@code null} content.
-   * 
-   * @param url
-   *          Text containing the ftp url
-   * @param datum
-   *          The CrawlDatum object corresponding to the url
-   * 
-   * @return {@link ProtocolOutput} object for the url
-   */
-  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
-    String urlString = url.toString();
-    try {
-      URL u = new URL(urlString);
-
-      int redirects = 0;
-
-      while (true) {
-        FtpResponse response;
-        response = new FtpResponse(u, datum, this, getConf()); // make a request
-
-        int code = response.getCode();
-        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
-          new Text(Integer.toString(code)));
-
-        if (code == 200) { // got a good response
-          return new ProtocolOutput(response.toContent()); // return it
-
-        } else if (code >= 300 && code < 400) { // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FtpException("Too many redirects: " + url);
-          u = new URL(response.getHeader("Location"));
-          redirects++;
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("redirect to " + u);
-          }
-        } else { // convert to exception
-          throw new FtpError(code);
-        }
-      }
-    } catch (Exception e) {
-      return new ProtocolOutput(null, new ProtocolStatus(e));
-    }
-  }
-
-  /**
-   * Best-effort cleanup: logs out and disconnects the client if it is still
-   * connected.
-   */
-  @Override
-  protected void finalize() {
-    try {
-      if (this.client != null && this.client.isConnected()) {
-        this.client.logout();
-        this.client.disconnect();
-      }
-    } catch (IOException ignored) {
-      // nothing useful can be done about a failed logout/disconnect here
-    }
-  }
-
-  /**
-   * For debugging. Fetches a single url and prints its content type, length
-   * and last-modified metadata to stderr; with {@code -dumpContent} the raw
-   * content bytes are written to stdout.
-   * <p>
-   * Prints the usage line and exits with -1 when no arguments are given, when
-   * an option that takes a value is the last argument, when an unknown
-   * argument appears before the last position, or when no url is given. Also
-   * exits with -1 when the fetch produces no content.
-   */
-  public static void main(String[] args) throws Exception {
-    int timeout = Integer.MIN_VALUE;
-    int maxContentLength = Integer.MIN_VALUE;
-    boolean followTalk = false;
-    boolean keepConnection = false;
-    boolean dumpContent = false;
-    String urlString = null;
-
-    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    for (int i = 0; i < args.length; i++) {
-      boolean takesValue = args[i].equals("-logLevel")
-          || args[i].equals("-timeout")
-          || args[i].equals("-maxContentLength");
-      if (takesValue && i == args.length - 1) {
-        // the option's value is missing; reading args[++i] would overrun
-        System.err.println(usage);
-        System.exit(-1);
-      }
-
-      if (args[i].equals("-logLevel")) {
-        // the level is accepted for compatibility but currently not applied
-        // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-        i++;
-      } else if (args[i].equals("-followTalk")) {
-        followTalk = true;
-      } else if (args[i].equals("-keepConnection")) {
-        keepConnection = true;
-      } else if (args[i].equals("-timeout")) {
-        timeout = Integer.parseInt(args[++i]) * 1000;
-      } else if (args[i].equals("-maxContentLength")) {
-        maxContentLength = Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-dumpContent")) {
-        dumpContent = true;
-      } else if (i != args.length - 1) {
-        System.err.println(usage);
-        System.exit(-1);
-      } else {
-        urlString = args[i];
-      }
-    }
-
-    if (urlString == null) {
-      // the last argument was an option, so no url was supplied
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    // NOTE(review): setConf() is never called here, so the fields that only
-    // setConf() assigns (userName, passWord, serverTimeout, conf) keep their
-    // Java defaults -- confirm this is intended for the debugging entry point.
-    Ftp ftp = new Ftp();
-
-    ftp.setFollowTalk(followTalk);
-    ftp.setKeepConnection(keepConnection);
-
-    if (timeout != Integer.MIN_VALUE) // set timeout
-      ftp.setTimeout(timeout);
-
-    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
-      ftp.setMaxContentLength(maxContentLength);
-
-    ProtocolOutput output = ftp.getProtocolOutput(new Text(urlString),
-        new CrawlDatum());
-    Content content = output.getContent();
-
-    if (content == null) {
-      // getProtocolOutput() reports every failure as null content plus status
-      System.err.println("No content fetched: " + output.getStatus());
-      System.exit(-1);
-    }
-
-    System.err.println("Content-Type: " + content.getContentType());
-    System.err.println("Content-Length: "
-        + content.getMetadata().get(Response.CONTENT_LENGTH));
-    System.err.println("Last-Modified: "
-        + content.getMetadata().get(Response.LAST_MODIFIED));
-    if (dumpContent) {
-      // write the bytes untouched instead of decoding them with the
-      // platform charset, which would corrupt non-text content
-      byte[] bytes = content.getContent();
-      System.out.write(bytes, 0, bytes.length);
-      System.out.flush();
-    }
-  }
-
-  /**
-   * Set the {@link Configuration} object and load the FTP settings from it.
-   * Defaults used when a property is absent: {@code ftp.content.limit} 64 *
-   * 1024, {@code ftp.timeout} 10000, {@code ftp.username} "anonymous",
-   * {@code ftp.password} "anonymous@example.com", {@code ftp.server.timeout}
-   * 60 * 1000, {@code ftp.keep.connection} false, {@code ftp.follow.talk}
-   * false. The configuration is also handed to the robots rules parser.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
-    this.timeout = conf.getInt("ftp.timeout", 10000);
-    this.userName = conf.get("ftp.username", "anonymous");
-    this.passWord = conf.get("ftp.password", "anonymous@example.com");
-    this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
-    this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
-    this.followTalk = conf.getBoolean("ftp.follow.talk", false);
-    this.robots.setConf(conf);
-  }
-
-  /**
-   * Get the {@link Configuration} object
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Get the robots rules for a given url. The datum argument is not used.
-   */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
-  }
-
-  /** Get the fixed buffer size, {@code 16384}. */
-  public int getBufferSize() {
-    return BUFFER_SIZE;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
deleted file mode 100644
index b63a67e..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-/**
- * Thrown for Ftp error codes.
- * <p>
- * The exception message is {@code "Ftp Error: "} followed by the code, and the
- * code itself can be read back with {@link #getCode()}.
- */
-public class FtpError extends FtpException {
-
-  /** The code this error was created with. */
-  private final int code;
-
-  /**
-   * Creates an error for the given code.
-   * 
-   * @param code
-   *          the code to record and to include in the message
-   */
-  public FtpError(int code) {
-    super("Ftp Error: " + code);
-    this.code = code;
-  }
-
-  /**
-   * Returns the code this error was created with.
-   * 
-   * @return the stored code
-   */
-  public int getCode() {
-    return code;
-  }
-
-  /**
-   * Legacy accessor kept so existing callers still compile. It used to return
-   * its own argument, which made the stored code unreadable; it now returns
-   * the stored code like {@link #getCode()}.
-   * 
-   * @param code
-   *          ignored
-   * @return the stored code
-   * @deprecated use {@link #getCode()} instead
-   */
-  @Deprecated
-  public int getCode(int code) {
-    return this.code;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
deleted file mode 100644
index 5a29668..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import org.apache.nutch.protocol.ProtocolException;
-
-/***
- * Common parent of the significant exceptions raised during the FTP
- * conversation; such exceptions have to be handled with care.
- * 
- * @author John Xing
- */
-public class FtpException extends ProtocolException {
-
-  /** Delegates to the no-argument superclass constructor. */
-  public FtpException() {
-    super();
-  }
-
-  /**
-   * @param detail
-   *          message passed on to the superclass
-   */
-  public FtpException(String detail) {
-    super(detail);
-  }
-
-  /**
-   * @param detail
-   *          message passed on to the superclass
-   * @param reason
-   *          underlying cause passed on to the superclass
-   */
-  public FtpException(String detail, Throwable reason) {
-    super(detail, reason);
-  }
-
-  /**
-   * @param reason
-   *          underlying cause passed on to the superclass
-   */
-  public FtpException(Throwable reason) {
-    super(reason);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
deleted file mode 100644
index 689ac8e..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-/**
- * Signals that the reply to the SYST command was bad.
- * 
- * @author John Xing
- */
-public class FtpExceptionBadSystResponse extends FtpException {
-
-  /**
-   * @param message
-   *          detail message passed on to {@link FtpException}
-   */
-  FtpExceptionBadSystResponse(String message) {
-    super(message);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
deleted file mode 100644
index 9f35b74..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-/**
- * Signals that the data connection could not be opened.
- * 
- * @author John Xing
- */
-public class FtpExceptionCanNotHaveDataConnection extends FtpException {
-
-  /**
-   * @param message
-   *          detail message passed on to {@link FtpException}
-   */
-  FtpExceptionCanNotHaveDataConnection(String message) {
-    super(message);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
deleted file mode 100644
index c058fcb..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-/**
- * Signals that the server end closed the control channel because the client
- * (our) end forcibly closed the data channel.
- * 
- * @author John Xing
- */
-public class FtpExceptionControlClosedByForcedDataClose extends FtpException {
-
-  /**
-   * @param message
-   *          detail message passed on to {@link FtpException}
-   */
-  FtpExceptionControlClosedByForcedDataClose(String message) {
-    super(message);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
deleted file mode 100644
index 9083d7c..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-/**
- * Signals that the server sent an unrecognizable reply after the client (our)
- * side forcibly closed the data channel.
- * 
- * @author John Xing
- */
-public class FtpExceptionUnknownForcedDataClose extends FtpException {
-
-  /**
-   * @param message
-   *          detail message passed on to {@link FtpException}
-   */
-  FtpExceptionUnknownForcedDataClose(String message) {
-    super(message);
-  }
-
-}