You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:13 UTC
[29/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/ivy.xml b/nutch-plugins/protocol-interactiveselenium/ivy.xml
new file mode 100644
index 0000000..ff07f8c
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!-- Ivy descriptor for the protocol-interactiveselenium plugin. -->
+<ivy-module version="1.0">
+  <!-- The module name is not hard-coded; it is taken from the Ant property ant.project.name. -->
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <!-- Configurations are not declared here; they are pulled in from a shared file. -->
+    <!-- NOTE(review): this relative path climbs three directories, but the file now lives at
+         nutch-plugins/protocol-interactiveselenium/ivy.xml; confirm it still resolves after the move. -->
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <!-- No name is given on the artifact; it is published under the "default" configuration. -->
+    <artifact conf="default"/>
+  </publications>
+
+  <!-- Intentionally empty: see the note inside. -->
+  <dependencies>
+    <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/plugin.xml b/nutch-plugins/protocol-interactiveselenium/plugin.xml
new file mode 100644
index 0000000..a69a1e5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Plugin descriptor for protocol-interactiveselenium.
+ NOTE(review): the display name below is the generic "Http Protocol Plug-in"; confirm whether a
+ name specific to this plugin is wanted.
+-->
+<plugin
+   id="protocol-interactiveselenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <!-- The plugin's own jar; every name in it is exported. -->
+   <runtime>
+      <library name="protocol-interactiveselenium.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <!-- Plugins this one imports: the extension points plus the lib-http and lib-selenium libraries. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <!-- Registers the Http class as an implementation of the Protocol extension point. -->
+   <extension id="org.apache.nutch.protocol.interactiveselenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.interactiveselenium.Http"
+                      class="org.apache.nutch.protocol.interactiveselenium.Http">
+        <!-- presumably the URL scheme this implementation is selected for; verify against the plugin loader -->
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/pom.xml b/nutch-plugins/protocol-interactiveselenium/pom.xml
new file mode 100644
index 0000000..ced9cdc
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/pom.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<!-- Maven module for the protocol-interactiveselenium plugin; built as a plain jar. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <!-- No groupId/version is declared on this module itself; only the parent carries them. -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-interactiveselenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-interactiveselenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <!-- Sibling plugin libraries, pinned to the same version as the parent. -->
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-selenium</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
new file mode 100644
index 0000000..9449fa1
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.interactiveselenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Protocol implementation whose responses are built by {@link HttpResponse}.
+ * Everything except response creation is inherited from {@link HttpBase}.
+ */
+public class Http extends HttpBase {
+
+  /** Logger for this plugin; it is also handed to the {@link HttpBase} constructor. */
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the protocol, passing {@link #LOG} to the superclass. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Builds the response for <code>url</code>.
+   *
+   * @param url      the URL to fetch
+   * @param datum    crawl datum forwarded to {@link HttpResponse}
+   * @param redirect not used by this implementation
+   * @return a freshly constructed {@link HttpResponse}
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    Response response = new HttpResponse(this, url, datum);
+    return response;
+  }
+
+  /** Applies the configuration by delegating to the superclass. */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: configures a new instance with
+   * {@link NutchConfiguration#create()} and hands it, together with the
+   * arguments, to the inherited two-argument <code>main</code>.
+   */
+  public static void main(String[] args) throws Exception {
+    Http protocol = new Http();
+    protocol.setConf(NutchConfiguration.create());
+    main(protocol, args);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
new file mode 100644
index 0000000..a1ccf29
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -0,0 +1,399 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.openqa.selenium.WebDriver;
+
+import org.apache.nutch.protocol.selenium.HttpWebClient;
+
+/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
+
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  private static InteractiveSeleniumHandler[] handlers;
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /**
+   * Fetches <code>url</code> over a raw socket using a hand-built HTTP/1.0 GET
+   * request, then parses the status line and the headers.
+   * <p>
+   * What happens to the body depends on the Content-Type header:
+   * <ul>
+   * <li>contains <code>text/html</code> or <code>application/xhtml</code>: the
+   * socket body is ignored and the content is produced by the Selenium
+   * handlers (see <code>readPlainContent</code>);</li>
+   * <li>any other value: the body is read from the socket, limited by the
+   * Content-Length header and by <code>http.getMaxContent()</code> when that
+   * is non-negative;</li>
+   * <li>header missing: no body is read and the content stays unset.</li>
+   * </ul>
+   *
+   * @param http  the owning protocol instance; supplies configuration, timeout,
+   *              proxy settings and request header values
+   * @param url   the URL to fetch; its protocol must be <code>http</code>
+   * @param datum crawl datum; a positive modified time adds an
+   *              If-Modified-Since header
+   * @throws ProtocolException an HttpException when the URL is not an http URL
+   *                           or the status line / a header line is malformed
+   * @throws IOException       on socket errors, or when reading a non-HTML
+   *                           body fails while the status code is 200
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol()))
+      throw new HttpException("Not an HTTP url:" + url);
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect (to the proxy instead of the origin host when one is configured for this URL)
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        // a proxy needs the absolute URL in the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // Encode with a fixed charset so the bytes on the wire do not depend on
+      // the platform default.
+      byte[] reqBytes = reqStr.toString().getBytes("UTF-8");
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // Skip over any number of interim "100 Continue" responses.
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type in HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int bufferFilled = 0;
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            // Never request more than the bytes still allowed, so the final
+            // partial chunk is kept instead of being thrown away when it would
+            // overshoot the limit.
+            while (totalRead < contentLength
+                && (bufferFilled = in.read(buffer, 0,
+                    Math.min(buffer.length, contentLength - totalRead))) != -1) {
+              totalRead += bufferFilled;
+              out.write(buffer, 0, bufferFilled);
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString());
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            if (in != null) {
+              in.close();
+            }
+          }
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  /** @return the URL this response was created for */
+  public URL getUrl() {
+    return this.url;
+  }
+
+  /** @return the status code parsed from the response's status line */
+  public int getCode() {
+    return this.code;
+  }
+
+  /**
+   * @param name header name to look up
+   * @return the stored value for <code>name</code>, as returned by the headers metadata
+   */
+  public String getHeader(String name) {
+    return this.headers.get(name);
+  }
+
+  /** @return the metadata object holding all parsed response headers */
+  public Metadata getHeaders() {
+    return this.headers;
+  }
+
+  /** @return the content bytes assigned by the constructor; not assigned on every path */
+  public byte[] getContent() {
+    return this.content;
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+  /**
+   * Populates the shared static handler array on first use.
+   * <p>
+   * Handler names come from the <code>interactiveselenium.handlers</code>
+   * property (comma-separated, default <code>DefaultHandler</code>). Each name
+   * is trimmed, prefixed with this class's package name, loaded reflectively
+   * and instantiated through its no-argument constructor.
+   * <p>
+   * A name that cannot be loaded, instantiated or cast is logged and skipped,
+   * so the resulting array never contains null entries. The array is assigned
+   * to the static field only once it is complete.
+   */
+  private void loadSeleniumHandlers() {
+    if (handlers != null) return;
+
+    String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler");
+    String[] handlerNames = handlerConfig.split(",");
+    String packagePrefix = this.getClass().getPackage().getName() + ".";
+
+    // Scratch array sized for the best case; "loadedCount" tracks how many slots are really used.
+    InteractiveSeleniumHandler[] loaded = new InteractiveSeleniumHandler[handlerNames.length];
+    int loadedCount = 0;
+
+    for (int i = 0; i < handlerNames.length; i++) {
+      // tolerate "A, B" style lists and stray commas
+      String handlerName = handlerNames[i].trim();
+      if (handlerName.length() == 0) {
+        continue;
+      }
+
+      String classToLoad = packagePrefix + handlerName;
+      try {
+        loaded[loadedCount] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance());
+        loadedCount++;
+        Http.LOG.info("Successfully loaded " + classToLoad);
+      } catch (ClassNotFoundException e) {
+        Http.LOG.warn("Unable to load Handler class for: " + handlerName, e);
+      } catch (InstantiationException e) {
+        Http.LOG.warn("Unable to instantiate Handler: " + handlerName, e);
+      } catch (IllegalAccessException e) {
+        Http.LOG.warn("Illegal access with Handler: " + handlerName, e);
+      } catch (ClassCastException e) {
+        // the class exists but does not implement InteractiveSeleniumHandler
+        Http.LOG.warn("Class is not an InteractiveSeleniumHandler: " + handlerName, e);
+      }
+    }
+
+    // Publish a compact array in a single assignment.
+    InteractiveSeleniumHandler[] usable = new InteractiveSeleniumHandler[loadedCount];
+    System.arraycopy(loaded, 0, usable, 0, loadedCount);
+    handlers = usable;
+  }
+
+  /**
+   * Produces the content for an HTML/XHTML page by running every applicable
+   * Selenium handler against a fresh driver for <code>url</code> and
+   * concatenating the handlers' outputs, which are then stored as UTF-8 bytes.
+   * <p>
+   * A handler that returns <code>null</code> contributes nothing (previously
+   * the literal text "null" was appended). The driver is always cleaned up,
+   * even when a handler throws.
+   *
+   * @param url the page to render
+   * @throws IOException if the UTF-8 encoding step fails
+   */
+  private void readPlainContent(URL url) throws IOException {
+    if (handlers == null)
+      loadSeleniumHandlers();
+
+    String pageUrl = url.toString();
+    StringBuilder processedPage = new StringBuilder();
+
+    for (InteractiveSeleniumHandler handler : handlers) {
+      // Defensive: a slot can be null when its handler could not be created.
+      if (handler == null || !handler.shouldProcessURL(pageUrl)) {
+        continue;
+      }
+
+      WebDriver driver = HttpWebClient.getDriverForPage(pageUrl, conf);
+      try {
+        String handlerOutput = handler.processDriver(driver);
+        if (handlerOutput != null) {
+          processedPage.append(handlerOutput);
+        }
+      } finally {
+        // release the browser even if the handler failed
+        HttpWebClient.cleanUpDriver(driver);
+      }
+    }
+
+    content = processedPage.toString().getBytes("UTF-8");
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines until a blank line and adds each one to the headers
+   * metadata.
+   * <p>
+   * If a line contains <code>&lt;!DOCTYPE</code>, <code>&lt;HTML</code> or
+   * <code>&lt;html</code> (checked in that order) the blank separator line is
+   * assumed to be missing: the markup part is pushed back onto the stream, the
+   * part before it is processed as a last header on a best-effort basis, and
+   * parsing stops.
+   *
+   * @param in   response stream positioned after the status line
+   * @param line scratch buffer reused for every line
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int markupStart = line.indexOf("<!DOCTYPE");
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<HTML");
+      }
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<html");
+      }
+
+      if (markupStart == -1) {
+        // ordinary header line
+        processHeaderLine(line);
+        continue;
+      }
+
+      in.unread(line.substring(markupStart).getBytes("UTF-8"));
+      line.setLength(markupStart);
+
+      try {
+        //TODO: (CM) We don't know the header names here
+        //since we're just handling them generically. It would
+        //be nice to provide some sort of mapping function here
+        //for the returned header names to the standard metadata
+        //names in the ParseData class
+        processHeaderLine(line);
+      } catch (Exception e) {
+        // fixme:
+        Http.LOG.warn("Error: ", e);
+      }
+      return;
+    }
+  }
+
+  /**
+   * Reads one line from <code>in</code> into <code>line</code>, which is
+   * cleared first. A line ends at CR, CR LF or LF; the terminator is not
+   * stored.
+   * <p>
+   * With <code>allowContinuedLine</code> set, a non-empty line whose
+   * terminator is followed by a space or tab is treated as continued: that one
+   * whitespace character is consumed and the following text is appended to the
+   * same buffer.
+   *
+   * @param in                 stream to read from
+   * @param line               buffer receiving the line
+   * @param allowContinuedLine whether continuation lines are joined
+   * @return the length of the line read (0 for a blank line)
+   * @throws EOFException if the stream ends before a line terminator is seen
+   * @throws IOException  on read errors
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        // swallow the LF of a CR LF pair
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // intentional fall-through: CR is handled as an end of line as well
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              // "continue" targets the enclosing for loop, so reading goes on into the same buffer
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    // stream ended without a terminator for the current line
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of <code>in</code> without consuming it.
+   * <p>
+   * At end of stream -1 is returned and nothing is pushed back. Pushing -1
+   * back would store it as the byte 0xFF, and the next read would then return
+   * 255 instead of reporting end of stream.
+   *
+   * @param in stream to look ahead in
+   * @return the next byte (0-255), or -1 at end of stream
+   * @throws IOException on read errors
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
new file mode 100644
index 0000000..f3c0f6f
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.apache.hadoop.util.StringUtils;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Placeholder/example for the use case where several interactions are
+ * performed with the web driver and the data gathered from each of them is
+ * needed at the end. It shows how the accumulated data can be appended to the
+ * body of the page currently held by the driver.
+ * <p>
+ * As shipped, no interaction is performed, so the accumulated data stays empty.
+ */
+public class DefalultMultiInteractionHandler implements
+    InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefalultMultiInteractionHandler.class);
+
+  /**
+   * Appends the accumulated data to the driver's current page body and
+   * returns that data.
+   *
+   * @param driver the driver holding the page; must support JavaScript execution
+   * @return the accumulated data (currently always the empty string)
+   */
+  public String processDriver(WebDriver driver) {
+    // loop and get multiple pages in this string
+    String accumulatedData = "";
+    try {
+
+      // append the string to the last page's driver; the data is passed as a
+      // script argument so arbitrary content cannot break the JavaScript
+      JavascriptExecutor jsx = (JavascriptExecutor) driver;
+      jsx.executeScript(
+          "document.body.innerHTML = document.body.innerHTML + arguments[0];",
+          accumulatedData);
+    } catch (Exception e) {
+      // best effort: a script failure is logged, not propagated
+      LOG.info(StringUtils.stringifyException(e));
+    }
+    return accumulatedData;
+  }
+
+  /** @return <code>true</code> for every URL */
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
new file mode 100644
index 0000000..e3423d5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This handler clicks every {@code <a href="javascript:void(null);">} tag
+ * because it treats such anchors as ajax links/interactions rather than
+ * ordinary links. The body HTML seen after each click is accumulated and
+ * returned, following the same approach as DefalultMultiInteractionHandler.
+ */
+public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefaultClickAllAjaxLinksHandler.class);
+
+  /** The href value that marks an anchor as an ajax link. */
+  private static final String AJAX_HREF = "javascript:void(null);";
+
+  /**
+   * Clicks the ajax links of the current page one after another.
+   * <p>
+   * After each click except the one on the last anchor, the body's innerHTML
+   * is appended to the result, the page is refreshed and the anchors are
+   * looked up again. If the last anchor is an ajax link, the data accumulated
+   * so far is appended to the page body instead. Any exception ends the
+   * processing; it is logged and the data gathered so far is returned.
+   *
+   * @param driver the driver holding the loaded page
+   * @return the accumulated body HTML (possibly empty)
+   */
+  public String processDriver(WebDriver driver) {
+
+    String accumulatedData = "";
+    try {
+
+      // the return value is discarded here
+      driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      Configuration conf = NutchConfiguration.create();
+      // NOTE(review): no until(...) is called on this wait, so it looks like it
+      // does not actually pause; confirm the intended behaviour
+      new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3));
+
+      List<WebElement> atags = driver.findElements(By.tagName("a"));
+      int numberofajaxlinks = atags.size();
+      // the list is re-fetched after every refresh and may shrink, hence the
+      // second bound
+      for (int i = 0; i < numberofajaxlinks && i < atags.size(); i++) {
+
+        String href = atags.get(i).getAttribute("href");
+        if (AJAX_HREF.equals(href)) {
+
+          atags.get(i).click();
+
+          if (i == numberofajaxlinks - 1) {
+            // append everything to the driver in the last round; the data is
+            // passed as a script argument so its content cannot break the
+            // JavaScript
+            JavascriptExecutor jsx = (JavascriptExecutor) driver;
+            jsx.executeScript(
+                "document.body.innerHTML = document.body.innerHTML + arguments[0];",
+                accumulatedData);
+            continue;
+          }
+
+          accumulatedData += driver.findElement(By.tagName("body"))
+              .getAttribute("innerHTML");
+
+          // refreshing the handlers as the page was interacted with
+          driver.navigate().refresh();
+          new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay",
+              3));
+          atags = driver.findElements(By.tagName("a"));
+        }
+      }
+    } catch (Exception e) {
+      // best effort: log and return whatever was gathered
+      LOG.info(StringUtils.stringifyException(e));
+    }
+    return accumulatedData;
+  }
+
+  /** @return <code>true</code> for every URL */
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
new file mode 100644
index 0000000..ae7b97e
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+/**
+ * No-op {@link InteractiveSeleniumHandler}: it never touches the driver and
+ * accepts every URL.
+ */
+public class DefaultHandler implements InteractiveSeleniumHandler {
+
+  /**
+   * Performs no interaction with the page.
+   *
+   * @param driver the Selenium driver; unused by this implementation
+   * @return always {@code null}
+   */
+  @Override
+  public String processDriver(WebDriver driver) {
+    return null;
+  }
+
+  /**
+   * Accepts every URL without inspecting it.
+   *
+   * @param URL the URL being considered; unused by this implementation
+   * @return always {@code true}
+   */
+  @Override
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
new file mode 100644
index 0000000..9ce1e26
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+/**
+ * Contract for handlers that are given a Selenium {@link WebDriver} and
+ * produce a {@code String} result for a URL they choose to handle.
+ */
+public interface InteractiveSeleniumHandler {
+
+  /**
+   * Processes the page currently held by the driver.
+   *
+   * @param driver the Selenium driver to operate on
+   * @return a string produced by the handler; implementations may return
+   *         {@code null} (NOTE(review): confirm how callers treat a
+   *         {@code null} result)
+   */
+  String processDriver(WebDriver driver);
+
+  /**
+   * Decides whether this handler applies to the given URL.
+   *
+   * @param URL the URL being considered
+   * @return {@code true} if the handler wants to process the URL
+   */
+  boolean shouldProcessURL(String URL);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
new file mode 100644
index 0000000..75cd5b5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via Selenium.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/README.md b/nutch-plugins/protocol-selenium/README.md
new file mode 100644
index 0000000..1462b47
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/README.md
@@ -0,0 +1,208 @@
+Nutch Selenium
+==============
+
+# Introduction
+
+This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack!
+
+The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient.
+
+There are essentially two ways in which Nutch can be used with Selenium.
+
+ * Locally (on each node) as a self contained process, or
+ * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes.
+
+# Installation
+
+## Part 1: 
+
+### A) Setting up Selenium (local mode)
+
+ * Ensure that you have your preferred browser installed. Currently Chrome, Safari, Opera, PhantomJS and Firefox are supported. An example of installing Firefox is provided here. More info about the package is available at [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox).
+```
+sudo apt-get install firefox
+```
+
+ * Install Xvfb and its associates
+
+This step is not necessary for the PhantomJS browser and may not be needed for all browsers.
+
+```
+sudo apt-get install xorg synaptic xvfb gtk2-engines-pixbuf xfonts-cyrillic xfonts-100dpi \
+    xfonts-75dpi xfonts-base xfonts-scalable freeglut3-dev dbus-x11 openbox x11-xserver-utils \
+    libxrender1 cabextract
+```
+
+ * Set a display for Xvfb, so that firefox believes a display is connected
+ 
+```
+sudo /usr/bin/Xvfb :11 -screen 0 1024x768x24 &
+export DISPLAY=:11
+```
+### B) Setting up a Selenium Grid 
+
+Using the Selenium Grid will allow you to parallelize the job by facilitating access to several instances of browsers, whether on one machine or on several machines. Note that the grid facilitates heterogeneity with regard to the browser types used. However, these steps have been tested using a homogeneous Selenium Grid with Firefox and PhantomJS browsers.
+
+ * Download the [Selenium Standalone Server](http://www.seleniumhq.org/download/) and follow the installation instructions.
+ 
+ * Some important configurations to note while setting up the selenium-hub and the selenium-nodes are:
+    * For the hub: 
+      - maxSession (how many browser sessions to allow on the grid at a time)
+      - browserTimeout (how long to wait before timing out a browser session. This is dependent on the interactivity to be completed on the page)
+      
+    * For the nodes:
+      - browserName=<browser>, maxInstances (the max number of instances of the same version browser to allow per a system)
+      - browserName=<browser>, maxSession (the max number of sessions of any type of browser/version to allow per a system)
+      
+  * Go headless with your Selenium Grid installation. There are different ways to do this. See [this resource](http://elementalselenium.com/tips/38-headless) for further details.
+ 
+  * For Nutch efficiency, and optimization of the grid, consider editing the following configs in **nutch-site.xml**
+    - fetcher.threads.per.queue (change value to the value of the maxSession config on the hub)
+    - fetcher.threads.fetch (change value to the value of the maxSession config on the hub)
+    - fetcher.server.delay (As multiple threads may be accessing a single server at a time, consider changing this value to 4-5 seconds for politeness)
+    - fetcher.server.min.delay (As multiple threads may be accessing a single server at a time, consider changing this value to 4-5 seconds for politeness)
+    - Ensure all configs for the hub mentioned in Part 2 are appropriately set. 
+
+  * To activate the full selenium grid, edit **$NUTCH_HOME/runtime/local/bin/crawl** script:
+    - numThreads = maxSession on nodes * num of nodes
+
+
+## Part 2: Installing plugin for Nutch (where NUTCH_HOME is the root of your nutch install)
+
+ * Ensure that the plugin will be used as the protocol parser in your config
+
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+
+<configuration>
+  ...
+  <property>
+    <name>plugin.includes</name>
+    <value>protocol-selenium|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+    <description>Regular expression naming plugin directory names to
+    include.  Any plugin not matching this expression is excluded.
+    In any case you need at least include the nutch-extensionpoints plugin. By
+    default Nutch includes crawling just HTML and plain text via HTTP,
+    and basic indexing and search plugins. In order to use HTTPS please enable 
+    protocol-httpclient, but be aware of possible intermittent problems with the 
+    underlying commons-httpclient library.
+    </description>
+  </property>
+```
+
+* Then ensure that you have the correct configuration set within the following configuration options
+
+```
+<!-- protocol-selenium plugin properties -->
+
+<property>
+  <name>selenium.driver</name>
+  <value>firefox</value>
+  <description>
+    A String value representing the flavour of Selenium 
+    WebDriver() to use. Currently the following options
+    exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomJS', and 'remote'.
+    If 'remote' is used it is essential to also set correct properties for
+    'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+    'selenium.hub.protocol'.
+  </description>
+</property>
+
+<property>
+  <name>selenium.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-selenium
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'selenium.screenshot.location' 
+    property as this determines the location screenshots should be 
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the 'selenium.take.screenshot' property is set to true.
+    By default this is null, in this case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.hub.port</name>
+  <value>4444</value>
+  <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+  <name>selenium.hub.path</name>
+  <value>/wd/hub</value>
+  <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+  <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+  <name>selenium.hub.protocol</name>
+  <value>http</value>
+  <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<property>
+  <name>selenium.grid.driver</name>
+  <value>firefox</value>
+  <description>A String value representing the flavour of Selenium 
+    WebDriver() used on the selenium grid. Currently the following options
+    exist - 'firefox' or 'phantomJS' </description>
+</property>
+
+<property>
+  <name>selenium.grid.binary</name>
+  <value></value>
+  <description>A String value representing the path to the browser binary 
+    location for each node
+ </description>
+</property>
+
+<!-- lib-selenium configuration -->
+<property>
+  <name>libselenium.page.load.delay</name>
+  <value>3</value>
+  <description>
+    The delay in seconds to use when loading a page with lib-selenium. This
+    setting is used by protocol-selenium and protocol-interactiveselenium
+    since they depend on lib-selenium for fetching.
+  </description>
+</property>
+```
+ * If you've selected 'remote' value for the 'selenium.driver' property, ensure that you've configured
+ the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation).
+
+ * Compile nutch
+```
+ant runtime
+```
+
+ * Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above)
+
+## Part 3: Common Pitfalls
+
+* Be sure your browser version and selenium version are compatible (See list in 'Tested configurations' section below) 
+* Be sure to start the Xvfb window then start selenium (not a necessary step for PhantomJS)
+* Disconnecting and reconnecting nodes after a hub config change has proven useful in our tests.
+* Be sure that each browser session deallocates its webdriver resource independently of any other tests running on other browsers (check out driver.quit() and driver.close()).
+
+### Tested configurations 
+
+* Firefox 31.4.0 and Selenium 2.48.2
+* PhantomJS 2.1.1 and Selenium 2.48.2
+* PhantomJS 2.1.1 and Selenium 2.53.0
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/build-ivy.xml b/nutch-plugins/protocol-selenium/build-ivy.xml
new file mode 100644
index 0000000..67d39cd
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <!-- Standalone bootstrap build: fetches Ivy if needed, then retrieves this
+         plugin's dependencies into lib/. -->
+
+    <!-- Load environment variables as env.* properties; without this the
+         env.IVY_HOME check below can never succeed. -->
+    <property environment="env" />
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <!-- Prefer IVY_HOME from the environment; otherwise fall back to ~/.ant
+         (Ant properties are immutable, so the second definition only applies
+         when the condition did not set ivy.home). -->
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <!-- Skipped entirely when the "offline" property is set. -->
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from Maven Central so that it can be used even without any
+             special installation; Central only serves HTTPS -->
+        <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <!-- Default target: copy resolved artifacts into lib/; sync="true" removes
+       files in lib/ that are no longer part of the resolution. -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/build.xml b/nutch-plugins/protocol-selenium/build.xml
new file mode 100644
index 0000000..055018f
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="jar-core">
+
+  <!-- Shared plugin build logic comes from the parent directory. -->
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build the sibling plugins this one compiles against. Each is built in
+       its own directory without inheriting this project's properties. -->
+  <target name="deps-jar">
+    <ant dir="../lib-http" target="jar" inheritall="false"/>
+    <ant dir="../lib-selenium" target="jar" inheritall="false"/>
+  </target>
+
+  <!-- Jars of those sibling plugins, picked up from the build output tree. -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar"/>
+      <include name="**/lib-selenium/*.jar"/>
+    </fileset>
+  </path>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/ivy.xml b/nutch-plugins/protocol-selenium/ivy.xml
new file mode 100644
index 0000000..ff07f8c
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <!-- Module identity: the module name is taken from the Ant project name. -->
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <!-- Configurations are shared and pulled in from a common file.
+       NOTE(review): this file lives in nutch-plugins/protocol-selenium/, so
+       three levels up may point outside the source tree after the move;
+       confirm the path still resolves to ivy/ivy-configurations.xml. -->
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <!-- Intentionally empty: no dependencies are declared by this module. -->
+  <dependencies>
+    <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/plugin.xml b/nutch-plugins/protocol-selenium/plugin.xml
new file mode 100644
index 0000000..1454c1b
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-selenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <!-- Code shipped by this plugin; everything in the jar is exported. -->
+   <runtime>
+      <library name="protocol-selenium.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <!-- Plugins that must be available for this one to load. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <!-- Registers org.apache.nutch.protocol.selenium.Http at the Protocol
+        extension point for protocolName "http".
+        NOTE(review): no implementation is registered for "https" here;
+        confirm whether that is intended. -->
+   <extension id="org.apache.nutch.protocol.selenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.selenium.Http"
+                      class="org.apache.nutch.protocol.selenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/pom.xml b/nutch-plugins/protocol-selenium/pom.xml
new file mode 100644
index 0000000..a94c7ec
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/pom.xml
@@ -0,0 +1,50 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <!-- groupId and version are not declared below, so they are inherited
+         from the nutch-plugins parent POM one directory up. -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-selenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-selenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <!-- Sibling Nutch modules this plugin depends on, pinned to the parent's
+         version. -->
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-selenium</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
new file mode 100644
index 0000000..7726bdf
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link HttpBase} subclass for the protocol-selenium plugin. Its only
+ * specialisation is that every response is built as an {@link HttpResponse}
+ * from this package.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the protocol instance, handing this class's logger to the base class. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Passes the configuration straight to {@link HttpBase}; no additional
+   * settings are read here.
+   *
+   * @param conf the configuration to apply
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: builds an instance configured from
+   * {@link NutchConfiguration#create()} and delegates to the inherited
+   * {@code main(http, args)}.
+   *
+   * @param args command-line arguments, forwarded unchanged
+   * @throws Exception propagated from the delegated call
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /**
+   * Builds the response for a URL by constructing an {@link HttpResponse}.
+   *
+   * @param url the URL to fetch
+   * @param datum the crawl datum associated with the URL
+   * @param redirect not used by this implementation
+   * @return a new {@link HttpResponse} for the URL
+   * @throws ProtocolException declared by the overridden method
+   * @throws IOException declared by the overridden method
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
new file mode 100644
index 0000000..681e838
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
+
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+    // Performs the whole exchange: connect, send a GET, parse status line and headers, load the content.
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol())) // only plain http URLs are accepted
+      throw new HttpException("Not an HTTP url:" + url);
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = ""; // default port: keep it out of the Host header and request line
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect (to the proxy instead of the target host when one is used for this URL)
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        reqStr.append(url.getProtocol() + "://" + host + portString + path); // absolute URL when going through the proxy
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+      // NOTE(review): gzip/deflate is advertised, but nothing in this constructor decodes a Content-Encoding - confirm
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!"); // logged only; the request is still sent without the header
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n"); // empty line ends the request headers
+
+      byte[] reqBytes = reqStr.toString().getBytes(); // NOTE(review): encodes with the platform default charset
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type is HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url); // the body on this socket is not read; the page comes from HttpWebClient
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE; // no Content-Length header: no limit from the response itself
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent(); // cap at the configured maximum; a negative maximum disables the cap
+            }
+
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int bufferFilled = 0;
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+                && totalRead + bufferFilled <= contentLength) { // a chunk that would cross the limit is dropped whole
+              totalRead += bufferFilled;
+              out.write(buffer, 0, bufferFilled);
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString());
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            if (in != null) {
+              in.close();
+            }
+          }
+        }
+      } // no Content-Type header: nothing is read and content is left null
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  public URL getUrl() {
+    return url; // the URL handed to the constructor
+  }
+
+  public int getCode() {
+    return code; // status code parsed from the response's status line
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name); // looked up in the parsed response headers
+  }
+
+  public Metadata getHeaders() {
+    return headers; // the live header object, not a copy
+  }
+
+  public byte[] getContent() {
+    return content; // null when the constructor never assigned any content
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+
+  private void readPlainContent(URL url) throws IOException {
+    // HttpWebClient produces the page source for this URL; it is kept
+    // as UTF-8 bytes in the content field.
+    content = HttpWebClient.getHtmlPage(url.toString(), conf).getBytes("UTF-8");
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    // A status line looks like "HTTP/1.1 200 OK"; the reason phrase is
+    // optional, so "HTTP/1.1 200" has to be accepted as well.
+    readLine(in, line, false);
+
+    // the code sits between the first blank and the next one
+    final int from = line.indexOf(" ") + 1;
+    int to = line.indexOf(" ", from);
+    if (to == -1) {
+      // no reason phrase: the code runs to the end of the line
+      to = line.length();
+    }
+
+    final String digits = line.substring(from, to);
+    try {
+      return Integer.parseInt(digits);
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+  }
+
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+    // Splits "Name: value" at the first colon and stores the pair in headers.
+    final int colon = line.indexOf(":");
+    if (colon == -1) {
+      // without a colon only an all-whitespace (or empty) line is tolerated
+      boolean blank = true;
+      for (int i = 0; blank && i < line.length(); i++) {
+        blank = Character.isWhitespace(line.charAt(i));
+      }
+      if (!blank) {
+        throw new HttpException("No colon in header:" + line);
+      }
+      return;
+    }
+
+    // the value starts after the colon and any spaces or tabs behind it
+    int start = colon + 1;
+    while (start < line.length()
+        && (line.charAt(start) == ' ' || line.charAt(start) == '\t')) {
+      start++;
+    }
+
+    headers.set(line.substring(0, colon), line.substring(start));
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    // Reads header lines (continuation lines allowed) until readLine() returns an empty line.
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+        // readLine() widened every byte to one char, so only ISO-8859-1 gives back the bytes that were read
+        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0); // line is the output buffer: emptied here, then filled with this line's characters
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read(); // consume the LF of a CRLF pair
+        } // no break here: a CR (with or without LF) ends the line through the '\n' case below
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read(); // drop the single space/tab that marks the continuation
+              continue; // continues the for loop, so the next line is appended to the same buffer
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c); // each byte becomes exactly one char
+      }
+    }
+    throw new EOFException(); // the stream ended before a line terminator was seen
+  }
+
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read(); // next byte without consuming it, or -1 at end of stream
+    if (value != -1) in.unread(value); // unread(-1) would push back a bogus 0xFF byte and hide the EOF
+    return value;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
new file mode 100644
index 0000000..75cd5b5
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via Selenium.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/build.xml b/nutch-plugins/scoring-depth/build.xml
new file mode 100644
index 0000000..6c041ed
--- /dev/null
+++ b/nutch-plugins/scoring-depth/build.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<project name="scoring-depth" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/ivy.xml b/nutch-plugins/scoring-depth/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/scoring-depth/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/plugin.xml b/nutch-plugins/scoring-depth/plugin.xml
new file mode 100644
index 0000000..ea57dc6
--- /dev/null
+++ b/nutch-plugins/scoring-depth/plugin.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="scoring-depth"
+   name="Scoring plugin for depth-limited crawling."
+   version="1.0.0"
+   provider-name="ant.com">
+
+   <runtime>
+      <library name="scoring-depth.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.scoring.depth"
+              name="Depth Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+      <implementation id="DepthScoringFilter"
+                      class="org.apache.nutch.scoring.depth.DepthScoringFilter"/>
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/pom.xml b/nutch-plugins/scoring-depth/pom.xml
new file mode 100644
index 0000000..64ebe18
--- /dev/null
+++ b/nutch-plugins/scoring-depth/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-depth</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-depth</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
new file mode 100644
index 0000000..0a0dd27
--- /dev/null
+++ b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -0,0 +1,207 @@
+package org.apache.nutch.scoring.depth;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * This scoring filter limits the number of hops from the initial seed urls. If
+ * the number of hops exceeds the depth (either the default value, or the one
+ * set in the injector file) then all outlinks from that url are discarded,
+ * effectively stopping further crawling along this path.
+ */
+public class DepthScoringFilter extends Configured implements ScoringFilter {
+  private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
+
+  public static final String DEPTH_KEY = "_depth_"; // metadata key for the current depth (String form, for content/parse metadata)
+  public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY); // same key as Text, for CrawlDatum metadata
+  public static final String MAX_DEPTH_KEY = "_maxdepth_"; // metadata key for the depth limit (String form)
+  public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY); // same key as Text, for CrawlDatum metadata
+
+  // maximum value that we are never likely to reach
+  // because the depth of the Web graph is that high only
+  // for spam cliques.
+  public static final int DEFAULT_MAX_DEPTH = 1000;
+
+  private int defaultMaxDepth; // limit used when a URL carries none of its own; assigned in setConf()
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf != null) {
+      // "scoring.depth.max" sets the default hop limit; zero or a negative
+      // number is not usable as a limit and is replaced by DEFAULT_MAX_DEPTH
+      final int configured = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH);
+      defaultMaxDepth = configured > 0 ? configured : DEFAULT_MAX_DEPTH;
+    }
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    // Stamps every outlink with depth + 1 (and the custom limit, if any), or
+    // drops all outlinks when the depth is unknown or the limit is reached.
+    final String rawDepth = parseData.getMeta(DEPTH_KEY);
+    if (rawDepth == null) {
+      LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
+      targets.clear();
+      return adjust;
+    }
+    final int depth = Integer.parseInt(rawDepth);
+
+    // allow overrides from injector: a limit found in the parse metadata wins
+    final String rawLimit = parseData.getMeta(MAX_DEPTH_KEY);
+    final IntWritable inheritedLimit = rawLimit == null ? null
+        : new IntWritable(Integer.parseInt(rawLimit));
+    final int limit = inheritedLimit == null ? defaultMaxDepth
+        : inheritedLimit.get();
+
+    if (depth >= limit) {
+      // depth exceeded - throw away
+      LOG.info("Depth limit (" + limit
+          + ") reached, ignoring outlinks for " + fromUrl);
+      targets.clear();
+      return adjust;
+    }
+    for (Entry<Text, CrawlDatum> target : targets) {
+      // record increased depth
+      target.getValue().getMetaData()
+          .put(DEPTH_KEY_W, new IntWritable(depth + 1));
+      // record maxDepth if any
+      if (inheritedLimit != null) {
+        target.getValue().getMetaData().put(MAX_DEPTH_KEY_W, inheritedLimit);
+      }
+    }
+    return adjust;
+  }
+
+  // prioritize by smaller values of depth
+  @Override
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    // Boosts the incoming sort value by the number of hops the URL still has
+    // left: result = initSort * (1 + (max depth - depth)).
+
+    // hop limit: the datum's own value if present, else the configured default
+    final IntWritable storedLimit = (IntWritable) datum.getMetaData().get(
+        MAX_DEPTH_KEY_W);
+    final int limit = storedLimit != null ? storedLimit.get()
+        : defaultMaxDepth;
+
+    // current depth: a datum without one is penalized by being treated as
+    // if it already sat at the limit
+    final IntWritable storedDepth = (IntWritable) datum.getMetaData().get(
+        DEPTH_KEY_W);
+    final int depth = storedDepth != null ? storedDepth.get()
+        : limit;
+
+    final int remaining = limit - depth;
+    return initSort * (1 + remaining);
+  }
+
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, // NOTE(review): the only method here without @Override
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore; // passed through unchanged; depth is not consulted here
+  }
+
+  @Override
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    // fill in only what is missing: values already set (e.g. obtained from redirection) are kept
+    final boolean hasLimit = datum.getMetaData().get(MAX_DEPTH_KEY_W) != null;
+    if (!hasLimit) {
+      datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
+    }
+    final boolean hasDepth = datum.getMetaData().get(DEPTH_KEY_W) != null;
+    if (!hasDepth) {
+      datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); // initial depth is 1
+    }
+  }
+
+  @Override
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    // An injected depth limit is parsed from its string form and stored
+    // again as an IntWritable; without one the configured default is stored.
+    final Object injected = datum.getMetaData().get(MAX_DEPTH_KEY_W);
+    final int limit;
+    if (injected == null) {
+      limit = defaultMaxDepth;
+    } else {
+      final String text = injected.toString();
+      datum.getMetaData().remove(MAX_DEPTH_KEY_W);
+      limit = Integer.parseInt(text);
+    }
+    datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(limit));
+    // initial depth is 1
+    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
+  }
+
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    // Carries the depth and max-depth strings over from the content metadata
+    // to the parse metadata; a value that is absent is simply not copied.
+    for (String key : new String[] { DEPTH_KEY, MAX_DEPTH_KEY }) {
+      final String carried = content.getMetadata().get(key);
+      if (carried != null) {
+        parse.getData().getParseMeta().set(key, carried);
+      }
+    }
+  }
+
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    // copy depth and max depth, when present on the datum, into the content metadata as strings
+    final Text[] datumKeys = { DEPTH_KEY_W, MAX_DEPTH_KEY_W };
+    final String[] contentKeys = { DEPTH_KEY, MAX_DEPTH_KEY };
+    for (int i = 0; i < datumKeys.length; i++) {
+      final IntWritable stored = (IntWritable) datum.getMetaData().get(datumKeys[i]);
+      if (stored != null) {
+        content.getMetadata().set(contentKeys[i], stored.toString());
+      }
+    }
+  }
+
+  @Override
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    // new depth = minimum of the old depth (DEFAULT_MAX_DEPTH if unknown) and all inlink depths
+    int shallowest = DEFAULT_MAX_DEPTH;
+    if (old != null) {
+      final IntWritable previous = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
+      if (previous == null) {
+        // not set ? let initialScore() fill in the missing values on old
+        initialScore(url, old);
+      } else {
+        shallowest = previous.get();
+      }
+    }
+    for (CrawlDatum inlink : inlinked) {
+      final IntWritable via = (IntWritable) inlink.getMetaData().get(DEPTH_KEY_W);
+      if (via != null) {
+        shallowest = Math.min(shallowest, via.get());
+      }
+    }
+    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(shallowest));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
new file mode 100644
index 0000000..aa89797
--- /dev/null
+++ b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter to stop crawling at a configurable depth
+ * (number of "hops" from seed URLs).
+ */
+package org.apache.nutch.scoring.depth;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-link/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/build.xml b/nutch-plugins/scoring-link/build.xml
new file mode 100644
index 0000000..123b1ea
--- /dev/null
+++ b/nutch-plugins/scoring-link/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-link" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/scoring-link/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/ivy.xml b/nutch-plugins/scoring-link/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/scoring-link/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>