You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:46 UTC

[02/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/build-ivy.xml b/src/plugin/protocol-selenium/build-ivy.xml
deleted file mode 100644
index 67d39cd..0000000
--- a/src/plugin/protocol-selenium/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
-    <target name="download-ivy" unless="offline">
-
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/build.xml b/src/plugin/protocol-selenium/build.xml
deleted file mode 100644
index 055018f..0000000
--- a/src/plugin/protocol-selenium/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-selenium" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-http"/>
-    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-http/*.jar" />
-      <include name="**/lib-selenium/*.jar" />
-    </fileset>
-  </path>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml
deleted file mode 100644
index ff07f8c..0000000
--- a/src/plugin/protocol-selenium/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="default"/>
-  </publications>
-
-  <dependencies>
-    <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/plugin.xml b/src/plugin/protocol-selenium/plugin.xml
deleted file mode 100644
index 1454c1b..0000000
--- a/src/plugin/protocol-selenium/plugin.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="protocol-selenium"
-   name="Http Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="protocol-selenium.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-http"/>
-      <import plugin="lib-selenium"/>
-   </requires>
-
-   <extension id="org.apache.nutch.protocol.selenium"
-              name="HttpProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
-      <implementation id="org.apache.nutch.protocol.selenium.Http"
-                      class="org.apache.nutch.protocol.selenium.Http">
-        <parameter name="protocolName" value="http"/>
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
deleted file mode 100644
index 7726bdf..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-// JDK imports
-import java.io.IOException;
-import java.net.URL;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.nutch.protocol.selenium.HttpResponse;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class Http extends HttpBase {
-
-  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
-  public Http() {
-    super(LOG);
-  }
-
-  @Override
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-  }
-
-  public static void main(String[] args) throws Exception {
-    Http http = new Http();
-    http.setConf(NutchConfiguration.create());
-    main(http, args);
-  }
-
-  @Override
-  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
-      throws ProtocolException, IOException {
-    return new HttpResponse(this, url, datum);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
deleted file mode 100644
index 681e838..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ /dev/null
@@ -1,360 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-// JDK imports
-import java.io.BufferedInputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-
-/* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient. */
-
-public class HttpResponse implements Response {
-
-  private Http http;
-  private URL url;
-  private String orig;
-  private String base;
-  private byte[] content;
-  private int code;
-  private Metadata headers = new SpellCheckedMetadata();
-
-  /** The nutch configuration */
-  private Configuration conf = null;
-
-  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
-
-    this.conf = http.getConf();
-    this.http = http;
-    this.url = url;
-    this.orig = url.toString();
-    this.base = url.toString();
-
-    if (!"http".equals(url.getProtocol()))
-      throw new HttpException("Not an HTTP url:" + url);
-
-    if (Http.LOG.isTraceEnabled()) {
-      Http.LOG.trace("fetching " + url);
-    }
-
-    String path = "".equals(url.getFile()) ? "/" : url.getFile();
-
-    // some servers will redirect a request with a host line like
-    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
-    // don't want the :80...
-
-    String host = url.getHost();
-    int port;
-    String portString;
-    if (url.getPort() == -1) {
-      port = 80;
-      portString = "";
-    } else {
-      port = url.getPort();
-      portString = ":" + port;
-    }
-    Socket socket = null;
-
-    try {
-      socket = new Socket(); // create the socket
-      socket.setSoTimeout(http.getTimeout());
-
-      // connect
-      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
-      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
-      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
-      socket.connect(sockAddr, http.getTimeout());
-
-      // make request
-      OutputStream req = socket.getOutputStream();
-
-      StringBuffer reqStr = new StringBuffer("GET ");
-      if (http.useProxy(url)) {
-        reqStr.append(url.getProtocol() + "://" + host + portString + path);
-      } else {
-        reqStr.append(path);
-      }
-
-      reqStr.append(" HTTP/1.0\r\n");
-
-      reqStr.append("Host: ");
-      reqStr.append(host);
-      reqStr.append(portString);
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
-
-      String userAgent = http.getUserAgent();
-      if ((userAgent == null) || (userAgent.length() == 0)) {
-        if (Http.LOG.isErrorEnabled()) {
-          Http.LOG.error("User-agent is not set!");
-        }
-      } else {
-        reqStr.append("User-Agent: ");
-        reqStr.append(userAgent);
-        reqStr.append("\r\n");
-      }
-
-      reqStr.append("Accept-Language: ");
-      reqStr.append(this.http.getAcceptLanguage());
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept: ");
-      reqStr.append(this.http.getAccept());
-      reqStr.append("\r\n");
-
-      if (datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
-        reqStr.append("\r\n");
-      }
-      reqStr.append("\r\n");
-
-      byte[] reqBytes = reqStr.toString().getBytes();
-
-      req.write(reqBytes);
-      req.flush();
-
-      PushbackInputStream in = // process response
-          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
-              Http.BUFFER_SIZE);
-
-      StringBuffer line = new StringBuffer();
-
-      boolean haveSeenNonContinueStatus = false;
-      while (!haveSeenNonContinueStatus) {
-        // parse status code line
-        this.code = parseStatusLine(in, line);
-        // parse headers
-        parseHeaders(in, line);
-        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
-      }
-
-      // Get Content type header
-      String contentType = getHeader(Response.CONTENT_TYPE);
-
-      // handle with Selenium only if the content type is HTML or XHTML
-      if (contentType != null) {
-        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
-          readPlainContent(url);
-        } else {
-          try {
-            int contentLength = Integer.MAX_VALUE;
-            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-            if (contentLengthString != null) {
-              try {
-                contentLength = Integer.parseInt(contentLengthString.trim());
-              } catch (NumberFormatException ex) {
-                throw new HttpException("bad content length: " + contentLengthString);
-              }
-            }
-
-            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
-              contentLength = http.getMaxContent();
-            }
-
-            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
-            int bufferFilled = 0;
-            int totalRead = 0;
-            ByteArrayOutputStream out = new ByteArrayOutputStream();
-            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
-                && totalRead + bufferFilled <= contentLength) {
-              totalRead += bufferFilled;
-              out.write(buffer, 0, bufferFilled);
-            }
-
-            content = out.toByteArray();
-
-          } catch (Exception e) {
-            if (code == 200)
-              throw new IOException(e.toString());
-            // for codes other than 200 OK, we are fine with empty content
-          } finally {
-            if (in != null) {
-              in.close();
-            }
-          }
-        }
-      } 
-
-    } finally {
-      if (socket != null)
-        socket.close();
-    }
-  }
-
-  /* ------------------------- *
-   * <implementation:Response> *
-   * ------------------------- */
-
-  public URL getUrl() {
-    return url;
-  }
-
-  public int getCode() {
-    return code;
-  }
-
-  public String getHeader(String name) {
-    return headers.get(name);
-  }
-
-  public Metadata getHeaders() {
-    return headers;
-  }
-
-  public byte[] getContent() {
-    return content;
-  }
-
-  /* -------------------------- *
-   * </implementation:Response> *
-   * -------------------------- */
-
-  private void readPlainContent(URL url) throws IOException {
-    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
-
-    content = page.getBytes("UTF-8");
-  }
-
-  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
-    readLine(in, line, false);
-
-    int codeStart = line.indexOf(" ");
-    int codeEnd = line.indexOf(" ", codeStart + 1);
-
-    // handle lines with no plaintext result code, ie:
-    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
-    if (codeEnd == -1)
-      codeEnd = line.length();
-
-    int code;
-    try {
-      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
-    } catch (NumberFormatException e) {
-      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
-    }
-
-    return code;
-  }
-
-  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
-
-    int colonIndex = line.indexOf(":"); // key is up to colon
-    if (colonIndex == -1) {
-      int i;
-      for (i = 0; i < line.length(); i++)
-        if (!Character.isWhitespace(line.charAt(i)))
-          break;
-      if (i == line.length())
-        return;
-      throw new HttpException("No colon in header:" + line);
-    }
-    String key = line.substring(0, colonIndex);
-
-    int valueStart = colonIndex + 1; // skip whitespace
-    while (valueStart < line.length()) {
-      int c = line.charAt(valueStart);
-      if (c != ' ' && c != '\t')
-        break;
-      valueStart++;
-    }
-    String value = line.substring(valueStart);
-    headers.set(key, value);
-  }
-
-  // Adds headers to our headers Metadata
-  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
-
-    while (readLine(in, line, true) != 0) {
-
-      // handle HTTP responses with missing blank line after headers
-      int pos;
-      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
-          || ((pos = line.indexOf("<html")) != -1)) {
-
-        in.unread(line.substring(pos).getBytes("UTF-8"));
-        line.setLength(pos);
-
-        try {
-          //TODO: (CM) We don't know the header names here
-          //since we're just handling them generically. It would
-          //be nice to provide some sort of mapping function here
-          //for the returned header names to the standard metadata
-          //names in the ParseData class
-          processHeaderLine(line);
-        } catch (Exception e) {
-          // fixme:
-          Http.LOG.warn("Error: ", e);
-        }
-        return;
-      }
-
-      processHeaderLine(line);
-    }
-  }
-
-  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
-      throws IOException {
-    line.setLength(0);
-    for (int c = in.read(); c != -1; c = in.read()) {
-      switch (c) {
-      case '\r':
-        if (peek(in) == '\n') {
-          in.read();
-        }
-      case '\n':
-        if (line.length() > 0) {
-          // at EOL -- check for continued line if the current
-          // (possibly continued) line wasn't blank
-          if (allowContinuedLine)
-            switch (peek(in)) {
-            case ' ':
-            case '\t': // line is continued
-              in.read();
-              continue;
-            }
-        }
-        return line.length(); // else complete
-      default:
-        line.append((char) c);
-      }
-    }
-    throw new EOFException();
-  }
-
-  private static int peek(PushbackInputStream in) throws IOException {
-    int value = in.read();
-    in.unread(value);
-    return value;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
deleted file mode 100644
index 75cd5b5..0000000
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via Selenium.</p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/build.xml b/src/plugin/scoring-depth/build.xml
deleted file mode 100644
index 6c041ed..0000000
--- a/src/plugin/scoring-depth/build.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<project name="scoring-depth" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/ivy.xml b/src/plugin/scoring-depth/ivy.xml
deleted file mode 100644
index 24d7606..0000000
--- a/src/plugin/scoring-depth/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/plugin.xml b/src/plugin/scoring-depth/plugin.xml
deleted file mode 100644
index ea57dc6..0000000
--- a/src/plugin/scoring-depth/plugin.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
-   id="scoring-depth"
-   name="Scoring plugin for depth-limited crawling."
-   version="1.0.0"
-   provider-name="ant.com">
-
-   <runtime>
-      <library name="scoring-depth.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.scoring.depth"
-              name="Depth Scoring Filter"
-              point="org.apache.nutch.scoring.ScoringFilter">
-      <implementation id="DepthScoringFilter"
-                      class="org.apache.nutch.scoring.depth.DepthScoringFilter"/>
-   </extension>
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
deleted file mode 100644
index 0a0dd27..0000000
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ /dev/null
@@ -1,207 +0,0 @@
-package org.apache.nutch.scoring.depth;
-
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-/**
- * This scoring filter limits the number of hops from the initial seed urls. If
- * the number of hops exceeds the depth (either the default value, or the one
- * set in the injector file) then all outlinks from that url are discarded,
- * effectively stopping further crawling along this path.
- */
-public class DepthScoringFilter extends Configured implements ScoringFilter {
-  private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
-
-  public static final String DEPTH_KEY = "_depth_";
-  public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);
-  public static final String MAX_DEPTH_KEY = "_maxdepth_";
-  public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY);
-
-  // maximum value that we are never likely to reach
-  // because the depth of the Web graph is that high only
-  // for spam cliques.
-  public static final int DEFAULT_MAX_DEPTH = 1000;
-
-  private int defaultMaxDepth;
-
-  @Override
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-    if (conf == null)
-      return;
-    defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH);
-    if (defaultMaxDepth <= 0) {
-      defaultMaxDepth = DEFAULT_MAX_DEPTH;
-    }
-  }
-
-  @Override
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-      CrawlDatum adjust, int allCount) throws ScoringFilterException {
-    String depthString = parseData.getMeta(DEPTH_KEY);
-    if (depthString == null) {
-      LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
-      targets.clear();
-      return adjust;
-    }
-    int curDepth = Integer.parseInt(depthString);
-    int curMaxDepth = defaultMaxDepth;
-    IntWritable customMaxDepth = null;
-    // allow overrides from injector
-    String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY);
-    if (maxDepthString != null) {
-      curMaxDepth = Integer.parseInt(maxDepthString);
-      customMaxDepth = new IntWritable(curMaxDepth);
-    }
-    if (curDepth >= curMaxDepth) {
-      // depth exceeded - throw away
-      LOG.info("Depth limit (" + curMaxDepth
-          + ") reached, ignoring outlinks for " + fromUrl);
-      targets.clear();
-      return adjust;
-    }
-    Iterator<Entry<Text, CrawlDatum>> it = targets.iterator();
-    while (it.hasNext()) {
-      Entry<Text, CrawlDatum> e = it.next();
-      // record increased depth
-      e.getValue().getMetaData()
-          .put(DEPTH_KEY_W, new IntWritable(curDepth + 1));
-      // record maxDepth if any
-      if (customMaxDepth != null) {
-        e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth);
-      }
-    }
-    return adjust;
-  }
-
-  // prioritize by smaller values of depth
-  @Override
-  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
-      throws ScoringFilterException {
-    // boost up by current depth
-    int curDepth, curMaxDepth;
-    IntWritable maxDepth = (IntWritable) datum.getMetaData().get(
-        MAX_DEPTH_KEY_W);
-    if (maxDepth != null) {
-      curMaxDepth = maxDepth.get();
-    } else {
-      curMaxDepth = defaultMaxDepth;
-    }
-    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
-    if (depth == null) {
-      // penalize
-      curDepth = curMaxDepth;
-    } else {
-      curDepth = depth.get();
-    }
-    int mul = curMaxDepth - curDepth;
-    return initSort * (1 + mul);
-  }
-
-  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
-      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
-      throws ScoringFilterException {
-    return initScore;
-  }
-
-  @Override
-  public void initialScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-    // the datum might already have some values set
-    // e.g. obtained from redirection
-    // in which case we don't want to override them
-    if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null)
-      datum.getMetaData()
-          .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
-    // initial depth is 1
-    if (datum.getMetaData().get(DEPTH_KEY_W) == null)
-      datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
-  }
-
-  @Override
-  public void injectedScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-
-    // check for the presence of the depth limit key
-    if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) {
-      // convert from Text to Int
-      String depthString = datum.getMetaData().get(MAX_DEPTH_KEY_W).toString();
-      datum.getMetaData().remove(MAX_DEPTH_KEY_W);
-      int depth = Integer.parseInt(depthString);
-      datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth));
-    } else { // put the default
-      datum.getMetaData()
-          .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
-    }
-    // initial depth is 1
-    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
-  }
-
-  @Override
-  public void passScoreAfterParsing(Text url, Content content, Parse parse)
-      throws ScoringFilterException {
-    String depth = content.getMetadata().get(DEPTH_KEY);
-    if (depth != null) {
-      parse.getData().getParseMeta().set(DEPTH_KEY, depth);
-    }
-    String maxdepth = content.getMetadata().get(MAX_DEPTH_KEY);
-    if (maxdepth != null) {
-      parse.getData().getParseMeta().set(MAX_DEPTH_KEY, maxdepth);
-    }
-  }
-
-  @Override
-  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
-      throws ScoringFilterException {
-    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
-    if (depth != null) {
-      content.getMetadata().set(DEPTH_KEY, depth.toString());
-    }
-    IntWritable maxdepth = (IntWritable) datum.getMetaData().get(
-        MAX_DEPTH_KEY_W);
-    if (maxdepth != null) {
-      content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString());
-    }
-  }
-
-  @Override
-  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
-      List<CrawlDatum> inlinked) throws ScoringFilterException {
-    // find a minimum of all depths
-    int newDepth = DEFAULT_MAX_DEPTH;
-    if (old != null) {
-      IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
-      if (oldDepth != null) {
-        newDepth = oldDepth.get();
-      } else {
-        // not set ?
-        initialScore(url, old);
-      }
-    }
-    for (CrawlDatum lnk : inlinked) {
-      IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
-      if (depth != null && depth.get() < newDepth) {
-        newDepth = depth.get();
-      }
-    }
-    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
deleted file mode 100644
index aa89797..0000000
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter to stop crawling at a configurable depth
- * (number of "hops" from seed URLs).
- */
-package org.apache.nutch.scoring.depth;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/build.xml b/src/plugin/scoring-link/build.xml
deleted file mode 100644
index 123b1ea..0000000
--- a/src/plugin/scoring-link/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-link" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/ivy.xml b/src/plugin/scoring-link/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/scoring-link/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/plugin.xml b/src/plugin/scoring-link/plugin.xml
deleted file mode 100644
index 2b1c1e1..0000000
--- a/src/plugin/scoring-link/plugin.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="scoring-link"
-   name="Link Analysis Scoring Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="scoring-link.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <extension id="org.apache.nutch.scoring.link"
-              name="LinkAnalysisScoring"
-              point="org.apache.nutch.scoring.ScoringFilter">
-
-      <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter"
-        class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" />
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
deleted file mode 100644
index a143f46..0000000
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.link;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-public class LinkAnalysisScoringFilter implements ScoringFilter {
-
-  private Configuration conf;
-  private float normalizedScore = 1.00f;
-
-  public LinkAnalysisScoringFilter() {
-
-  }
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
-  }
-
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-      CrawlDatum adjust, int allCount) throws ScoringFilterException {
-    return adjust;
-  }
-
-  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
-      throws ScoringFilterException {
-    return datum.getScore() * initSort;
-  }
-
-  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
-      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
-      throws ScoringFilterException {
-    return (normalizedScore * dbDatum.getScore());
-  }
-
-  public void initialScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-    datum.setScore(0.0f);
-  }
-
-  public void injectedScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-  }
-
-  public void passScoreAfterParsing(Text url, Content content, Parse parse)
-      throws ScoringFilterException {
-    parse.getData().getContentMeta()
-        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
-  }
-
-  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
-      throws ScoringFilterException {
-    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
-  }
-
-  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
-      List<CrawlDatum> inlinked) throws ScoringFilterException {
-    // nothing to do
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
deleted file mode 100644
index 9dc0c35..0000000
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter used in conjunction with
- * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
- */
-package org.apache.nutch.scoring.link;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/build.xml b/src/plugin/scoring-opic/build.xml
deleted file mode 100644
index 137dab4..0000000
--- a/src/plugin/scoring-opic/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-opic" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/ivy.xml b/src/plugin/scoring-opic/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/scoring-opic/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/plugin.xml b/src/plugin/scoring-opic/plugin.xml
deleted file mode 100644
index 3805a31..0000000
--- a/src/plugin/scoring-opic/plugin.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="scoring-opic"
-   name="OPIC Scoring Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="scoring-opic.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <extension id="org.apache.nutch.scoring.opic"
-              name="OPICScoring"
-              point="org.apache.nutch.scoring.ScoringFilter">
-
-      <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter"
-                      class="org.apache.nutch.scoring.opic.OPICScoringFilter" />
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
deleted file mode 100644
index e943d06..0000000
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.scoring.opic;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-// Slf4j Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-
-/**
- * This plugin implements a variant of an Online Page Importance Computation
- * (OPIC) score, described in this paper: <a
- * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
- * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
- * On-Line Page Importance Computation </a>.
- * 
- * @author Andrzej Bialecki
- */
-public class OPICScoringFilter implements ScoringFilter {
-
-  private final static Logger LOG = LoggerFactory
-      .getLogger(OPICScoringFilter.class);
-
-  private Configuration conf;
-  private float scoreInjected;
-  private float scorePower;
-  private float internalScoreFactor;
-  private float externalScoreFactor;
-  private boolean countFiltered;
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    scorePower = conf.getFloat("indexer.score.power", 0.5f);
-    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
-    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
-    countFiltered = conf.getBoolean("db.score.count.filtered", false);
-  }
-
-  public void injectedScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-  }
-
-  /**
-   * Set to 0.0f (unknown value) - inlink contributions will bring it to a
-   * correct level. Newly discovered pages have at least one inlink.
-   */
-  public void initialScore(Text url, CrawlDatum datum)
-      throws ScoringFilterException {
-    datum.setScore(0.0f);
-  }
-
-  /** Use {@link CrawlDatum#getScore()}. */
-  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
-      throws ScoringFilterException {
-    return datum.getScore() * initSort;
-  }
-
-  /** Increase the score by a sum of inlinked scores. */
-  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
-      List<CrawlDatum> inlinked) throws ScoringFilterException {
-    float adjust = 0.0f;
-    for (int i = 0; i < inlinked.size(); i++) {
-      CrawlDatum linked = inlinked.get(i);
-      adjust += linked.getScore();
-    }
-    if (old == null)
-      old = datum;
-    datum.setScore(old.getScore() + adjust);
-  }
-
-  /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
-  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
-    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
-  }
-
-  /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
-  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
-    parse.getData().getContentMeta()
-        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
-  }
-
-  /**
-   * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
-   * outlinks and apply.
-   */
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-      CrawlDatum adjust, int allCount) throws ScoringFilterException {
-    float score = scoreInjected;
-    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
-    if (scoreString != null) {
-      try {
-        score = Float.parseFloat(scoreString);
-      } catch (Exception e) {
-        LOG.error("Error: ", e);
-      }
-    }
-    int validCount = targets.size();
-    if (countFiltered) {
-      score /= allCount;
-    } else {
-      if (validCount == 0) {
-        // no outlinks to distribute score, so just return adjust
-        return adjust;
-      }
-      score /= validCount;
-    }
-    // internal and external score factor
-    float internalScore = score * internalScoreFactor;
-    float externalScore = score * externalScoreFactor;
-    for (Entry<Text, CrawlDatum> target : targets) {
-      try {
-        String toHost = new URL(target.getKey().toString()).getHost();
-        String fromHost = new URL(fromUrl.toString()).getHost();
-        if (toHost.equalsIgnoreCase(fromHost)) {
-          target.getValue().setScore(internalScore);
-        } else {
-          target.getValue().setScore(externalScore);
-        }
-      } catch (MalformedURLException e) {
-        LOG.error("Error: ", e);
-        target.getValue().setScore(externalScore);
-      }
-    }
-    // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
-    // XXX in the paper, where page "loses" its score if it's distributed to
-    // XXX linked pages...
-    return adjust;
-  }
-
-  /** Dampen the boost value by scorePower. */
-  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
-      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
-      throws ScoringFilterException {
-    return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
deleted file mode 100644
index 26f6cbe..0000000
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Scoring filter implementing a variant of the Online Page Importance Computation
- * (OPIC) algorithm.
- */
-package org.apache.nutch.scoring.opic;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build-ivy.xml b/src/plugin/scoring-similarity/build-ivy.xml
deleted file mode 100644
index 50fbb96..0000000
--- a/src/plugin/scoring-similarity/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
-    <target name="download-ivy" unless="offline">
-
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml
deleted file mode 100644
index 66ac8f3..0000000
--- a/src/plugin/scoring-similarity/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="scoring-similarity" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
deleted file mode 100644
index be0a1de..0000000
--- a/src/plugin/scoring-similarity/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
deleted file mode 100644
index 9639c18..0000000
--- a/src/plugin/scoring-similarity/plugin.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="scoring-similarity"
-   name="Similarity based Scoring Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="scoring-similarity.jar">
-         <export name="*"/>
-      </library>
-      <library name="lucene-analyzers-common-5.5.0.jar"/>
-      <library name="lucene-core-5.5.0.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-   
-   <extension id="org.apache.nutch.scoring.similarity"
-              name="SimilarityScoring"
-              point="org.apache.nutch.scoring.ScoringFilter">
-
-      <implementation id="scoring-similarity"
-                      class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
deleted file mode 100644
index f44fabd..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity;
-
-import java.util.Collection;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-
-public interface SimilarityModel {
-
-  public void setConf(Configuration conf);
-  
-  public float setURLScoreAfterParsing(Text url, Content content, Parse parse);
-  
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-      CrawlDatum adjust, int allCount);
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
deleted file mode 100644
index 0f905b8..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.AbstractScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
-
-public class SimilarityScoringFilter extends AbstractScoringFilter {
-
-  private Configuration conf;
-  private SimilarityModel similarityModel;
-  @Override
-  public Configuration getConf() {
-    return conf;
-  }
-
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    switch(conf.get("scoring.similarity.model","cosine")){
-    case "cosine":
-      similarityModel = (SimilarityModel) new CosineSimilarity();
-      break;
-    }
-    similarityModel.setConf(conf);
-  }
-
-  @Override
-  public void passScoreAfterParsing(Text url, Content content, Parse parse)
-      throws ScoringFilterException {
-
-    float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
-    parse.getData().getContentMeta()
-    .set(Nutch.SCORE_KEY, score+"");
-  }
-
-  @Override
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-      CrawlDatum adjust, int allCount) throws ScoringFilterException {
-    similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount);
-    return adjust;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
deleted file mode 100644
index 9853b34..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.similarity.SimilarityModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class CosineSimilarity implements SimilarityModel{
-
-  private Configuration conf; 
-  private final static Logger LOG = LoggerFactory
-      .getLogger(CosineSimilarity.class);
-
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  @Override
-  public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
-    float score = 1;
-
-    try {
-      if(!Model.isModelCreated){
-        Model.createModel(conf);
-      }
-      String metatags = parse.getData().getParseMeta().get("metatag.keyword");
-      String metaDescription = parse.getData().getParseMeta().get("metatag.description");
-      int[] ngramArr = Model.retrieveNgrams(conf);
-      int mingram = ngramArr[0];
-      int maxgram = ngramArr[1];
-      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
-      if(docVector!=null){
-        score = Model.computeCosineSimilarity(docVector);
-        LOG.info("Setting score of {} to {}",url, score);
-      }
-      else {
-        throw new Exception("Could not create DocVector from parsed text");
-      }
-    } catch (Exception e) {
-      LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
-    }
-    return score;
-  }
-
-  @Override
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
-      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
-      int allCount) {
-    float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
-    for (Entry<Text, CrawlDatum> target : targets) {
-      target.getValue().setScore(score);
-    }
-    return adjust;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
deleted file mode 100644
index 33c3a23..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class DocVector {
-
-  public HashMap<Integer, Long> termVector;
-  public HashMap<String, Integer> termFreqVector;
-
-  public DocVector() {
-    termFreqVector = new HashMap<>();
-  }
-
-  public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
-    this.termFreqVector = termFreqVector;
-  }
-  
-  public void setVectorEntry(int pos, long freq) {
-    termVector.put(pos, freq);
-  }
-  
-  public float dotProduct(DocVector docVector) {
-    float product = 0;
-    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
-      if(docVector.termFreqVector.containsKey(entry.getKey())) {
-        product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
-      }
-    }
-    return product;
-  }
-  
-  public float getL2Norm() {
-    float sum = 0;
-    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
-      sum += entry.getValue()*entry.getValue();
-    }
-    return (float) Math.sqrt(sum);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
deleted file mode 100644
index d8180f2..0000000
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.similarity.cosine;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
-import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
-import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
-import org.apache.tika.Tika;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class creates a model used to store Document vector representation of the corpus. 
- *
- */
-public class Model {
-
-  //Currently only one file, but in future could accept a corpus hence an ArrayList
-  public static ArrayList<DocVector> docVectors = new ArrayList<>(); 
-  private static final Logger LOG = LoggerFactory.getLogger(Model.class);
-  public static boolean isModelCreated = false;
-  private static List<String> stopWords;
-
-  public static synchronized void createModel(Configuration conf) throws IOException {
-    if(isModelCreated) {
-      LOG.info("Model exists, skipping model creation");
-      return;
-    }
-    LOG.info("Creating Cosine model");
-    try {
-      //If user has specified a stopword file other than the template
-      if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
-        stopWords = new ArrayList<String>();
-        String stopWord;
-        BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
-        while ((stopWord = br.readLine()) != null) {
-          stopWords.add(stopWord);
-        }
-        LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
-      }
-
-      int[] ngramArr = retrieveNgrams(conf);
-      int mingram = ngramArr[0];
-      int maxgram = ngramArr[1];
-      LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
-
-      // TODO : Allow for corpus of documents to be provided as gold standard. 
-      String line;
-      StringBuilder sb = new StringBuilder();
-      BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
-      while ((line = br.readLine()) != null) {
-        sb.append(line);
-      }
-      DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
-      if(goldStandard!=null)
-        docVectors.add(goldStandard);
-      else {
-        throw new Exception("Could not create DocVector for goldstandard");
-      }
-    } catch (Exception e) {
-      LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), 
-          StringUtils.stringifyException(e));
-    }
-    if(docVectors.size()>0) {
-      LOG.info("Cosine model creation complete");
-      isModelCreated = true;
-    }
-    else
-      LOG.info("Cosine model creation failed");
-  }
-
-  /**
-   * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
-   * cycle to create a DocVector of the currently parsed page from the parseText attribute value
-   * @param content The text to tokenize
-   * @param mingram Value of mingram for tokenizing
-   * @param maxgram Value of maxgram for tokenizing
-   */
-  public static DocVector createDocVector(String content, int mingram, int maxgram) {
-    LuceneTokenizer tokenizer;
-
-    if(mingram > 1 && maxgram > 1){
-      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
-    } else if (mingram > 1) {
-      maxgram = mingram;
-      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
-    }
-    else if(stopWords!=null) {
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, 
-          StemFilterType.PORTERSTEM_FILTER);
-    }
-    else {
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, 
-          StemFilterType.PORTERSTEM_FILTER);
-    }
-    TokenStream tStream = tokenizer.getTokenStream();
-    HashMap<String, Integer> termVector = new HashMap<>();
-    try {
-      CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
-      tStream.reset();
-      while(tStream.incrementToken()) {
-        String term = charTermAttribute.toString();
-        LOG.debug(term);
-        if(termVector.containsKey(term)) {
-          int count = termVector.get(term);
-          count++;
-          termVector.put(term, count);
-        }
-        else {
-          termVector.put(term, 1);
-        }
-      }
-      DocVector docVector = new DocVector();
-      docVector.setTermFreqVector(termVector);
-      return docVector;
-    } catch (IOException e) {
-      LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
-    }
-    return null;
-  }
-
-  public static float computeCosineSimilarity(DocVector docVector) {
-    float scores[] = new float[docVectors.size()];
-    int i=0;
-    float maxScore = 0;
-    for(DocVector corpusDoc : docVectors) {
-      float numerator = docVector.dotProduct(corpusDoc);
-      float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
-      float currentScore = numerator/denominator;
-      scores[i++] = currentScore;
-      maxScore = (currentScore>maxScore)? currentScore : maxScore;
-    }
-    // Returning the max score amongst all documents in the corpus
-    return maxScore;
-  }
-
-  /**
-   * Retrieves mingram and maxgram from configuration
-   * @param conf Configuration to retrieve mingram and maxgram
-   * @return ngram array as mingram at first index and maxgram at second index
-     */
-  public static int[] retrieveNgrams(Configuration conf){
-    int[] ngramArr = new int[2];
-    //Check if user has specified mingram or ngram for ngram cosine model
-    String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
-    //mingram
-    ngramArr[0] = Integer.parseInt(ngramStr[0]);
-    int maxgram;
-    if (ngramStr.length > 1) {
-      //maxgram
-      ngramArr[1] = Integer.parseInt(ngramStr[1]);
-    } else {
-      //maxgram
-      ngramArr[1] = ngramArr[0];
-    }
-    return ngramArr;
-  }
-}
\ No newline at end of file