You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:20 UTC

[04/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/plugin.xml b/nutch-plugins/scoring-link/plugin.xml
new file mode 100644
index 0000000..2b1c1e1
--- /dev/null
+++ b/nutch-plugins/scoring-link/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-link"
+   name="Link Analysis Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-link.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.link"
+              name="LinkAnalysisScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter"
+        class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/pom.xml b/nutch-plugins/scoring-link/pom.xml
new file mode 100644
index 0000000..3c7041e
--- /dev/null
+++ b/nutch-plugins/scoring-link/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-link</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-link</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
new file mode 100644
index 0000000..a143f46
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.link;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * A {@link ScoringFilter} whose score-propagation hooks are no-ops:
+ * {@link #distributeScoreToOutlinks} returns the adjust datum unchanged and
+ * {@link #updateDbScore} does nothing. {@link #initialScore} sets the score
+ * to 0.0f and {@link #indexerScore} multiplies the score of the db datum by
+ * the factor configured under <code>link.analyze.normalize.score</code>.
+ */
+public class LinkAnalysisScoringFilter implements ScoringFilter {
+
+  /** Configuration key of the factor applied in {@link #indexerScore}. */
+  private static final String NORMALIZE_SCORE_KEY =
+      "link.analyze.normalize.score";
+
+  /** Factor used when {@link #NORMALIZE_SCORE_KEY} is not configured. */
+  private static final float DEFAULT_NORMALIZED_SCORE = 1.00f;
+
+  private Configuration conf;
+  private float normalizedScore = DEFAULT_NORMALIZED_SCORE;
+
+  /** Creates a filter that uses the default factor until setConf is called. */
+  public LinkAnalysisScoringFilter() {
+    // no state to initialize besides the field defaults
+  }
+
+  /** Returns the configuration most recently passed to setConf. */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Keeps a reference to the configuration and reads the indexing factor from
+   * <code>link.analyze.normalize.score</code>, falling back to 1.00f.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    normalizedScore = conf.getFloat(NORMALIZE_SCORE_KEY,
+        DEFAULT_NORMALIZED_SCORE);
+  }
+
+  /** Distributes nothing: targets are not modified and adjust is returned. */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Returns the score of the datum multiplied by initSort. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    float datumScore = datum.getScore();
+    return datumScore * initSort;
+  }
+
+  /**
+   * Returns the score of dbDatum multiplied by the configured factor. The
+   * incoming initScore and the remaining arguments are not used.
+   */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return normalizedScore * dbDatum.getScore();
+  }
+
+  /** Sets the score of the datum to 0.0f. */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  /** Does nothing; the datum is left as it was passed in. */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    // intentionally empty
+  }
+
+  /**
+   * Copies the value stored under Nutch.SCORE_KEY in the content metadata to
+   * the content metadata of the parse data.
+   */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  /**
+   * Stores the score of the datum, converted to a string, under
+   * Nutch.SCORE_KEY in the content metadata.
+   */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    content.getMetadata()
+        .set(Nutch.SCORE_KEY, String.valueOf(datum.getScore()));
+  }
+
+  /** Does nothing; the score of the datum is not changed here. */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    // nothing to do
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
new file mode 100644
index 0000000..9dc0c35
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter used in conjunction with
+ * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
+ */
+package org.apache.nutch.scoring.link;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/build.xml b/nutch-plugins/scoring-opic/build.xml
new file mode 100644
index 0000000..137dab4
--- /dev/null
+++ b/nutch-plugins/scoring-opic/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-opic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/ivy.xml b/nutch-plugins/scoring-opic/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/scoring-opic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/plugin.xml b/nutch-plugins/scoring-opic/plugin.xml
new file mode 100644
index 0000000..3805a31
--- /dev/null
+++ b/nutch-plugins/scoring-opic/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-opic"
+   name="OPIC Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-opic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.opic"
+              name="OPICScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter"
+                      class="org.apache.nutch.scoring.opic.OPICScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/pom.xml b/nutch-plugins/scoring-opic/pom.xml
new file mode 100644
index 0000000..58e0786
--- /dev/null
+++ b/nutch-plugins/scoring-opic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-opic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-opic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
new file mode 100644
index 0000000..e943d06
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.opic;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * This plugin implements a variant of an Online Page Importance Computation
+ * (OPIC) score, described in this paper: <a
+ * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html">
+ * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
+ * On-Line Page Importance Computation</a>.
+ *
+ * @author Andrzej Bialecki
+ */
+public class OPICScoringFilter implements ScoringFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OPICScoringFilter.class);
+
+  private Configuration conf;
+
+  /**
+   * Score distributed when the parse metadata holds no score. NOTE(review):
+   * this field is never assigned in this class, so it stays at 0.0f - confirm
+   * whether it was meant to be read from the configuration.
+   */
+  private float scoreInjected;
+
+  /** Exponent applied to the db score in {@link #indexerScore}. */
+  private float scorePower;
+
+  /** Factor for the score given to targets on the host of the source URL. */
+  private float internalScoreFactor;
+
+  /** Factor for the score given to all other targets. */
+  private float externalScoreFactor;
+
+  /** Whether the score is divided by allCount instead of the target count. */
+  private boolean countFiltered;
+
+  /** Returns the configuration most recently passed to setConf. */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Keeps a reference to the configuration and reads the settings
+   * <code>indexer.score.power</code> (default 0.5),
+   * <code>db.score.link.internal</code> (default 1.0),
+   * <code>db.score.link.external</code> (default 1.0) and
+   * <code>db.score.count.filtered</code> (default false).
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    scorePower = conf.getFloat("indexer.score.power", 0.5f);
+    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
+    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
+    countFiltered = conf.getBoolean("db.score.count.filtered", false);
+  }
+
+  /** Does nothing; the datum is left as it was passed in. */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    // intentionally empty
+  }
+
+  /**
+   * Set to 0.0f (unknown value) - inlink contributions will bring it to a
+   * correct level. Newly discovered pages have at least one inlink.
+   */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  /** Use {@link CrawlDatum#getScore()}, multiplied by initSort. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return datum.getScore() * initSort;
+  }
+
+  /** Increase the score by a sum of inlinked scores. */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    float adjust = 0.0f;
+    for (CrawlDatum linked : inlinked) {
+      adjust += linked.getScore();
+    }
+    // without a previous datum, the current score of datum is the base
+    CrawlDatum base = (old == null) ? datum : old;
+    datum.setScore(base.getScore() + adjust);
+  }
+
+  /** Store a float value of CrawlDatum.getScore() under Nutch.SCORE_KEY. */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+  }
+
+  /** Copy the value from Content metadata under Nutch.SCORE_KEY to parseData. */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  /**
+   * Get a float value from Nutch.SCORE_KEY, divide it by the number of
+   * outlinks and apply. Targets on the host of fromUrl receive the share
+   * multiplied by the internal factor; all other targets, including those
+   * whose URL cannot be parsed, receive it multiplied by the external factor.
+   *
+   * @param fromUrl source URL; its host is compared with each target's host
+   * @param parseData parse data whose content metadata may hold the score
+   * @param targets outlinks whose scores are set
+   * @param adjust returned unchanged
+   * @param allCount divisor used when db.score.count.filtered is true
+   * @return the adjust argument
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    float score = scoreInjected;
+    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
+    if (scoreString != null) {
+      try {
+        score = Float.parseFloat(scoreString);
+      } catch (NumberFormatException e) {
+        // keep the fallback score if the stored value is not a number
+        LOG.error("Error: ", e);
+      }
+    }
+    if (targets.isEmpty()) {
+      // no outlinks to distribute score, so just return adjust
+      return adjust;
+    }
+    score /= countFiltered ? allCount : targets.size();
+    // internal and external score factor
+    float internalScore = score * internalScoreFactor;
+    float externalScore = score * externalScoreFactor;
+    // the source host is the same for every target, so resolve it only once
+    String fromHost = null;
+    try {
+      fromHost = new URL(fromUrl.toString()).getHost();
+    } catch (MalformedURLException e) {
+      LOG.error("Error: ", e);
+    }
+    for (Entry<Text, CrawlDatum> target : targets) {
+      float targetScore = externalScore;
+      if (fromHost != null) {
+        try {
+          String toHost = new URL(target.getKey().toString()).getHost();
+          if (toHost.equalsIgnoreCase(fromHost)) {
+            targetScore = internalScore;
+          }
+        } catch (MalformedURLException e) {
+          LOG.error("Error: ", e);
+        }
+      }
+      target.getValue().setScore(targetScore);
+    }
+    // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
+    // XXX in the paper, where page "loses" its score if it's distributed to
+    // XXX linked pages...
+    return adjust;
+  }
+
+  /**
+   * Dampen the boost value by scorePower: the score of dbDatum raised to the
+   * power scorePower, multiplied by initScore.
+   */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
new file mode 100644
index 0000000..26f6cbe
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter implementing a variant of the Online Page Importance Computation
+ * (OPIC) algorithm.
+ */
+package org.apache.nutch.scoring.opic;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build-ivy.xml b/nutch-plugins/scoring-similarity/build-ivy.xml
new file mode 100644
index 0000000..50fbb96
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build.xml b/nutch-plugins/scoring-similarity/build.xml
new file mode 100644
index 0000000..66ac8f3
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/ivy.xml b/nutch-plugins/scoring-similarity/ivy.xml
new file mode 100644
index 0000000..be0a1de
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/plugin.xml b/nutch-plugins/scoring-similarity/plugin.xml
new file mode 100644
index 0000000..9639c18
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-similarity"
+   name="Similarity based Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-similarity.jar">
+         <export name="*"/>
+      </library>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   
+   <extension id="org.apache.nutch.scoring.similarity"
+              name="SimilarityScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="scoring-similarity"
+                      class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/pom.xml b/nutch-plugins/scoring-similarity/pom.xml
new file mode 100644
index 0000000..b1f7cb7
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-similarity</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-similarity</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>5.5.0</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
new file mode 100644
index 0000000..f44fabd
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Contract for a pluggable similarity model. {@link SimilarityScoringFilter}
+ * selects one implementation and forwards its scoring callbacks to it.
+ */
+public interface SimilarityModel {
+
+  /**
+   * Hands the configuration to the model.
+   *
+   * @param conf configuration the model should use
+   */
+  void setConf(Configuration conf);
+
+  /**
+   * Computes a score for a freshly parsed page.
+   * {@link SimilarityScoringFilter#passScoreAfterParsing} stores the returned
+   * value in the content metadata of the parse.
+   *
+   * @param url URL of the parsed page
+   * @param content fetched content of the page
+   * @param parse parse result of the page
+   * @return the score computed for the page
+   */
+  float setURLScoreAfterParsing(Text url, Content content, Parse parse);
+
+  /**
+   * Lets the model assign scores to the outlinks of a page.
+   *
+   * @param fromUrl URL of the page the outlinks were found on
+   * @param parseData parse data of that page
+   * @param targets outlink URL / datum pairs that may be modified
+   * @param adjust datum passed through from the scoring filter
+   * @param allCount count passed through from the scoring filter
+   * @return a datum for the caller
+   */
+  CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
new file mode 100644
index 0000000..0f905b8
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
+
+/**
+ * Scoring filter that forwards its callbacks to a {@link SimilarityModel}
+ * chosen through the <code>scoring.similarity.model</code> property.
+ */
+public class SimilarityScoringFilter extends AbstractScoringFilter {
+
+  /** Configuration key that selects the similarity model. */
+  private static final String MODEL_KEY = "scoring.similarity.model";
+  /** Model name used when {@link #MODEL_KEY} is not set. */
+  private static final String DEFAULT_MODEL = "cosine";
+
+  private Configuration conf;
+  private SimilarityModel similarityModel;
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration, instantiates the configured similarity model
+   * and passes the configuration on to it.
+   *
+   * @param conf configuration to read the model name from
+   * @throws IllegalArgumentException if the configured model name is not
+   *           supported
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String modelName = conf.get(MODEL_KEY, DEFAULT_MODEL);
+    switch (modelName) {
+    case "cosine":
+      similarityModel = new CosineSimilarity();
+      break;
+    default:
+      // Without this branch an unknown name left the model null and the
+      // setConf call below failed with an unexplained NullPointerException.
+      throw new IllegalArgumentException("Unsupported similarity model '"
+          + modelName + "' configured for " + MODEL_KEY
+          + "; supported values: cosine");
+    }
+    similarityModel.setConf(conf);
+  }
+
+  /**
+   * Asks the model for the score of the parsed page and records it in the
+   * content metadata under {@link Nutch#SCORE_KEY}.
+   *
+   * @param url URL of the parsed page
+   * @param content fetched content of the page
+   * @param parse parse result whose content metadata receives the score
+   */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
+    parse.getData().getContentMeta().set(Nutch.SCORE_KEY, Float.toString(score));
+  }
+
+  /**
+   * Lets the model score the outlinks, then returns {@code adjust} as it was
+   * passed in.
+   *
+   * @return the {@code adjust} argument
+   */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets,
+        adjust, allCount);
+    return adjust;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
new file mode 100644
index 0000000..9853b34
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarity implements SimilarityModel{
+
+  private Configuration conf; 
+  private final static Logger LOG = LoggerFactory
+      .getLogger(CosineSimilarity.class);
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+    float score = 1;
+
+    try {
+      if(!Model.isModelCreated){
+        Model.createModel(conf);
+      }
+      String metatags = parse.getData().getParseMeta().get("metatag.keyword");
+      String metaDescription = parse.getData().getParseMeta().get("metatag.description");
+      int[] ngramArr = Model.retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
+      if(docVector!=null){
+        score = Model.computeCosineSimilarity(docVector);
+        LOG.info("Setting score of {} to {}",url, score);
+      }
+      else {
+        throw new Exception("Could not create DocVector from parsed text");
+      }
+    } catch (Exception e) {
+      LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
+    }
+    return score;
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount) {
+    float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+    for (Entry<Text, CrawlDatum> target : targets) {
+      target.getValue().setScore(score);
+    }
+    return adjust;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
new file mode 100644
index 0000000..33c3a23
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DocVector {
+
+  public HashMap<Integer, Long> termVector;
+  public HashMap<String, Integer> termFreqVector;
+
+  public DocVector() {
+    termFreqVector = new HashMap<>();
+  }
+
+  public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
+    this.termFreqVector = termFreqVector;
+  }
+  
+  public void setVectorEntry(int pos, long freq) {
+    termVector.put(pos, freq);
+  }
+  
+  public float dotProduct(DocVector docVector) {
+    float product = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      if(docVector.termFreqVector.containsKey(entry.getKey())) {
+        product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
+      }
+    }
+    return product;
+  }
+  
+  public float getL2Norm() {
+    float sum = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      sum += entry.getValue()*entry.getValue();
+    }
+    return (float) Math.sqrt(sum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
new file mode 100644
index 0000000..d8180f2
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class creates a model used to store Document vector representation of the corpus. 
+ *
+ */
+public class Model {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Model.class);
+
+  /** Configuration key naming the stop word resource. */
+  private static final String STOPWORD_FILE_KEY = "scoring.similarity.stopword.file";
+  /** Stop word resource name that means no custom stop words are loaded. */
+  private static final String STOPWORD_TEMPLATE = "stopwords.txt.template";
+  /** Configuration key naming the gold standard resource. */
+  private static final String GOLDSTANDARD_FILE_KEY = "cosine.goldstandard.file";
+  /** Configuration key holding the comma separated mingram and maxgram. */
+  private static final String NGRAMS_KEY = "scoring.similarity.ngrams";
+
+  //Currently only one file, but in future could accept a corpus hence an ArrayList
+  public static ArrayList<DocVector> docVectors = new ArrayList<>();
+  // volatile: the flag is also read by callers that do not hold the class lock
+  public static volatile boolean isModelCreated = false;
+  // volatile: written under the class lock, read by createDocVector without it
+  private static volatile List<String> stopWords;
+
+  /**
+   * Builds the model once: loads custom stop words if a resource other than
+   * the template is configured, then turns the gold standard resource into a
+   * {@link DocVector} and adds it to {@link #docVectors}. Failures are logged
+   * and leave {@link #isModelCreated} false so a later call can retry.
+   *
+   * @param conf configuration naming the stop word and gold standard
+   *          resources and the n-gram sizes
+   * @throws IOException declared for callers; failures inside this method are
+   *           logged rather than thrown
+   */
+  public static synchronized void createModel(Configuration conf) throws IOException {
+    if(isModelCreated) {
+      LOG.info("Model exists, skipping model creation");
+      return;
+    }
+    LOG.info("Creating Cosine model");
+    try {
+      //If user has specified a stopword file other than the template
+      String stopWordFile = conf.get(STOPWORD_FILE_KEY);
+      if(stopWordFile != null && !STOPWORD_TEMPLATE.equals(stopWordFile)) {
+        List<String> loaded = new ArrayList<String>();
+        for(String line : readLines(conf, stopWordFile)) {
+          String word = line.trim();
+          if(!word.isEmpty()) {
+            loaded.add(word);
+          }
+        }
+        // publish only the fully populated list
+        stopWords = loaded;
+        LOG.info("Loaded custom stopwords from {}", stopWordFile);
+      }
+
+      int[] ngramArr = retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
+
+      // TODO : Allow for corpus of documents to be provided as gold standard.
+      StringBuilder sb = new StringBuilder();
+      for(String line : readLines(conf, conf.get(GOLDSTANDARD_FILE_KEY))) {
+        // keep a separator so the last word of a line is not fused with the
+        // first word of the next one
+        sb.append(line).append('\n');
+      }
+      DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
+      if(goldStandard != null) {
+        docVectors.add(goldStandard);
+      }
+      else {
+        LOG.warn("Failed to add {} to model : {}",
+            conf.get(GOLDSTANDARD_FILE_KEY, "goldstandard.txt.template"),
+            "Could not create DocVector for goldstandard");
+      }
+    } catch (IOException | RuntimeException e) {
+      LOG.warn("Failed to add {} to model : {}",conf.get(GOLDSTANDARD_FILE_KEY,"goldstandard.txt.template"),
+          StringUtils.stringifyException(e));
+    }
+    if(docVectors.isEmpty()) {
+      LOG.info("Cosine model creation failed");
+    }
+    else {
+      LOG.info("Cosine model creation complete");
+      isModelCreated = true;
+    }
+  }
+
+  /**
+   * Reads all lines of a configuration resource and closes the reader.
+   *
+   * @param conf configuration used to locate the resource
+   * @param resource name of the resource
+   * @return the lines of the resource in order
+   * @throws IOException if no resource is configured, it cannot be opened, or
+   *           reading fails
+   */
+  private static List<String> readLines(Configuration conf, String resource) throws IOException {
+    if(resource == null) {
+      throw new IOException("No configuration resource configured");
+    }
+    java.io.Reader reader = conf.getConfResourceAsReader(resource);
+    if(reader == null) {
+      throw new IOException("Configuration resource not found: " + resource);
+    }
+    List<String> lines = new ArrayList<String>();
+    try (BufferedReader br = new BufferedReader(reader)) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        lines.add(line);
+      }
+    }
+    return lines;
+  }
+
+  /**
+   * Used to create a DocVector from given String text. Used during the parse stage of the crawl
+   * cycle to create a DocVector of the currently parsed page from the parseText attribute value
+   * @param content The text to tokenize
+   * @param mingram Value of mingram for tokenizing
+   * @param maxgram Value of maxgram for tokenizing
+   * @return the term frequency vector of the text, or null if tokenizing failed with an IOException
+   */
+  public static DocVector createDocVector(String content, int mingram, int maxgram) {
+    LuceneTokenizer tokenizer;
+    List<String> customStopWords = stopWords;
+
+    if(mingram > 1) {
+      if(maxgram <= 1) {
+        // only mingram was usable: fall back to a fixed n-gram size
+        maxgram = mingram;
+      }
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+    }
+    else if(customStopWords != null) {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, customStopWords, true,
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+    else {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+
+    HashMap<String, Integer> termVector = new HashMap<>();
+    // try-with-resources closes the stream, which was previously left open
+    try (TokenStream tStream = tokenizer.getTokenStream()) {
+      CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
+      tStream.reset();
+      while(tStream.incrementToken()) {
+        String term = charTermAttribute.toString();
+        LOG.debug(term);
+        Integer count = termVector.get(term);
+        termVector.put(term, (count == null) ? 1 : count + 1);
+      }
+      tStream.end();
+    } catch (IOException e) {
+      LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
+      return null;
+    }
+    DocVector docVector = new DocVector();
+    docVector.setTermFreqVector(termVector);
+    return docVector;
+  }
+
+  /**
+   * Computes the cosine similarity of a document against every vector in
+   * {@link #docVectors}.
+   *
+   * @param docVector vector of the document to score
+   * @return the highest similarity found, or 0 when the corpus is empty or
+   *         no similarity is positive
+   */
+  public static float computeCosineSimilarity(DocVector docVector) {
+    float docNorm = docVector.getL2Norm();
+    float maxScore = 0;
+    for(DocVector corpusDoc : docVectors) {
+      float denominator = docNorm*corpusDoc.getL2Norm();
+      if(denominator == 0) {
+        // an empty vector has no direction; it cannot raise the maximum
+        continue;
+      }
+      float currentScore = docVector.dotProduct(corpusDoc)/denominator;
+      maxScore = (currentScore>maxScore)? currentScore : maxScore;
+    }
+    // Returning the max score amongst all documents in the corpus
+    return maxScore;
+  }
+
+  /**
+   * Retrieves mingram and maxgram from configuration
+   * @param conf Configuration to retrieve mingram and maxgram
+   * @return ngram array as mingram at first index and maxgram at second index;
+   *         maxgram equals mingram when only one value is configured
+   * @throws NumberFormatException if a configured value is not an integer
+   */
+  public static int[] retrieveNgrams(Configuration conf){
+    //Check if user has specified mingram or ngram for ngram cosine model
+    String[] ngramStr = conf.getStrings(NGRAMS_KEY, "1,1");
+    if(ngramStr == null || ngramStr.length == 0) {
+      return new int[] {1, 1};
+    }
+    // trim so that a value such as "1, 2" parses
+    int mingram = Integer.parseInt(ngramStr[0].trim());
+    int maxgram = (ngramStr.length > 1) ? Integer.parseInt(ngramStr[1].trim()) : mingram;
+    return new int[] {mingram, maxgram};
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
new file mode 100644
index 0000000..70ae4ab
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
@@ -0,0 +1,7 @@
+/**
+ * Implements the cosine similarity metric for scoring relevant documents.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
new file mode 100644
index 0000000..4b519bc
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Creates a custom analyzer based on user provided inputs
+ *
+ */
+public class LuceneAnalyzerUtil extends Analyzer{
+
+  public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE }
+
+  // Instance state: as static fields these were overwritten by every newly
+  // constructed analyzer, changing the behavior of all existing ones.
+  private final StemFilterType stemFilterType;
+  private final CharArraySet stopSet;
+
+  /**
+   * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true
+   *
+   * @param stemFilterType stemmer to apply; null is treated like NONE
+   * @param useStopFilter whether the Lucene default stop words are removed
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
+    this.stemFilterType = stemFilterType;
+    this.stopSet = useStopFilter ? StandardAnalyzer.STOP_WORDS_SET : null;
+  }
+
+  /**
+   * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then
+   * user provided stop words will be added to the Lucene default stopset.
+   *
+   * @param stemFilterType stemmer to apply; null is treated like NONE
+   * @param stopWords user provided stop words
+   * @param addToDefault whether the Lucene default stop words are kept as well
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
+    this.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      // Work on a copy: the shared default set must not be modified, and the
+      // previous code called addAll on it (or on null) directly.
+      CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
+      combined.addAll(stopWords);
+      this.stopSet = combined;
+    }
+    else {
+      this.stopSet = StopFilter.makeStopSet(stopWords);
+    }
+  }
+
+  /**
+   * Builds the analysis chain: classic tokenizer, lower casing, optional stop
+   * word removal and the configured stemmer.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer source = new ClassicTokenizer();
+    TokenStream filter = new LowerCaseFilter(source);
+    if(stopSet != null) {
+      filter = new StopFilter(filter, stopSet);
+    }
+
+    if(stemFilterType != null) {
+      switch(stemFilterType){
+      case PORTERSTEM_FILTER:
+        filter = new PorterStemFilter(filter);
+        break;
+      case ENGLISHMINIMALSTEM_FILTER:
+        filter = new EnglishMinimalStemFilter(filter);
+        break;
+      default:
+        break;
+      }
+    }
+    return new TokenStreamComponents(source, filter);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
new file mode 100644
index 0000000..acb987c
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+
+public class LuceneTokenizer {
+
+  private TokenStream tokenStream; 
+  private TokenizerType tokenizer;
+  private StemFilterType stemFilterType;
+  private CharArraySet stopSet = null;
+
+  public static enum TokenizerType {CLASSIC, STANDARD}
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset 
+   * @param stemFilterType - Type of stemming to perform 
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(useStopFilter) {
+      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param stopSet - Provide a set of user defined stop words
+   * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
+   * If false, then only the user provided words will be used as the stop set
+   * @param stemFilterType
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+      for(String word : stopWords){
+        stopSet.add(word);
+      }
+      this.stopSet = stopSet;
+    }
+    else {
+      stopSet = new CharArraySet(stopWords, true);
+    }
+    tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Returns the tokenStream created by the Tokenizer
+   * @return
+   */
+  public TokenStream getTokenStream() {
+    return tokenStream;
+  }
+  
+  /**
+   * Creates a tokenizer for the ngram model based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param stemFilterType - Type of stemming to perform
+   * @param mingram - Value of mingram for tokenizing
+   * @param maxgram - Value of maxgram for tokenizing
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    tokenStream = createNGramTokenStream(content, mingram, maxgram);
+  }
+  
+  private TokenStream createTokenStream(String content) {
+    tokenStream = generateTokenStreamFromText(content, tokenizer);
+    tokenStream = new LowerCaseFilter(tokenStream);
+    if(stopSet != null) {
+      tokenStream = applyStopFilter(stopSet);
+    }
+    tokenStream = applyStemmer(stemFilterType);
+    return tokenStream;
+  }
+
+  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){
+    Tokenizer tokenizer = null;
+    switch(tokenizerType){
+    case CLASSIC:
+      tokenizer = new ClassicTokenizer();
+      break;
+
+    case STANDARD:
+    default:
+      tokenizer = new StandardTokenizer();
+    }
+
+    tokenizer.setReader(new StringReader(content));
+
+    tokenStream = tokenizer;
+
+    return tokenStream;
+  }
+
+  private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
+    Tokenizer tokenizer = new StandardTokenizer();
+    tokenizer.setReader(new StringReader(content));
+    tokenStream = new LowerCaseFilter(tokenizer);
+    tokenStream = applyStemmer(stemFilterType);
+    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
+    shingleFilter.setOutputUnigrams(false);
+    tokenStream = (TokenStream)shingleFilter;
+    return tokenStream;
+  }
+
+  private TokenStream applyStopFilter(CharArraySet stopWords) {
+    tokenStream = new StopFilter(tokenStream, stopWords); 
+    return tokenStream;
+  }
+
+  private TokenStream applyStemmer(StemFilterType stemFilterType) {
+    switch(stemFilterType){
+    case ENGLISHMINIMALSTEM_FILTER:
+      tokenStream = new EnglishMinimalStemFilter(tokenStream);
+      break;
+    case PORTERSTEM_FILTER:
+      tokenStream = new PorterStemFilter(tokenStream);
+      break;
+    default:
+      break;
+    }
+
+    return tokenStream; 
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
new file mode 100644
index 0000000..f660977
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Utility package for Lucene functions
+ * (tokenizing and analyzing text for the similarity scoring plugin).
+ */
+package org.apache.nutch.scoring.similarity.util;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/README.txt b/nutch-plugins/subcollection/README.txt
new file mode 100644
index 0000000..6b79d16
--- /dev/null
+++ b/nutch-plugins/subcollection/README.txt
@@ -0,0 +1,10 @@
+For a brief description of this plugin, see
+src/java/org/apache/nutch/collection/package.html
+
+Basically:
+You need to enable this during indexing and during searching
+
+After indexing, you can limit your searches to a certain
+subcollection with the keyword "subcollection", e.g.
+
+"subcollection:nutch hadoop"

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/build.xml b/nutch-plugins/subcollection/build.xml
new file mode 100644
index 0000000..77beac6
--- /dev/null
+++ b/nutch-plugins/subcollection/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build file for the subcollection plugin; the default target is "jar". -->
+<project name="subcollection" default="jar">
+
+  <!-- Imports the build-plugin.xml located one directory up. -->
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/ivy.xml b/nutch-plugins/subcollection/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/subcollection/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<!-- Ivy module descriptor for this plugin. -->
+<ivy-module version="1.0">
+  <!-- The module name is taken from the Ant project name. -->
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <!-- Configurations are not declared here; they come from the included file. -->
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <!-- No dependencies are declared for this module. -->
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/plugin.xml b/nutch-plugins/subcollection/plugin.xml
new file mode 100644
index 0000000..ca2cf2f
--- /dev/null
+++ b/nutch-plugins/subcollection/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Plugin descriptor for the subcollection plugin. -->
+<plugin
+   id="subcollection"
+   name="Subcollection indexing and query filter"
+   version="1.0.0"
+   provider-name="apache.org">
+
+   <!-- The plugin's code lives in subcollection.jar; everything in it is exported. -->
+   <runtime>
+      <library name="subcollection.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!-- Registers SubcollectionIndexingFilter at the IndexingFilter extension point. -->
+   <extension id="org.apache.nutch.indexer.subcollection.indexing"
+              name="Subcollection Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="SubcollectionIndexingFilter"
+                      class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/>
+                      
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/pom.xml b/nutch-plugins/subcollection/pom.xml
new file mode 100644
index 0000000..d8e3a97
--- /dev/null
+++ b/nutch-plugins/subcollection/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<!-- Maven POM for the subcollection plugin module. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <!-- Inherits from the nutch-plugins parent POM one directory up. -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>subcollection</artifactId>
+    <packaging>jar</packaging>
+
+    <name>subcollection</name>
+    <url>http://nutch.apache.org</url>
+
+    <!-- Source files are read as UTF-8. -->
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
new file mode 100644
index 0000000..0dff3f8
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.DomUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
+import org.apache.xerces.dom.DocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Keeps the set of known {@link Subcollection}s in memory, keyed by id. The
+ * subcollections are read from the XML resource named by the
+ * "subcollections.config" property (default {@link #DEFAULT_FILE_NAME}) and
+ * can be written back with {@link #save()}.
+ */
+public class CollectionManager extends Configured {
+
+  public static final String DEFAULT_FILE_NAME = "subcollections.xml";
+
+  static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class);
+
+  transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
+
+  /** Location of the configuration resource; null if it could not be resolved. */
+  transient URL configfile;
+
+  /**
+   * Creates a manager and loads the configured subcollections.
+   * 
+   * @param conf
+   *          configuration used to locate the subcollections resource
+   */
+  public CollectionManager(Configuration conf) {
+    super(conf);
+    init();
+  }
+
+  /**
+   * Used for testing. Does not load any subcollections.
+   */
+  protected CollectionManager() {
+    super(NutchConfiguration.create());
+  }
+
+  /**
+   * Resolves the configuration resource and parses the subcollections from
+   * it. Failures are logged and otherwise ignored, leaving the manager with
+   * whatever was loaded so far.
+   */
+  protected void init() {
+    InputStream input = null;
+    try {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("initializing CollectionManager");
+      }
+      // initialize known subcollections
+      final String resource = getConf().get("subcollections.config",
+          DEFAULT_FILE_NAME);
+      configfile = getConf().getResource(resource);
+
+      input = getConf().getConfResourceAsInputStream(resource);
+      parse(input);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        // pass the exception as well so the stack trace is not lost
+        LOG.warn("Error occured:" + e, e);
+      }
+    } finally {
+      // the stream is owned by this method, so release it here
+      if (input != null) {
+        try {
+          input.close();
+        } catch (IOException ignored) {
+          // nothing useful can be done if closing a read-only stream fails
+        }
+      }
+    }
+  }
+
+  /**
+   * Reads every subcollection element from the given XML stream and registers
+   * it in the collection map.
+   * 
+   * @param input
+   *          stream containing the subcollections XML
+   */
+  protected void parse(InputStream input) {
+    Element collections = DomUtil.getDom(input);
+
+    if (collections != null) {
+      NodeList nodeList = collections
+          .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+      if (LOG.isInfoEnabled()) {
+        LOG.info("file has " + nodeList.getLength() + " elements");
+      }
+
+      for (int i = 0; i < nodeList.getLength(); i++) {
+        Element scElem = (Element) nodeList.item(i);
+        Subcollection subCol = new Subcollection(getConf());
+        subCol.initialize(scElem);
+        collectionMap.put(subCol.name, subCol);
+      }
+    } else if (LOG.isInfoEnabled()) {
+      LOG.info("Cannot find collections");
+    }
+  }
+
+  /**
+   * Returns the manager cached for the given configuration, creating and
+   * caching it on first use.
+   * 
+   * @param conf
+   *          configuration whose {@link ObjectCache} holds the manager
+   * @return the cached or newly created manager
+   * @throws RuntimeException
+   *           if the manager cannot be created
+   */
+  public static CollectionManager getCollectionManager(Configuration conf) {
+    String key = "collectionmanager";
+    ObjectCache objectCache = ObjectCache.get(conf);
+    CollectionManager impl = (CollectionManager) objectCache.getObject(key);
+    if (impl == null) {
+      try {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Instantiating CollectionManager");
+        }
+        impl = new CollectionManager(conf);
+        objectCache.setObject(key, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create CollectionManager", e);
+      }
+    }
+    return impl;
+  }
+
+  /**
+   * Returns named subcollection
+   * 
+   * @param id
+   *          Id of SubCollection to look up
+   * @return Named SubCollection (or null if not existing)
+   */
+  public Subcollection getSubColection(final String id) {
+    return collectionMap.get(id);
+  }
+
+  /**
+   * Delete named subcollection. Does nothing if no such subcollection exists.
+   * 
+   * @param id
+   *          Id of SubCollection to delete
+   */
+  public void deleteSubCollection(final String id) throws IOException {
+    final Subcollection subCol = getSubColection(id);
+    if (subCol != null) {
+      collectionMap.remove(id);
+    }
+  }
+
+  /**
+   * Create a new subcollection.
+   * 
+   * @param id
+   *          Id of SubCollection to create
+   * @param name
+   *          Name of SubCollection to create
+   * @return Created SubCollection or null if already existed
+   */
+  public Subcollection createSubCollection(final String id, final String name) {
+    Subcollection subCol = null;
+
+    if (!collectionMap.containsKey(id)) {
+      subCol = new Subcollection(id, name, getConf());
+      collectionMap.put(id, subCol);
+    }
+
+    return subCol;
+  }
+
+  /**
+   * Return the collections the url is part of
+   * 
+   * @param url
+   *          The url to test against Collections
+   * @return Subcollections whose filter accepts the url
+   */
+  public List<Subcollection> getSubCollections(final String url) {
+    List<Subcollection> collections = new ArrayList<Subcollection>();
+
+    for (final Subcollection subCol : collectionMap.values()) {
+      // a non-null filter result means the url belongs to this subcollection
+      if (subCol.filter(url) != null) {
+        collections.add(subCol);
+      }
+    }
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
+    }
+
+    return collections;
+  }
+
+  /**
+   * Returns all collections
+   * 
+   * @return All collections CollectionManager knows about
+   */
+  public Collection getAll() {
+    return collectionMap.values();
+  }
+
+  /**
+   * Save collections into the file they were configured from
+   * 
+   * @throws IOException
+   *           if the configuration file location is unknown or the file
+   *           cannot be opened or written
+   */
+  public void save() throws IOException {
+    if (configfile == null) {
+      throw new IOException(
+          "Cannot save subcollections: configuration file location is unknown");
+    }
+
+    // Build the whole document before opening (and thereby truncating) the file.
+    final Document doc = new DocumentImpl();
+    final Element collections = doc
+        .createElement(Subcollection.TAG_COLLECTIONS);
+
+    for (final Subcollection subCol : collectionMap.values()) {
+      final Element collection = doc
+          .createElement(Subcollection.TAG_COLLECTION);
+      collections.appendChild(collection);
+      collection.appendChild(createTextElement(doc, Subcollection.TAG_NAME,
+          subCol.getName()));
+      collection.appendChild(createTextElement(doc,
+          Subcollection.TAG_WHITELIST, subCol.getWhiteListString()));
+      collection.appendChild(createTextElement(doc,
+          Subcollection.TAG_BLACKLIST, subCol.getBlackListString()));
+    }
+
+    FileOutputStream fos = null;
+    try {
+      fos = new FileOutputStream(new File(configfile.getFile()));
+      DomUtil.saveDom(fos, collections);
+      fos.flush();
+    } catch (FileNotFoundException e) {
+      // keep the original exception as the cause
+      throw new IOException(e.toString(), e);
+    } finally {
+      if (fos != null) {
+        fos.close();
+      }
+    }
+  }
+
+  /**
+   * Creates an element whose content is the given text. The text is set with
+   * setTextContent because setNodeValue has no effect on element nodes.
+   */
+  private static Element createTextElement(final Document doc,
+      final String tag, final String text) {
+    final Element element = doc.createElement(tag);
+    element.setTextContent(text);
+    return element;
+  }
+}