You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:20 UTC
[04/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/plugin.xml b/nutch-plugins/scoring-link/plugin.xml
new file mode 100644
index 0000000..2b1c1e1
--- /dev/null
+++ b/nutch-plugins/scoring-link/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="scoring-link"
+ name="Link Analysis Scoring Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="scoring-link.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.scoring.link"
+ name="LinkAnalysisScoring"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter"
+ class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" />
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/pom.xml b/nutch-plugins/scoring-link/pom.xml
new file mode 100644
index 0000000..3c7041e
--- /dev/null
+++ b/nutch-plugins/scoring-link/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>scoring-link</artifactId>
+ <packaging>jar</packaging>
+
+ <name>scoring-link</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
new file mode 100644
index 0000000..a143f46
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.link;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+public class LinkAnalysisScoringFilter implements ScoringFilter {
+
+ private Configuration conf;
+ private float normalizedScore = 1.00f;
+
+ public LinkAnalysisScoringFilter() {
+
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
+ }
+
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
+ return adjust;
+ }
+
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+ throws ScoringFilterException {
+ return datum.getScore() * initSort;
+ }
+
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
+ return (normalizedScore * dbDatum.getScore());
+ }
+
+ public void initialScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ datum.setScore(0.0f);
+ }
+
+ public void injectedScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ }
+
+ public void passScoreAfterParsing(Text url, Content content, Parse parse)
+ throws ScoringFilterException {
+ parse.getData().getContentMeta()
+ .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+ }
+
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+ throws ScoringFilterException {
+ content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+ }
+
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
+ // nothing to do
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
new file mode 100644
index 0000000..9dc0c35
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter used in conjunction with
+ * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
+ */
+package org.apache.nutch.scoring.link;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/build.xml b/nutch-plugins/scoring-opic/build.xml
new file mode 100644
index 0000000..137dab4
--- /dev/null
+++ b/nutch-plugins/scoring-opic/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-opic" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/ivy.xml b/nutch-plugins/scoring-opic/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/scoring-opic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/plugin.xml b/nutch-plugins/scoring-opic/plugin.xml
new file mode 100644
index 0000000..3805a31
--- /dev/null
+++ b/nutch-plugins/scoring-opic/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="scoring-opic"
+ name="OPIC Scoring Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="scoring-opic.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.scoring.opic"
+ name="OPICScoring"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter"
+ class="org.apache.nutch.scoring.opic.OPICScoringFilter" />
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/pom.xml b/nutch-plugins/scoring-opic/pom.xml
new file mode 100644
index 0000000..58e0786
--- /dev/null
+++ b/nutch-plugins/scoring-opic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>scoring-opic</artifactId>
+ <packaging>jar</packaging>
+
+ <name>scoring-opic</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
new file mode 100644
index 0000000..e943d06
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.opic;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * This plugin implements a variant of an Online Page Importance Computation
+ * (OPIC) score, described in this paper: <a
+ * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
+ * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
+ * On-Line Page Importance Computation </a>.
+ *
+ * @author Andrzej Bialecki
+ */
+public class OPICScoringFilter implements ScoringFilter {
+
+ private final static Logger LOG = LoggerFactory
+ .getLogger(OPICScoringFilter.class);
+
+ private Configuration conf;
+ private float scoreInjected;
+ private float scorePower;
+ private float internalScoreFactor;
+ private float externalScoreFactor;
+ private boolean countFiltered;
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ scorePower = conf.getFloat("indexer.score.power", 0.5f);
+ internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
+ externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
+ countFiltered = conf.getBoolean("db.score.count.filtered", false);
+ }
+
+ public void injectedScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ }
+
+ /**
+ * Set to 0.0f (unknown value) - inlink contributions will bring it to a
+ * correct level. Newly discovered pages have at least one inlink.
+ */
+ public void initialScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ datum.setScore(0.0f);
+ }
+
+ /** Use {@link CrawlDatum#getScore()}. */
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+ throws ScoringFilterException {
+ return datum.getScore() * initSort;
+ }
+
+ /** Increase the score by a sum of inlinked scores. */
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
+ float adjust = 0.0f;
+ for (int i = 0; i < inlinked.size(); i++) {
+ CrawlDatum linked = inlinked.get(i);
+ adjust += linked.getScore();
+ }
+ if (old == null)
+ old = datum;
+ datum.setScore(old.getScore() + adjust);
+ }
+
+ /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+ content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+ }
+
+ /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
+ public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+ parse.getData().getContentMeta()
+ .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+ }
+
+ /**
+ * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
+ * outlinks and apply.
+ */
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
+ float score = scoreInjected;
+ String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
+ if (scoreString != null) {
+ try {
+ score = Float.parseFloat(scoreString);
+ } catch (Exception e) {
+ LOG.error("Error: ", e);
+ }
+ }
+ int validCount = targets.size();
+ if (countFiltered) {
+ score /= allCount;
+ } else {
+ if (validCount == 0) {
+ // no outlinks to distribute score, so just return adjust
+ return adjust;
+ }
+ score /= validCount;
+ }
+ // internal and external score factor
+ float internalScore = score * internalScoreFactor;
+ float externalScore = score * externalScoreFactor;
+ for (Entry<Text, CrawlDatum> target : targets) {
+ try {
+ String toHost = new URL(target.getKey().toString()).getHost();
+ String fromHost = new URL(fromUrl.toString()).getHost();
+ if (toHost.equalsIgnoreCase(fromHost)) {
+ target.getValue().setScore(internalScore);
+ } else {
+ target.getValue().setScore(externalScore);
+ }
+ } catch (MalformedURLException e) {
+ LOG.error("Error: ", e);
+ target.getValue().setScore(externalScore);
+ }
+ }
+ // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
+ // XXX in the paper, where page "loses" its score if it's distributed to
+ // XXX linked pages...
+ return adjust;
+ }
+
+ /** Dampen the boost value by scorePower. */
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
+ return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
new file mode 100644
index 0000000..26f6cbe
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter implementing a variant of the Online Page Importance Computation
+ * (OPIC) algorithm.
+ */
+package org.apache.nutch.scoring.opic;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build-ivy.xml b/nutch-plugins/scoring-similarity/build-ivy.xml
new file mode 100644
index 0000000..50fbb96
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+ <property name="ivy.install.version" value="2.1.0" />
+ <condition property="ivy.home" value="${env.IVY_HOME}">
+ <isset property="env.IVY_HOME" />
+ </condition>
+ <property name="ivy.home" value="${user.home}/.ant" />
+ <property name="ivy.checksums" value="" />
+ <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+ <target name="download-ivy" unless="offline">
+
+ <mkdir dir="${ivy.jar.dir}"/>
+ <!-- download Ivy from web site so that it can be used even without any special installation -->
+ <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+ dest="${ivy.jar.file}" usetimestamp="true"/>
+ </target>
+
+ <target name="init-ivy" depends="download-ivy">
+ <!-- try to load ivy here from ivy home, in case the user has not already dropped
+ it into ant's lib dir (note that the latter copy will always take precedence).
+ We will not fail as long as local lib dir exists (it may be empty) and
+ ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml"
+ uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+ </target>
+
+ <target name="deps-jar" depends="init-ivy">
+ <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build.xml b/nutch-plugins/scoring-similarity/build.xml
new file mode 100644
index 0000000..66ac8f3
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/ivy.xml b/nutch-plugins/scoring-similarity/ivy.xml
new file mode 100644
index 0000000..be0a1de
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/plugin.xml b/nutch-plugins/scoring-similarity/plugin.xml
new file mode 100644
index 0000000..9639c18
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="scoring-similarity"
+ name="Similarity based Scoring Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="scoring-similarity.jar">
+ <export name="*"/>
+ </library>
+ <library name="lucene-analyzers-common-5.5.0.jar"/>
+ <library name="lucene-core-5.5.0.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.scoring.similarity"
+ name="SimilarityScoring"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="scoring-similarity"
+ class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/pom.xml b/nutch-plugins/scoring-similarity/pom.xml
new file mode 100644
index 0000000..b1f7cb7
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/pom.xml
@@ -0,0 +1,45 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>scoring-similarity</artifactId>
+ <packaging>jar</packaging>
+
+ <name>scoring-similarity</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>5.5.0</version>
+ </dependency>
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
new file mode 100644
index 0000000..f44fabd
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+public interface SimilarityModel {
+
+ public void setConf(Configuration conf);
+
+ public float setURLScoreAfterParsing(Text url, Content content, Parse parse);
+
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount);
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
new file mode 100644
index 0000000..0f905b8
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
+
+public class SimilarityScoringFilter extends AbstractScoringFilter {
+
+ private Configuration conf;
+ private SimilarityModel similarityModel;
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ switch(conf.get("scoring.similarity.model","cosine")){
+ case "cosine":
+ similarityModel = (SimilarityModel) new CosineSimilarity();
+ break;
+ }
+ similarityModel.setConf(conf);
+ }
+
+ @Override
+ public void passScoreAfterParsing(Text url, Content content, Parse parse)
+ throws ScoringFilterException {
+
+ float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
+ parse.getData().getContentMeta()
+ .set(Nutch.SCORE_KEY, score+"");
+ }
+
+ @Override
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
+ similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount);
+ return adjust;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
new file mode 100644
index 0000000..9853b34
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarity implements SimilarityModel{
+
+ private Configuration conf;
+ private final static Logger LOG = LoggerFactory
+ .getLogger(CosineSimilarity.class);
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ @Override
+ public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+ float score = 1;
+
+ try {
+ if(!Model.isModelCreated){
+ Model.createModel(conf);
+ }
+ String metatags = parse.getData().getParseMeta().get("metatag.keyword");
+ String metaDescription = parse.getData().getParseMeta().get("metatag.description");
+ int[] ngramArr = Model.retrieveNgrams(conf);
+ int mingram = ngramArr[0];
+ int maxgram = ngramArr[1];
+ DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
+ if(docVector!=null){
+ score = Model.computeCosineSimilarity(docVector);
+ LOG.info("Setting score of {} to {}",url, score);
+ }
+ else {
+ throw new Exception("Could not create DocVector from parsed text");
+ }
+ } catch (Exception e) {
+ LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
+ }
+ return score;
+ }
+
+ @Override
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+ Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+ int allCount) {
+ float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+ for (Entry<Text, CrawlDatum> target : targets) {
+ target.getValue().setScore(score);
+ }
+ return adjust;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
new file mode 100644
index 0000000..33c3a23
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DocVector {
+
+ public HashMap<Integer, Long> termVector;
+ public HashMap<String, Integer> termFreqVector;
+
+ public DocVector() {
+ termFreqVector = new HashMap<>();
+ }
+
+ public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
+ this.termFreqVector = termFreqVector;
+ }
+
+ public void setVectorEntry(int pos, long freq) {
+ termVector.put(pos, freq);
+ }
+
+ public float dotProduct(DocVector docVector) {
+ float product = 0;
+ for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+ if(docVector.termFreqVector.containsKey(entry.getKey())) {
+ product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
+ }
+ }
+ return product;
+ }
+
+ public float getL2Norm() {
+ float sum = 0;
+ for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+ sum += entry.getValue()*entry.getValue();
+ }
+ return (float) Math.sqrt(sum);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
new file mode 100644
index 0000000..d8180f2
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class creates a model used to store Document vector representation of the corpus.
+ *
+ */
+public class Model {
+
+ //Currently only one file, but in future could accept a corpus hence an ArrayList
+ public static ArrayList<DocVector> docVectors = new ArrayList<>();
+ private static final Logger LOG = LoggerFactory.getLogger(Model.class);
+ public static boolean isModelCreated = false;
+ private static List<String> stopWords;
+
+ public static synchronized void createModel(Configuration conf) throws IOException {
+ if(isModelCreated) {
+ LOG.info("Model exists, skipping model creation");
+ return;
+ }
+ LOG.info("Creating Cosine model");
+ try {
+ //If user has specified a stopword file other than the template
+ if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
+ stopWords = new ArrayList<String>();
+ String stopWord;
+ BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
+ while ((stopWord = br.readLine()) != null) {
+ stopWords.add(stopWord);
+ }
+ LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
+ }
+
+ int[] ngramArr = retrieveNgrams(conf);
+ int mingram = ngramArr[0];
+ int maxgram = ngramArr[1];
+ LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
+
+ // TODO : Allow for corpus of documents to be provided as gold standard.
+ String line;
+ StringBuilder sb = new StringBuilder();
+ BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ }
+ DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
+ if(goldStandard!=null)
+ docVectors.add(goldStandard);
+ else {
+ throw new Exception("Could not create DocVector for goldstandard");
+ }
+ } catch (Exception e) {
+ LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"),
+ StringUtils.stringifyException(e));
+ }
+ if(docVectors.size()>0) {
+ LOG.info("Cosine model creation complete");
+ isModelCreated = true;
+ }
+ else
+ LOG.info("Cosine model creation failed");
+ }
+
+ /**
+ * Used to create a DocVector from given String text. Used during the parse stage of the crawl
+ * cycle to create a DocVector of the currently parsed page from the parseText attribute value
+ * @param content The text to tokenize
+ * @param mingram Value of mingram for tokenizing
+ * @param maxgram Value of maxgram for tokenizing
+ */
+ public static DocVector createDocVector(String content, int mingram, int maxgram) {
+ LuceneTokenizer tokenizer;
+
+ if(mingram > 1 && maxgram > 1){
+ LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+ } else if (mingram > 1) {
+ maxgram = mingram;
+ LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+ }
+ else if(stopWords!=null) {
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
+ StemFilterType.PORTERSTEM_FILTER);
+ }
+ else {
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
+ StemFilterType.PORTERSTEM_FILTER);
+ }
+ TokenStream tStream = tokenizer.getTokenStream();
+ HashMap<String, Integer> termVector = new HashMap<>();
+ try {
+ CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
+ tStream.reset();
+ while(tStream.incrementToken()) {
+ String term = charTermAttribute.toString();
+ LOG.debug(term);
+ if(termVector.containsKey(term)) {
+ int count = termVector.get(term);
+ count++;
+ termVector.put(term, count);
+ }
+ else {
+ termVector.put(term, 1);
+ }
+ }
+ DocVector docVector = new DocVector();
+ docVector.setTermFreqVector(termVector);
+ return docVector;
+ } catch (IOException e) {
+ LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
+ }
+ return null;
+ }
+
+ public static float computeCosineSimilarity(DocVector docVector) {
+ float scores[] = new float[docVectors.size()];
+ int i=0;
+ float maxScore = 0;
+ for(DocVector corpusDoc : docVectors) {
+ float numerator = docVector.dotProduct(corpusDoc);
+ float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
+ float currentScore = numerator/denominator;
+ scores[i++] = currentScore;
+ maxScore = (currentScore>maxScore)? currentScore : maxScore;
+ }
+ // Returning the max score amongst all documents in the corpus
+ return maxScore;
+ }
+
+ /**
+ * Retrieves mingram and maxgram from configuration
+ * @param conf Configuration to retrieve mingram and maxgram
+ * @return ngram array as mingram at first index and maxgram at second index
+ */
+ public static int[] retrieveNgrams(Configuration conf){
+ int[] ngramArr = new int[2];
+ //Check if user has specified mingram or ngram for ngram cosine model
+ String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
+ //mingram
+ ngramArr[0] = Integer.parseInt(ngramStr[0]);
+ int maxgram;
+ if (ngramStr.length > 1) {
+ //maxgram
+ ngramArr[1] = Integer.parseInt(ngramStr[1]);
+ } else {
+ //maxgram
+ ngramArr[1] = ngramArr[0];
+ }
+ return ngramArr;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
new file mode 100644
index 0000000..70ae4ab
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
@@ -0,0 +1,7 @@
+/**
+ *
+ */
+/** Implements the cosine similarity metric for scoring relevant documents
+ *
+ */
+package org.apache.nutch.scoring.similarity.cosine;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
new file mode 100644
index 0000000..4b519bc
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Creates a custom analyzer based on user provided inputs
+ *
+ */
+public class LuceneAnalyzerUtil extends Analyzer{
+
+ public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE }
+
+ private static StemFilterType stemFilterType;
+ private static CharArraySet stopSet;
+
+
+ /**
+ * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true
+ */
+ public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
+ LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+ if(useStopFilter) {
+ stopSet = StandardAnalyzer.STOP_WORDS_SET;
+ }
+ else {
+ stopSet = null;
+ }
+ }
+
+ /**
+ * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then
+ * user provided stop words will be added to the Lucene default stopset.
+ */
+ public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
+ LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+ if(addToDefault) {
+ stopSet.addAll(stopWords);
+ }
+ else {
+ stopSet = StopFilter.makeStopSet(stopWords);
+ }
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer source = new ClassicTokenizer();
+ TokenStream filter = new LowerCaseFilter(source);
+ if(stopSet != null) {
+ filter = new StopFilter(filter, stopSet);
+ }
+
+ switch(stemFilterType){
+ case PORTERSTEM_FILTER:
+ filter = new PorterStemFilter(filter);
+ break;
+ case ENGLISHMINIMALSTEM_FILTER:
+ filter = new EnglishMinimalStemFilter(filter);
+ break;
+ default:
+ break;
+ }
+ return new TokenStreamComponents(source, filter);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
new file mode 100644
index 0000000..acb987c
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+
+public class LuceneTokenizer {
+
+ private TokenStream tokenStream;
+ private TokenizerType tokenizer;
+ private StemFilterType stemFilterType;
+ private CharArraySet stopSet = null;
+
+ public static enum TokenizerType {CLASSIC, STANDARD}
+
+ /**
+ * Creates a tokenizer based on param values
+ * @param content - The text to tokenize
+ * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
+ * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset
+ * @param stemFilterType - Type of stemming to perform
+ */
+ public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+ this.tokenizer = tokenizer;
+ this.stemFilterType = stemFilterType;
+ if(useStopFilter) {
+ stopSet = StandardAnalyzer.STOP_WORDS_SET;
+ }
+ tokenStream = createTokenStream(content);
+ }
+
+ /**
+ * Creates a tokenizer based on param values
+ * @param content - The text to tokenize
+ * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
+ * @param stopSet - Provide a set of user defined stop words
+ * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
+ * If false, then only the user provided words will be used as the stop set
+ * @param stemFilterType
+ */
+ public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+ this.tokenizer = tokenizer;
+ this.stemFilterType = stemFilterType;
+ if(addToDefault) {
+ CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+ for(String word : stopWords){
+ stopSet.add(word);
+ }
+ this.stopSet = stopSet;
+ }
+ else {
+ stopSet = new CharArraySet(stopWords, true);
+ }
+ tokenStream = createTokenStream(content);
+ }
+
+ /**
+ * Returns the tokenStream created by the Tokenizer
+ * @return
+ */
+ public TokenStream getTokenStream() {
+ return tokenStream;
+ }
+
+ /**
+ * Creates a tokenizer for the ngram model based on param values
+ * @param content - The text to tokenize
+ * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
+ * @param stemFilterType - Type of stemming to perform
+ * @param mingram - Value of mingram for tokenizing
+ * @param maxgram - Value of maxgram for tokenizing
+ */
+ public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
+ this.tokenizer = tokenizer;
+ this.stemFilterType = stemFilterType;
+ tokenStream = createNGramTokenStream(content, mingram, maxgram);
+ }
+
+ private TokenStream createTokenStream(String content) {
+ tokenStream = generateTokenStreamFromText(content, tokenizer);
+ tokenStream = new LowerCaseFilter(tokenStream);
+ if(stopSet != null) {
+ tokenStream = applyStopFilter(stopSet);
+ }
+ tokenStream = applyStemmer(stemFilterType);
+ return tokenStream;
+ }
+
+ private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){
+ Tokenizer tokenizer = null;
+ switch(tokenizerType){
+ case CLASSIC:
+ tokenizer = new ClassicTokenizer();
+ break;
+
+ case STANDARD:
+ default:
+ tokenizer = new StandardTokenizer();
+ }
+
+ tokenizer.setReader(new StringReader(content));
+
+ tokenStream = tokenizer;
+
+ return tokenStream;
+ }
+
+ private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
+ Tokenizer tokenizer = new StandardTokenizer();
+ tokenizer.setReader(new StringReader(content));
+ tokenStream = new LowerCaseFilter(tokenizer);
+ tokenStream = applyStemmer(stemFilterType);
+ ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
+ shingleFilter.setOutputUnigrams(false);
+ tokenStream = (TokenStream)shingleFilter;
+ return tokenStream;
+ }
+
+ private TokenStream applyStopFilter(CharArraySet stopWords) {
+ tokenStream = new StopFilter(tokenStream, stopWords);
+ return tokenStream;
+ }
+
+ private TokenStream applyStemmer(StemFilterType stemFilterType) {
+ switch(stemFilterType){
+ case ENGLISHMINIMALSTEM_FILTER:
+ tokenStream = new EnglishMinimalStemFilter(tokenStream);
+ break;
+ case PORTERSTEM_FILTER:
+ tokenStream = new PorterStemFilter(tokenStream);
+ break;
+ default:
+ break;
+ }
+
+ return tokenStream;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
new file mode 100644
index 0000000..f660977
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ *
+ */
+/**
+ * Utility package for Lucene functions
+ *
+ */
+package org.apache.nutch.scoring.similarity.util;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/README.txt b/nutch-plugins/subcollection/README.txt
new file mode 100644
index 0000000..6b79d16
--- /dev/null
+++ b/nutch-plugins/subcollection/README.txt
@@ -0,0 +1,10 @@
+For brief description about this plugin see
+src/java/org/apache/nutch/collection/package.html
+
+Basically:
+You need to enable this during indexing and during searching
+
+After indexing you can limit your searches to certain
+subcollection with keyword subcollection, eg.
+
+"subcollection:nutch hadoop"
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/build.xml b/nutch-plugins/subcollection/build.xml
new file mode 100644
index 0000000..77beac6
--- /dev/null
+++ b/nutch-plugins/subcollection/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="subcollection" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/ivy.xml b/nutch-plugins/subcollection/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/subcollection/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/plugin.xml b/nutch-plugins/subcollection/plugin.xml
new file mode 100644
index 0000000..ca2cf2f
--- /dev/null
+++ b/nutch-plugins/subcollection/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="subcollection"
+ name="Subcollection indexing and query filter"
+ version="1.0.0"
+ provider-name="apache.org">
+
+ <runtime>
+ <library name="subcollection.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.subcollection.indexing"
+ name="Subcollection Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="SubcollectionIndexingFilter"
+ class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/>
+
+ </extension>
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/pom.xml b/nutch-plugins/subcollection/pom.xml
new file mode 100644
index 0000000..d8e3a97
--- /dev/null
+++ b/nutch-plugins/subcollection/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>subcollection</artifactId>
+ <packaging>jar</packaging>
+
+ <name>subcollection</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
new file mode 100644
index 0000000..0dff3f8
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.DomUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
+import org.apache.xerces.dom.DocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+public class CollectionManager extends Configured {
+
+ public static final String DEFAULT_FILE_NAME = "subcollections.xml";
+
+ static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class);
+
+ transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
+
+ transient URL configfile;
+
+ public CollectionManager(Configuration conf) {
+ super(conf);
+ init();
+ }
+
+ /**
+ * Used for testing
+ */
+ protected CollectionManager() {
+ super(NutchConfiguration.create());
+ }
+
+ protected void init() {
+ try {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("initializing CollectionManager");
+ }
+ // initialize known subcollections
+ configfile = getConf().getResource(
+ getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+
+ InputStream input = getConf().getConfResourceAsInputStream(
+ getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+ parse(input);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Error occured:" + e);
+ }
+ }
+ }
+
+ protected void parse(InputStream input) {
+ Element collections = DomUtil.getDom(input);
+
+ if (collections != null) {
+ NodeList nodeList = collections
+ .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info("file has " + nodeList.getLength() + " elements");
+ }
+
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Element scElem = (Element) nodeList.item(i);
+ Subcollection subCol = new Subcollection(getConf());
+ subCol.initialize(scElem);
+ collectionMap.put(subCol.name, subCol);
+ }
+ } else if (LOG.isInfoEnabled()) {
+ LOG.info("Cannot find collections");
+ }
+ }
+
+ public static CollectionManager getCollectionManager(Configuration conf) {
+ String key = "collectionmanager";
+ ObjectCache objectCache = ObjectCache.get(conf);
+ CollectionManager impl = (CollectionManager) objectCache.getObject(key);
+ if (impl == null) {
+ try {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Instantiating CollectionManager");
+ }
+ impl = new CollectionManager(conf);
+ objectCache.setObject(key, impl);
+ } catch (Exception e) {
+ throw new RuntimeException("Couldn't create CollectionManager", e);
+ }
+ }
+ return impl;
+ }
+
+ /**
+ * Returns named subcollection
+ *
+ * @param id
+ * @return Named SubCollection (or null if not existing)
+ */
+ public Subcollection getSubColection(final String id) {
+ return (Subcollection) collectionMap.get(id);
+ }
+
+ /**
+ * Delete named subcollection
+ *
+ * @param id
+ * Id of SubCollection to delete
+ */
+ public void deleteSubCollection(final String id) throws IOException {
+ final Subcollection subCol = getSubColection(id);
+ if (subCol != null) {
+ collectionMap.remove(id);
+ }
+ }
+
+ /**
+ * Create a new subcollection.
+ *
+ * @param name
+ * Name of SubCollection to create
+ * @return Created SubCollection or null if allready existed
+ */
+ public Subcollection createSubCollection(final String id, final String name) {
+ Subcollection subCol = null;
+
+ if (!collectionMap.containsKey(id)) {
+ subCol = new Subcollection(id, name, getConf());
+ collectionMap.put(id, subCol);
+ }
+
+ return subCol;
+ }
+
+ /**
+ * Return names of collections url is part of
+ *
+ * @param url
+ * The url to test against Collections
+ * @return Subcollections
+ */
+ public List<Subcollection> getSubCollections(final String url) {
+ List<Subcollection> collections = new ArrayList<Subcollection>();
+ final Iterator iterator = collectionMap.values().iterator();
+
+ while (iterator.hasNext()) {
+ final Subcollection subCol = (Subcollection) iterator.next();
+ if (subCol.filter(url) != null) {
+ collections.add(subCol);
+ }
+ }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
+ }
+
+ return collections;
+ }
+
+ /**
+ * Returns all collections
+ *
+ * @return All collections CollectionManager knows about
+ */
+ public Collection getAll() {
+ return collectionMap.values();
+ }
+
+ /**
+ * Save collections into file
+ *
+ * @throws Exception
+ */
+ public void save() throws IOException {
+ try {
+ final FileOutputStream fos = new FileOutputStream(new File(
+ configfile.getFile()));
+ final Document doc = new DocumentImpl();
+ final Element collections = doc
+ .createElement(Subcollection.TAG_COLLECTIONS);
+ final Iterator iterator = collectionMap.values().iterator();
+
+ while (iterator.hasNext()) {
+ final Subcollection subCol = (Subcollection) iterator.next();
+ final Element collection = doc
+ .createElement(Subcollection.TAG_COLLECTION);
+ collections.appendChild(collection);
+ final Element name = doc.createElement(Subcollection.TAG_NAME);
+ name.setNodeValue(subCol.getName());
+ collection.appendChild(name);
+ final Element whiteList = doc
+ .createElement(Subcollection.TAG_WHITELIST);
+ whiteList.setNodeValue(subCol.getWhiteListString());
+ collection.appendChild(whiteList);
+ final Element blackList = doc
+ .createElement(Subcollection.TAG_BLACKLIST);
+ blackList.setNodeValue(subCol.getBlackListString());
+ collection.appendChild(blackList);
+ }
+
+ DomUtil.saveDom(fos, collections);
+ fos.flush();
+ fos.close();
+ } catch (FileNotFoundException e) {
+ throw new IOException(e.toString());
+ }
+ }
+}