You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/06/19 05:15:16 UTC
svn commit: r1686351 - in /nutch/trunk: ./ src/plugin/
src/plugin/scoring-similarity/ src/plugin/scoring-similarity/src/
src/plugin/scoring-similarity/src/java/
src/plugin/scoring-similarity/src/java/org/
src/plugin/scoring-similarity/src/java/org/apac...
Author: mattmann
Date: Fri Jun 19 03:15:16 2015
New Revision: 1686351
URL: http://svn.apache.org/r1686351
Log:
Fix for NUTCH-2039: Relevance-based scoring filter, contributed by Sujen Shah <su...@gmail.com>. This closes #30.
Added:
nutch/trunk/src/plugin/scoring-similarity/
nutch/trunk/src/plugin/scoring-similarity/build.xml
nutch/trunk/src/plugin/scoring-similarity/ivy.xml
nutch/trunk/src/plugin/scoring-similarity/plugin.xml
nutch/trunk/src/plugin/scoring-similarity/src/
nutch/trunk/src/plugin/scoring-similarity/src/java/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/default.properties
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 19 03:15:16 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2039 Relevance based scoring filter (Sujen Shah via mattmann)
+
* NUTCH-2037 Job endpoint to support Indexing from the REST API (Sujen Shah via mattmann)
* NUTCH-2017 Remove debug log from MimeUtil (snagel)
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jun 19 03:15:16 2015
@@ -200,6 +200,7 @@
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
+ <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
@@ -610,6 +611,7 @@
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
+ <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
@@ -1019,6 +1021,7 @@
<source path="${plugins.dir}/protocol-selenium/src/java"/>
<source path="${plugins.dir}/protocol-selenium/src/test"/>
<source path="${plugins.dir}/scoring-depth/src/java/" />
+ <source path="${plugins.dir}/scoring-similarity/src/java/" />
<source path="${plugins.dir}/scoring-link/src/java/" />
<source path="${plugins.dir}/scoring-opic/src/java/" />
<source path="${plugins.dir}/subcollection/src/java/" />
Modified: nutch/trunk/default.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Fri Jun 19 03:15:16 2015
@@ -87,7 +87,8 @@ plugins.protocol=\
org.apache.nutch.protocol.file*:\
org.apache.nutch.protocol.ftp*:\
org.apache.nutch.protocol.http*:\
- org.apache.nutch.protocol.httpclient*
+ org.apache.nutch.protocol.httpclient*:\
+ org.apache.nutch.protocol.selenium*
#
# URL Filter Plugins
@@ -118,6 +119,7 @@ plugins.scoring=\
org.apache.nutch.scoring.depth*:\
org.apache.nutch.scoring.link*:\
org.apache.nutch.scoring.opic*:\
+ org.apache.nutch.scoring.similarity*:\
org.apache.nutch.scoring.tld*:\
org.apache.nutch.scoring.urlmeta*
Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Fri Jun 19 03:15:16 2015
@@ -62,6 +62,7 @@
<ant dir="scoring-depth" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="scoring-link" target="deploy"/>
+ <ant dir="scoring-similarity" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
@@ -165,6 +166,7 @@
<ant dir="scoring-depth" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
<ant dir="scoring-link" target="clean"/>
+ <ant dir="scoring-similarity" target="clean"/>
<ant dir="subcollection" target="clean"/>
<ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
Added: nutch/trunk/src/plugin/scoring-similarity/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/build.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
Added: nutch/trunk/src/plugin/scoring-similarity/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/ivy.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/ivy.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/ivy.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/scoring-similarity/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/plugin.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/plugin.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/plugin.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="scoring-similarity"
+ name="Similarity based Scoring Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="scoring-similarity.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.scoring.similarity"
+ name="SimilarityScoring"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="scoring-similarity"
+ class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
+ </extension>
+
+</plugin>
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.DocumentVector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarityModel implements SimilarityModel{
+
+ private Configuration conf;
+ private String goldStandardDocPath;
+ private static DocumentVector goldStandardDocVect;
+ private final static Logger LOG = LoggerFactory
+ .getLogger(CosineSimilarityModel.class);
+
+ private double calculateCosineSimilarity(String goldStandard, String document2){
+
+ DocumentVector docVect1 = new DocumentVector(goldStandard, conf);
+ DocumentVector docVect2 = new DocumentVector(document2, conf);
+
+ double doc1Dist = getEuclideanDist(docVect1);
+ double doc2Dist = getEuclideanDist(docVect2);
+
+ double dotProduct = getDotProduct(docVect1, docVect2);
+ if(doc1Dist*doc2Dist == 0){
+ return 0.0;
+ }
+ return dotProduct/(doc1Dist*doc2Dist);
+ }
+
+ private double calculateCosineSimilarity(DocumentVector docVect1, DocumentVector docVect2){
+
+ double doc1Dist = getEuclideanDist(docVect1);
+ double doc2Dist = getEuclideanDist(docVect2);
+
+ double dotProduct = getDotProduct(docVect1, docVect2);
+ if(doc1Dist*doc2Dist == 0){
+ return 0.0;
+ }
+ return dotProduct/(doc1Dist*doc2Dist);
+ }
+
+ private double getDotProduct(DocumentVector docVect1, DocumentVector docVect2) {
+ double dotProduct = 0.0;
+ Map<String, Integer> doc2TermFreqVect = docVect2.getTermFreqVect();
+ for(Map.Entry<String, Integer> pair : docVect1.getTermFreqVect().entrySet()){
+ double doc1value = pair.getValue();
+ double doc2value = 0;
+
+ if(doc2TermFreqVect.containsKey(pair.getKey()))
+ doc2value = doc2TermFreqVect.get(pair.getKey());
+
+ dotProduct += doc1value*doc2value;
+ }
+
+ return dotProduct;
+ }
+
+ private double getEuclideanDist(DocumentVector docVect) {
+ float sum = 0f;
+ for(Map.Entry<String, Integer> pair : docVect.getTermFreqVect().entrySet()){
+ sum += pair.getValue() * pair.getValue();
+ }
+ return Math.sqrt(sum);
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ goldStandardDocPath = conf.get("scoring.similarity.model.path");
+ Reader reader = conf.getConfResourceAsReader(goldStandardDocPath);
+ try {
+ String fileContent = IOUtils.toString(reader);
+ if(goldStandardDocVect == null) {
+ goldStandardDocVect = new DocumentVector(fileContent, conf);
+ LOG.info("Creating DocVector from path - {}",goldStandardDocPath);
+ }
+ } catch (IOException e) {
+ LOG.error("Failed to create Document vector : {}",StringUtils.stringifyException(e));
+ e.printStackTrace();
+ }
+ }
+ @Override
+ public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+
+ DocumentVector parseTextDocVect = new DocumentVector(parse.getText(), conf);
+ double parseTextSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseTextDocVect);
+
+ DocumentVector parseMetaKeywordDocVect = new DocumentVector(parse.getData().getParseMeta().get("metatag.keyword"), conf);
+ double metaKeywordSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseMetaKeywordDocVect);
+
+ DocumentVector parseMetaDescDocVect = new DocumentVector(parse.getData().getParseMeta().get("metatag.description"), conf);
+ double metaDescriptionSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseMetaDescDocVect);
+
+ LOG.info("Calculating similarity between gold-standard and {}",url);
+ int count = 0;
+ if(parseTextSimilarity!=0)
+ count++;
+ if(metaDescriptionSimilarity!=0)
+ count++;
+ if(metaKeywordSimilarity!=0)
+ count++;
+ if(count==0)
+ count++;
+
+ float score = (float) ((parseTextSimilarity+metaDescriptionSimilarity + metaKeywordSimilarity)/count);
+ LOG.info("Setting score of {} to {}",url, score);
+ LOG.info("Score break down TextSimilarity : {}, metaKeywordSimilarity : {}, metaDescriptionSimilarity : {}",
+ parseTextSimilarity, metaKeywordSimilarity, metaDescriptionSimilarity);
+ return score;
+ }
+
+ @Override
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) {
+ float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+ for (Entry<Text, CrawlDatum> target : targets) {
+ target.getValue().setScore((float)score);
+ LOG.info("Setting score of {} to {}",target.getKey(), score);
+ }
+ return adjust;
+ }
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.scoring.similarity.cosine.DocumentVector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DocumentVector {
+
+ private Map<String, Integer> termFreqVect;
+ private static Set<String> stopwordSet;
+ private final static Logger LOG = LoggerFactory
+ .getLogger(DocumentVector.class);
+
+ public DocumentVector(String text, Configuration conf){
+
+ termFreqVect = new HashMap<String, Integer>();
+ if(stopwordSet == null){
+ stopwordSet = new HashSet<String>();
+ populateStopWordSet(conf);
+ }
+ createDocVect(text);
+ removeStopWords(conf);
+ }
+
+ private void createDocVect(String text){
+ if(text!=null){
+ String[] tokens = text.replaceAll("[^a-zA-Z0-9 ]", " ").toLowerCase().split("\\s+");
+ for(String token: tokens){
+ if(termFreqVect.containsKey(token)){
+ int count = termFreqVect.get(token)+1;
+ termFreqVect.put(token, count);
+ }
+ else
+ termFreqVect.put(token, 1);
+ }
+ }
+ }
+
+ public Map<String, Integer> getTermFreqVect(){
+ return termFreqVect;
+ }
+
+ private void removeStopWords(Configuration conf){
+ for(String stopWord: stopwordSet){
+ stopWord = stopWord.trim();
+ if(termFreqVect.containsKey(stopWord)){
+ termFreqVect.remove(stopWord);
+ }
+ }
+ }
+
+ private void populateStopWordSet(Configuration conf){
+ String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
+ Reader reader = conf.getConfResourceAsReader(stopWordFilePath);
+ try {
+ LOG.info("Populating stopwords from {}", stopWordFilePath);
+ String[] stopWordList1 = IOUtils.toString(reader).split("\n");
+ for(String stopWord: stopWordList1){
+ stopWord = stopWord.trim();
+ stopwordSet.add(stopWord);
+ }
+ } catch (IOException e) {
+ LOG.error("Failed to populate stopwords : {}", StringUtils.stringifyException(e));
+ e.printStackTrace();
+ }
+ }
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Computes the cosine similarity between two documents represented as
+ * term-frequency {@link DocumentVector}s.
+ */
+public class CosineSimilarity {
+
+  private Configuration conf;
+
+  /**
+   * @param conf configuration handed to every {@link DocumentVector} that is
+   *          built from raw text
+   */
+  public CosineSimilarity(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Cosine similarity of two raw texts; both are converted to
+   * {@link DocumentVector}s first.
+   *
+   * @return the similarity, or 0.0 when either vector has no terms
+   */
+  public double calculateCosineSimilarity(String goldStandard, String document2) {
+    return calculateCosineSimilarity(new DocumentVector(goldStandard, conf),
+        new DocumentVector(document2, conf));
+  }
+
+  /**
+   * Cosine similarity of two term-frequency vectors.
+   *
+   * @return dot product divided by the product of the Euclidean norms, or
+   *         0.0 when either vector has a zero norm
+   */
+  public double calculateCosineSimilarity(DocumentVector docVect1, DocumentVector docVect2) {
+    double normProduct = getEuclideanDist(docVect1) * getEuclideanDist(docVect2);
+    if (normProduct == 0) {
+      // An empty vector has no direction; avoid dividing by zero.
+      return 0.0;
+    }
+    return getDotProduct(docVect1, docVect2) / normProduct;
+  }
+
+  /** Sum over the terms of docVect1 of (count in docVect1 * count in docVect2). */
+  private double getDotProduct(DocumentVector docVect1, DocumentVector docVect2) {
+    double dotProduct = 0.0;
+    Map<String, Integer> doc2TermFreqVect = docVect2.getTermFreqVect();
+    for (Map.Entry<String, Integer> pair : docVect1.getTermFreqVect().entrySet()) {
+      Integer doc2value = doc2TermFreqVect.get(pair.getKey());
+      if (doc2value != null) {
+        dotProduct += pair.getValue().doubleValue() * doc2value.doubleValue();
+      }
+    }
+    return dotProduct;
+  }
+
+  /** Euclidean norm (square root of the sum of squared counts) of a vector. */
+  private double getEuclideanDist(DocumentVector docVect) {
+    // Accumulate in double: squaring as int can overflow and a float
+    // accumulator silently drops small addends once the sum grows large.
+    double sum = 0.0;
+    for (Integer count : docVect.getTermFreqVect().values()) {
+      double value = count.doubleValue();
+      sum += value * value;
+    }
+    return Math.sqrt(sum);
+  }
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Term-frequency vector of a text. The text is reduced to lower-cased ASCII
+ * alphanumeric tokens, each token is counted, and tokens listed in the
+ * stop-word resource named by the <code>scoring.similarity.stopword.file</code>
+ * property are removed. The stop-word resource is read again for every vector
+ * that is constructed.
+ */
+public class DocumentVector {
+
+  Map<String, Integer> termFreqVect;
+
+  /**
+   * Builds the vector for <code>text</code>.
+   *
+   * @param text the text to tokenize; null yields an empty vector
+   * @param conf configuration used to locate the stop-word resource
+   */
+  public DocumentVector(String text, Configuration conf) {
+    termFreqVect = new HashMap<String, Integer>();
+    createDocVect(text);
+    removeStopWords(conf);
+  }
+
+  /** Tokenizes <code>text</code> and counts every non-empty token. */
+  private void createDocVect(String text) {
+    if (text == null) {
+      return;
+    }
+    // Locale.ROOT keeps lower-casing of ASCII letters independent of the
+    // JVM default locale (e.g. the Turkish dotless i).
+    String[] tokens = text.replaceAll("[^a-zA-Z0-9 ]", " ")
+        .toLowerCase(java.util.Locale.ROOT).split("\\s+");
+    for (String token : tokens) {
+      if (token.isEmpty()) {
+        // split() emits "" for empty input or a leading separator; counting
+        // it would make otherwise unrelated documents share a term.
+        continue;
+      }
+      Integer seen = termFreqVect.get(token);
+      termFreqVect.put(token, seen == null ? 1 : seen + 1);
+    }
+  }
+
+  /**
+   * @return the live map from token to its number of occurrences
+   */
+  public Map<String, Integer> getTermFreqVect() {
+    return termFreqVect;
+  }
+
+  /**
+   * Removes from the vector every term listed, one per line, in the
+   * stop-word resource. When the resource is not configured, not found or
+   * unreadable the problem is reported on stderr and the vector is left
+   * unfiltered.
+   */
+  private void removeStopWords(Configuration conf) {
+    String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
+    Reader reader = null;
+    if (stopWordFilePath != null) {
+      reader = conf.getConfResourceAsReader(stopWordFilePath);
+    }
+    if (reader == null) {
+      System.err.println("Stop word resource not available: " + stopWordFilePath);
+      return;
+    }
+    try {
+      for (String stopWord : IOUtils.toString(reader).split("\n")) {
+        termFreqVect.remove(stopWord.trim());
+      }
+    } catch (IOException e) {
+      // Best effort: keep the unfiltered vector, but report the failure.
+      e.printStackTrace();
+    } finally {
+      IOUtils.closeQuietly(reader);
+    }
+  }
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,12 @@
+package org.apache.nutch.scoring.similarity;
+
+/**
+ * Contract for a pluggable model of the scoring-similarity filter.
+ * Implementations provide the single {@link #score()} operation.
+ *
+ * @author Sujen Shah
+ */
+public interface ScoringFilterModel {
+
+  /**
+   * Produces the score of this model.
+   *
+   * @return the score as an integer
+   */
+  int score();
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Strategy used by {@link SimilarityScoringFilter} to compute
+ * similarity-based scores. The filter forwards its configuration and its
+ * parse-time and outlink-scoring callbacks to an implementation of this
+ * interface.
+ */
+public interface SimilarityModel {
+
+  /**
+   * Hands the configuration to the model.
+   *
+   * @param conf the configuration the model reads its settings from
+   */
+  void setConf(Configuration conf);
+
+  /**
+   * Computes the score of a page after it has been parsed.
+   *
+   * @param url URL of the page
+   * @param content fetched content of the page
+   * @param parse parse result of the page
+   * @return the score computed for the page
+   */
+  float setURLScoreAfterParsing(Text url, Content content, Parse parse);
+
+  /**
+   * Assigns scores to the outlinks of a page.
+   *
+   * @param fromUrl URL of the page the outlinks were found on
+   * @param parseData parse data of that page
+   * @param targets outlink URLs with the datums that receive a score
+   * @param adjust datum passed through by the caller
+   * @param allCount count supplied by the caller
+   * @return a datum handed back to the caller
+   */
+  CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount);
+}
Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+
+/**
+ * Scoring filter that delegates score computation to a
+ * {@link SimilarityModel}; a {@link CosineSimilarityModel} is created
+ * whenever the configuration is set.
+ */
+public class SimilarityScoringFilter extends AbstractScoringFilter {
+
+  private Configuration conf;
+  private SimilarityModel similarityModel;
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and creates and configures the similarity model.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    similarityModel = new CosineSimilarityModel();
+    similarityModel.setConf(conf);
+  }
+
+  /**
+   * Asks the model for the score of the parsed page and records it, as a
+   * string, under {@link Nutch#SCORE_KEY} in the content metadata of the
+   * parse data.
+   */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
+    parse.getData().getContentMeta().set(Nutch.SCORE_KEY, Float.toString(score));
+  }
+
+  /**
+   * Lets the model score the outlinks.
+   *
+   * @return the datum returned by the model, so that a model which replaces
+   *         <code>adjust</code> is honoured instead of silently ignored
+   */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return similarityModel.distributeScoreToOutlinks(fromUrl, parseData,
+        targets, adjust, allCount);
+  }
+}