You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/06/19 05:15:16 UTC

svn commit: r1686351 - in /nutch/trunk: ./ src/plugin/ src/plugin/scoring-similarity/ src/plugin/scoring-similarity/src/ src/plugin/scoring-similarity/src/java/ src/plugin/scoring-similarity/src/java/org/ src/plugin/scoring-similarity/src/java/org/apac...

Author: mattmann
Date: Fri Jun 19 03:15:16 2015
New Revision: 1686351

URL: http://svn.apache.org/r1686351
Log:
Fix for NUTCH-2039: Relevance based scoring filter contributed by Sujen Shah <su...@gmail.com> this closes #30.

Added:
    nutch/trunk/src/plugin/scoring-similarity/
    nutch/trunk/src/plugin/scoring-similarity/build.xml
    nutch/trunk/src/plugin/scoring-similarity/ivy.xml
    nutch/trunk/src/plugin/scoring-similarity/plugin.xml
    nutch/trunk/src/plugin/scoring-similarity/src/
    nutch/trunk/src/plugin/scoring-similarity/src/java/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/default.properties
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 19 03:15:16 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2039 Relevance based scoring filter (Sujen Shah via mattmann)
+
 * NUTCH-2037 Job endpoint to support Indexing from the REST API (Sujen Shah via mattmann)
 
 * NUTCH-2017 Remove debug log from MimeUtil (snagel)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Fri Jun 19 03:15:16 2015
@@ -200,6 +200,7 @@
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
+      <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
       <packageset dir="${plugins.dir}/subcollection/src/java"/>
@@ -610,6 +611,7 @@
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
+      <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
       <packageset dir="${plugins.dir}/subcollection/src/java"/>
@@ -1019,6 +1021,7 @@
         <source path="${plugins.dir}/protocol-selenium/src/java"/>
         <source path="${plugins.dir}/protocol-selenium/src/test"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
+        <source path="${plugins.dir}/scoring-similarity/src/java/" />
         <source path="${plugins.dir}/scoring-link/src/java/" />
         <source path="${plugins.dir}/scoring-opic/src/java/" />
         <source path="${plugins.dir}/subcollection/src/java/" />

Modified: nutch/trunk/default.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Fri Jun 19 03:15:16 2015
@@ -87,7 +87,8 @@ plugins.protocol=\
    org.apache.nutch.protocol.file*:\
    org.apache.nutch.protocol.ftp*:\
    org.apache.nutch.protocol.http*:\
-   org.apache.nutch.protocol.httpclient*
+   org.apache.nutch.protocol.httpclient*:\
+   org.apache.nutch.protocol.selenium*
 
 #
 # URL Filter Plugins
@@ -118,6 +119,7 @@ plugins.scoring=\
    org.apache.nutch.scoring.depth*:\
    org.apache.nutch.scoring.link*:\
    org.apache.nutch.scoring.opic*:\
+   org.apache.nutch.scoring.similarity*:\
    org.apache.nutch.scoring.tld*:\
    org.apache.nutch.scoring.urlmeta*
    

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1686351&r1=1686350&r2=1686351&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Fri Jun 19 03:15:16 2015
@@ -62,6 +62,7 @@
      <ant dir="scoring-depth" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
+     <ant dir="scoring-similarity" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
@@ -165,6 +166,7 @@
     <ant dir="scoring-depth" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
     <ant dir="scoring-link" target="clean"/>
+    <ant dir="scoring-similarity" target="clean"/>
     <ant dir="subcollection" target="clean"/>
     <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>

Added: nutch/trunk/src/plugin/scoring-similarity/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/build.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

Added: nutch/trunk/src/plugin/scoring-similarity/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/ivy.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/ivy.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/ivy.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/scoring-similarity/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/plugin.xml?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/plugin.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/plugin.xml Fri Jun 19 03:15:16 2015
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-similarity"
+   name="Similarity based Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-similarity.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.similarity"
+              name="SimilarityScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="scoring-similarity"
+                      class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/CosineSimilarityModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.DocumentVector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarityModel implements SimilarityModel{
+
+  private Configuration conf;
+  private String goldStandardDocPath;
+  private static DocumentVector goldStandardDocVect;
+  private final static Logger LOG = LoggerFactory
+      .getLogger(CosineSimilarityModel.class);
+  
+  private double calculateCosineSimilarity(String goldStandard, String document2){
+
+    DocumentVector docVect1 = new DocumentVector(goldStandard, conf);
+    DocumentVector docVect2 = new DocumentVector(document2, conf);
+
+    double doc1Dist = getEuclideanDist(docVect1);
+    double doc2Dist = getEuclideanDist(docVect2);
+
+    double dotProduct = getDotProduct(docVect1, docVect2);
+    if(doc1Dist*doc2Dist == 0){
+      return 0.0;
+    }
+    return dotProduct/(doc1Dist*doc2Dist);
+  }
+
+  private double calculateCosineSimilarity(DocumentVector docVect1, DocumentVector docVect2){
+
+    double doc1Dist = getEuclideanDist(docVect1);
+    double doc2Dist = getEuclideanDist(docVect2);
+
+    double dotProduct = getDotProduct(docVect1, docVect2);
+    if(doc1Dist*doc2Dist == 0){
+      return 0.0;
+    }
+    return dotProduct/(doc1Dist*doc2Dist);
+  }
+
+  private double getDotProduct(DocumentVector docVect1, DocumentVector docVect2) {
+    double dotProduct = 0.0;
+    Map<String, Integer> doc2TermFreqVect = docVect2.getTermFreqVect();
+    for(Map.Entry<String, Integer> pair : docVect1.getTermFreqVect().entrySet()){
+      double doc1value = pair.getValue();
+      double doc2value = 0;
+
+      if(doc2TermFreqVect.containsKey(pair.getKey()))
+        doc2value = doc2TermFreqVect.get(pair.getKey());
+
+      dotProduct += doc1value*doc2value;
+    }
+
+    return dotProduct;
+  }
+
+  private double getEuclideanDist(DocumentVector docVect) {
+    float sum = 0f;
+    for(Map.Entry<String, Integer> pair : docVect.getTermFreqVect().entrySet()){
+      sum += pair.getValue() * pair.getValue();
+    }    
+    return Math.sqrt(sum);
+  }
+  
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    goldStandardDocPath = conf.get("scoring.similarity.model.path");
+    Reader reader = conf.getConfResourceAsReader(goldStandardDocPath);
+    try {
+      String fileContent = IOUtils.toString(reader);
+      if(goldStandardDocVect == null) {
+        goldStandardDocVect = new DocumentVector(fileContent, conf);
+        LOG.info("Creating DocVector from path - {}",goldStandardDocPath);
+      }
+    } catch (IOException e) {
+      LOG.error("Failed to create Document vector : {}",StringUtils.stringifyException(e));
+      e.printStackTrace();
+    }    
+  }
+  @Override
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+
+    DocumentVector parseTextDocVect = new DocumentVector(parse.getText(), conf);
+    double parseTextSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseTextDocVect);
+  
+    DocumentVector parseMetaKeywordDocVect = new DocumentVector(parse.getData().getParseMeta().get("metatag.keyword"), conf);
+    double metaKeywordSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseMetaKeywordDocVect);
+    
+    DocumentVector parseMetaDescDocVect = new DocumentVector(parse.getData().getParseMeta().get("metatag.description"), conf);
+    double metaDescriptionSimilarity = calculateCosineSimilarity(goldStandardDocVect, parseMetaDescDocVect);
+    
+    LOG.info("Calculating similarity between gold-standard and {}",url);
+    int count = 0;
+    if(parseTextSimilarity!=0)
+      count++;
+    if(metaDescriptionSimilarity!=0)
+      count++;
+    if(metaKeywordSimilarity!=0)
+      count++;
+    if(count==0)
+      count++;
+
+    float score =  (float) ((parseTextSimilarity+metaDescriptionSimilarity + metaKeywordSimilarity)/count);
+    LOG.info("Setting score of {} to {}",url, score);
+    LOG.info("Score break down TextSimilarity : {}, metaKeywordSimilarity : {}, metaDescriptionSimilarity : {}",
+        parseTextSimilarity, metaKeywordSimilarity, metaDescriptionSimilarity);
+    return score;
+  }
+  
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) {
+    float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+    for (Entry<Text, CrawlDatum> target : targets) {
+      target.getValue().setScore((float)score);
+      LOG.info("Setting score of {} to {}",target.getKey(), score);
+    }
+    return adjust;
+  }
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/Cosine/DocumentVector.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.scoring.similarity.cosine.DocumentVector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Term-frequency vector of a text: lower-cased alphanumeric tokens mapped to
+ * the number of times they occur, with stop words removed.
+ * <p>
+ * The stop words are read once from the resource named by the
+ * <code>scoring.similarity.stopword.file</code> property and kept in a static
+ * set shared by all instances.
+ */
+public class DocumentVector {
+
+  private Map<String, Integer> termFreqVect;
+  private static Set<String> stopwordSet;
+  private final static Logger LOG = LoggerFactory
+      .getLogger(DocumentVector.class);
+
+  /**
+   * Builds the vector for <code>text</code>.
+   *
+   * @param text the text to tokenize; <code>null</code> gives an empty vector
+   * @param conf configuration used to locate the stop-word file
+   */
+  public DocumentVector(String text, Configuration conf){
+
+    termFreqVect = new HashMap<String, Integer>();
+    if(stopwordSet == null){
+      // Assign only the fully populated set so that no caller can observe a
+      // half-filled one.
+      stopwordSet = loadStopWords(conf);
+    }
+    createDocVect(text);
+    removeStopWords();
+  }
+
+  /**
+   * Counts the tokens of <code>text</code>. Every character other than
+   * a-z, A-Z, 0-9 and space is treated as a separator.
+   */
+  private void createDocVect(String text){
+    if(text == null){
+      return;
+    }
+    // Locale.ROOT keeps the lower-casing independent of the JVM's default
+    // locale (e.g. Turkish dotless i).
+    String[] tokens = text.replaceAll("[^a-zA-Z0-9 ]", " ")
+        .toLowerCase(java.util.Locale.ROOT).split("\\s+");
+    for(String token: tokens){
+      // split() yields an empty token for empty text or text that starts with
+      // a separator; it is not a term and must not be counted.
+      if(token.length() == 0){
+        continue;
+      }
+      Integer count = termFreqVect.get(token);
+      termFreqVect.put(token, count == null ? 1 : count + 1);
+    }
+  }
+
+  /**
+   * Returns the term to frequency map backing this vector (not a copy).
+   */
+  public Map<String, Integer> getTermFreqVect(){
+    return termFreqVect;
+  }
+
+  /**
+   * Drops every term that is in the shared stop-word set.
+   */
+  private void removeStopWords(){
+    termFreqVect.keySet().removeAll(stopwordSet);
+  }
+
+  /**
+   * Reads the stop words, one per line, from the configured resource.
+   *
+   * @return the trimmed, non-empty stop words; an empty set when the resource
+   *         is not configured, not found or cannot be read
+   */
+  private static Set<String> loadStopWords(Configuration conf){
+    Set<String> stopWords = new HashSet<String>();
+    String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
+    Reader reader = stopWordFilePath == null ? null
+        : conf.getConfResourceAsReader(stopWordFilePath);
+    if(reader == null){
+      LOG.warn("Stopword file {} not found, no stopwords will be removed", stopWordFilePath);
+      return stopWords;
+    }
+    try {
+      LOG.info("Populating stopwords from {}", stopWordFilePath);
+      for(String stopWord: IOUtils.toString(reader).split("\n")){
+        stopWord = stopWord.trim();
+        if(stopWord.length() > 0){
+          stopWords.add(stopWord);
+        }
+      }
+    } catch (IOException e) {
+      LOG.error("Failed to populate stopwords : {}", StringUtils.stringifyException(e));
+    } finally {
+      IOUtils.closeQuietly(reader);
+    }
+    return stopWords;
+  }
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Computes the cosine similarity between the term-frequency vectors of two
+ * documents.
+ */
+public class CosineSimilarity {
+
+  private final Configuration conf;
+
+  /**
+   * @param conf configuration handed to every {@link DocumentVector} this
+   *          object creates
+   */
+  public CosineSimilarity(Configuration conf){
+    this.conf = conf;
+  }
+
+  /**
+   * Computes the cosine similarity of two raw texts by turning each of them
+   * into a {@link DocumentVector} first.
+   *
+   * @return the cosine similarity, or 0.0 when either vector has zero length
+   */
+  public double calculateCosineSimilarity(String goldStandard, String document2){
+    return calculateCosineSimilarity(new DocumentVector(goldStandard, conf),
+        new DocumentVector(document2, conf));
+  }
+
+  /**
+   * Computes <code>dot(v1, v2) / (|v1| * |v2|)</code>.
+   *
+   * @return the cosine similarity, or 0.0 when either vector has zero length
+   */
+  public double calculateCosineSimilarity(DocumentVector docVect1, DocumentVector docVect2){
+    double normProduct = getEuclideanDist(docVect1) * getEuclideanDist(docVect2);
+    if(normProduct == 0){
+      return 0.0;
+    }
+    return getDotProduct(docVect1, docVect2)/normProduct;
+  }
+
+  /**
+   * Sums the products of the frequencies of the terms both vectors share.
+   */
+  private double getDotProduct(DocumentVector docVect1, DocumentVector docVect2) {
+    double dotProduct = 0.0;
+    Map<String, Integer> doc2TermFreqVect = docVect2.getTermFreqVect();
+    for(Map.Entry<String, Integer> pair : docVect1.getTermFreqVect().entrySet()){
+      Integer doc2value = doc2TermFreqVect.get(pair.getKey());
+      if(doc2value != null){
+        dotProduct += pair.getValue().doubleValue() * doc2value.doubleValue();
+      }
+    }
+    return dotProduct;
+  }
+
+  /**
+   * Returns the Euclidean length of the term-frequency vector.
+   */
+  private double getEuclideanDist(DocumentVector docVect) {
+    // Accumulate in double: a float sum stops being exact above 2^24 and the
+    // int product of two large frequencies could overflow.
+    double sum = 0.0;
+    for(Integer frequency : docVect.getTermFreqVect().values()){
+      double value = frequency.doubleValue();
+      sum += value * value;
+    }
+    return Math.sqrt(sum);
+  }
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Term-frequency vector of a text: lower-cased alphanumeric tokens mapped to
+ * the number of times they occur, with the stop words listed in the resource
+ * named by <code>scoring.similarity.stopword.file</code> removed.
+ */
+public class DocumentVector {
+
+  Map<String, Integer> termFreqVect;
+
+  /**
+   * Builds the vector for <code>text</code>.
+   *
+   * @param text the text to tokenize; <code>null</code> gives an empty vector
+   * @param conf configuration used to locate the stop-word file
+   * @throws IllegalStateException if the stop-word resource is not configured
+   *           or cannot be found
+   */
+  public DocumentVector(String text, Configuration conf){
+
+    termFreqVect = new HashMap<String, Integer>();
+    createDocVect(text);
+    removeStopWords(conf);
+  }
+
+  /**
+   * Counts the tokens of <code>text</code>. Every character other than
+   * a-z, A-Z, 0-9 and space is treated as a separator.
+   */
+  private void createDocVect(String text){
+    if(text == null){
+      return;
+    }
+    // Locale.ROOT keeps the lower-casing independent of the JVM's default
+    // locale (e.g. Turkish dotless i).
+    String[] tokens = text.replaceAll("[^a-zA-Z0-9 ]", " ")
+        .toLowerCase(java.util.Locale.ROOT).split("\\s+");
+    for(String token: tokens){
+      // split() yields an empty token for empty text or text that starts with
+      // a separator; it is not a term and must not be counted.
+      if(token.length() == 0){
+        continue;
+      }
+      Integer count = termFreqVect.get(token);
+      termFreqVect.put(token, count == null ? 1 : count + 1);
+    }
+  }
+
+  /**
+   * Returns the term to frequency map backing this vector (not a copy).
+   */
+  public Map<String, Integer> getTermFreqVect(){
+    return termFreqVect;
+  }
+
+  /**
+   * Reads the stop-word resource, one word per line, and drops each listed
+   * word from the vector.
+   */
+  private void removeStopWords(Configuration conf){
+    String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
+    Reader reader = stopWordFilePath == null ? null
+        : conf.getConfResourceAsReader(stopWordFilePath);
+    if(reader == null){
+      // Previously this surfaced as a bare NullPointerException from
+      // IOUtils.toString(); fail with a message that names the cause.
+      throw new IllegalStateException("Stopword file not found: "
+          + stopWordFilePath + " (property scoring.similarity.stopword.file)");
+    }
+    try {
+      for(String stopWord: IOUtils.toString(reader).split("\n")){
+        termFreqVect.remove(stopWord.trim());
+      }
+    } catch (IOException e) {
+      // Best effort: the vector is kept with whatever stop words were removed
+      // so far. This class has no logger, so the trace goes to stderr.
+      e.printStackTrace();
+    } finally {
+      IOUtils.closeQuietly(reader);
+    }
+  }
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,12 @@
+package org.apache.nutch.scoring.similarity;
+
+/**
+ * Defines the method to be implemented to create a pluggable model for the
+ * scoring-similarity filter.
+ * <p>
+ * NOTE(review): nothing in this commit implements or calls this interface;
+ * the filter itself works against {@link SimilarityModel}. Confirm whether
+ * it is still needed.
+ *
+ * @author Sujen Shah
+ */
+public interface ScoringFilterModel {
+
+  /**
+   * NOTE(review): presumably returns the score computed by the model; the
+   * meaning and range of the value are not specified anywhere visible.
+   *
+   * @return the model's score
+   */
+  public int score();
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Model that {@link SimilarityScoringFilter} delegates its scoring to. The
+ * filter creates the model and calls {@link #setConf(Configuration)} on it
+ * from its own <code>setConf</code>.
+ */
+public interface SimilarityModel {
+
+  /**
+   * Hands the configuration to the model. Called by the filter right after
+   * the model has been created.
+   *
+   * @param conf the configuration the filter was given
+   */
+  public void setConf(Configuration conf);
+  
+  /**
+   * Computes the score of a parsed page. The filter stores the returned value
+   * under <code>Nutch.SCORE_KEY</code> in the content metadata of the parse.
+   *
+   * @param url the URL of the page
+   * @param content the fetched content of the page
+   * @param parse the parse result of the page
+   * @return the score of the page
+   */
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse);
+  
+  /**
+   * Sets the scores of the outlinks of a page. The parameters are passed
+   * through unchanged from the filter's method of the same name; the filter
+   * ignores the value returned here and returns <code>adjust</code> itself.
+   *
+   * @param fromUrl the URL of the page the outlinks were found on
+   * @param parseData the parse data of that page
+   * @param targets the outlink URLs with the datums whose scores are to be set
+   * @param adjust the adjustment datum received by the filter
+   * @param allCount the outlink count received by the filter
+   * @return a datum; the cosine implementation returns <code>adjust</code>
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount);
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java?rev=1686351&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java Fri Jun 19 03:15:16 2015
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+
+/**
+ * Scoring filter that hands the scoring of parsed pages and of their
+ * outlinks over to a {@link SimilarityModel}; the model used is always a
+ * {@link CosineSimilarityModel}.
+ */
+public class SimilarityScoringFilter extends AbstractScoringFilter {
+
+  private Configuration conf;
+  private SimilarityModel similarityModel;
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)}
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Remembers the configuration and sets up a fresh cosine model with it.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.similarityModel = new CosineSimilarityModel();
+    this.similarityModel.setConf(conf);
+  }
+
+  /**
+   * Lets the model score the page and records the result, as a string, under
+   * {@link Nutch#SCORE_KEY} in the content metadata of the parse.
+   */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+
+    final float modelScore = similarityModel.setURLScoreAfterParsing(url, content, parse);
+    final ParseData data = parse.getData();
+    data.getContentMeta().set(Nutch.SCORE_KEY, String.valueOf(modelScore));
+  }
+
+  /**
+   * Lets the model set the outlink scores.
+   *
+   * @return <code>adjust</code>, whatever the model itself returned
+   */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    similarityModel.distributeScoreToOutlinks(
+        fromUrl, parseData, targets, adjust, allCount);
+    return adjust;
+  }
+}