You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@nutch.apache.org by su...@apache.org on 2016/04/04 08:47:11 UTC

[1/2] nutch git commit: Fix for NUTCH-2245 contributed by Bhavya Sanghavi

Repository: nutch
Updated Branches:
  refs/heads/master a9b2491a3 -> b62f43fda


Fix for NUTCH-2245 contributed by Bhavya Sanghavi


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/2c426808
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/2c426808
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/2c426808

Branch: refs/heads/master
Commit: 2c42680823079faf87705df4d0698dcf8b43ef66
Parents: a9b2491
Author: Bhavya Sanghavi <Bh...@Bhavyas-MacBook-Pro.local>
Authored: Wed Mar 23 22:24:40 2016 -0700
Committer: Sujen Shah <su...@apache.org>
Committed: Sun Apr 3 23:31:37 2016 -0700

----------------------------------------------------------------------
 conf/nutch-default.xml                          |  6 +++
 .../similarity/cosine/CosineSimilarity.java     |  3 +-
 .../nutch/scoring/similarity/cosine/Model.java  | 22 +++++++---
 .../similarity/util/LuceneTokenizer.java        | 44 +++++++++++++++-----
 4 files changed, 59 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 93503f3..fe031d5 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1442,6 +1442,12 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
     </description>
 </property>
 
+ <property>
+  <name>scoring.similarity.ngrams</name>
+  <value>1</value>
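+  <description>Specifies the 'n' in ngrams, i.e. the number of consecutive tokens combined into each term of the cosine similarity model. The default of 1 uses single tokens (unigrams); values greater than 1 enable the ngram model.</description>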
+</property>
+
 <property>
     <name>cosine.goldstandard.file</name>
     <value>goldstandard.txt</value>

http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index d41f5e2..81b1eba 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -53,7 +53,8 @@ public class CosineSimilarity implements SimilarityModel{
       }
       String metatags = parse.getData().getParseMeta().get("metatag.keyword");
       String metaDescription = parse.getData().getParseMeta().get("metatag.description");
-      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags);
+      int ngram = conf.getInt("scoring.similarity.ngrams", 1);
+      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram);
       if(docVector!=null){
         score = Model.computeCosineSimilarity(docVector);
         LOG.info("Setting score of {} to {}",url, score);

http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index ba0006a..371f241 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -68,6 +68,11 @@ public class Model {
         }
         LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
       }
+
+      //Check if user has specified n for ngram cosine model
+      int ngram = conf.getInt("scoring.similarity.ngrams", 1);
+      LOG.info("Value of ngram: {}",ngram);
+
       // TODO : Allow for corpus of documents to be provided as gold standard. 
       String line;
       StringBuilder sb = new StringBuilder();
@@ -75,7 +80,7 @@ public class Model {
       while ((line = br.readLine()) != null) {
         sb.append(line);
       }
-      DocVector goldStandard = createDocVector(sb.toString());
+      DocVector goldStandard = createDocVector(sb.toString(), ngram);
       if(goldStandard!=null)
         docVectors.add(goldStandard);
       else {
@@ -97,15 +102,21 @@ public class Model {
    * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
    * cycle to create a DocVector of the currently parsed page from the parseText attribute value
    * @param content
+   * @param ngram
    */
-  public static DocVector createDocVector(String content) {
+  public static DocVector createDocVector(String content, int ngram) {
     LuceneTokenizer tokenizer;
-    if(stopWords!=null) {
-      tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, stopWords, true, 
+    
+    if(ngram > 1){
+      LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram);
+    }
+    else if(stopWords!=null) {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, 
           StemFilterType.PORTERSTEM_FILTER);
     }
     else {
-      tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true, 
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, 
           StemFilterType.PORTERSTEM_FILTER);
     }
     TokenStream tStream = tokenizer.getTokenStream();
@@ -115,6 +126,7 @@ public class Model {
       tStream.reset();
       while(tStream.incrementToken()) {
         String term = charTermAttribute.toString();
+        LOG.debug(term);
         if(termVector.containsKey(term)) {
           int count = termVector.get(term);
           count++;

http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index 3ce0fee..c95033a 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
 
@@ -36,9 +37,9 @@ public class LuceneTokenizer {
   private TokenizerType tokenizer;
   private StemFilterType stemFilterType;
   private CharArraySet stopSet = null;
-  
+
   public static enum TokenizerType {CLASSIC, STANDARD}
-  
+
   /**
    * Creates a tokenizer based on param values
    * @param content - The text to tokenize
@@ -54,7 +55,7 @@ public class LuceneTokenizer {
     }
     tokenStream = createTokenStream(content);
   }
-  
+
   /**
    * Creates a tokenizer based on param values
    * @param content - The text to tokenize
@@ -79,7 +80,7 @@ public class LuceneTokenizer {
     }
     tokenStream = createTokenStream(content);
   }
-  
+
   /**
    * Returns the tokenStream created by the Tokenizer
    * @return
@@ -88,6 +89,19 @@ public class LuceneTokenizer {
     return tokenStream;
   }
   
+  /**
+   * Creates a tokenizer for the ngram model based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use: CLASSIC or STANDARD
+   * @param stemFilterType - Type of stemming to perform
+   * @param ngram - Value of ngram for tokenizing
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    tokenStream = createNGramTokenStream(content,ngram);
+  }
+  
   private TokenStream createTokenStream(String content) {
     tokenStream = generateTokenStreamFromText(content, tokenizer);
     tokenStream = new LowerCaseFilter(tokenStream);
@@ -97,24 +111,34 @@ public class LuceneTokenizer {
     tokenStream = applyStemmer(stemFilterType);
     return tokenStream;
   }
-  
+
   private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){
     switch(tokenizer){
     case CLASSIC:
       tokenStream = new ClassicTokenizer(new StringReader(content));
       break;
-      
+
     case STANDARD:
       tokenStream = new StandardTokenizer(new StringReader(content));
     }
     return tokenStream;
   }
-  
+
+  private TokenStream createNGramTokenStream(String content, int ngram) {
+    tokenStream = new StandardTokenizer(new StringReader(content));
+    tokenStream = new LowerCaseFilter(tokenStream);
+    tokenStream = applyStemmer(stemFilterType);
+    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram);
+    shingleFilter.setOutputUnigrams(false);
+    tokenStream = (TokenStream)shingleFilter;
+    return tokenStream;
+  }
+
   private TokenStream applyStopFilter(CharArraySet stopWords) {
     tokenStream = new StopFilter(tokenStream, stopWords); 
     return tokenStream;
   }
-  
+
   private TokenStream applyStemmer(StemFilterType stemFilterType) {
     switch(stemFilterType){
     case ENGLISHMINIMALSTEM_FILTER:
@@ -123,8 +147,8 @@ public class LuceneTokenizer {
     case PORTERSTEM_FILTER:
       tokenStream = new PorterStemFilter(tokenStream);
       break;
-     default:
-       break;
+    default:
+      break;
     }
 
     return tokenStream;

[2/2] nutch git commit: Add changes record for NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model contributed by Bhavya Sanghavi this closes #101

Posted by su...@apache.org.

Add changes record for NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model contributed by Bhavya Sanghavi this closes #101


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/b62f43fd
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/b62f43fd
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/b62f43fd

Branch: refs/heads/master
Commit: b62f43fda3648bd7a37da550a180c3da5a1e3986
Parents: 2c42680
Author: Sujen Shah <su...@apache.org>
Authored: Sun Apr 3 23:39:12 2016 -0700
Committer: Sujen Shah <su...@apache.org>
Committed: Sun Apr 3 23:41:15 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/b62f43fd/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index b15b78c..5e061a4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch
 
 Nutch Change Log
 
+* NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen)
+
 * NUTCH-2241 Unstable Selenium plugin in Nutch. Fixed bugs and enhanced configuration (karanjeets via mattmann)
 
 * NUTCH-2213 CommonCrawlDataDumper saves gzipped body in extracted form (jnioche via mattmann)