You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/05/20 01:35:20 UTC

nutch git commit: NUTCH-2263 mingram and maxgram support for Unigram Cosine Similarity Model is provided.

Repository: nutch
Updated Branches:
  refs/heads/master 956538984 -> da252eb7b


NUTCH-2263 mingram and maxgram support for Unigram Cosine Similarity Model is provided.


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/da252eb7
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/da252eb7
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/da252eb7

Branch: refs/heads/master
Commit: da252eb7b3d2d7b7021480db3bec1d82e6fa564d
Parents: 9565389
Author: Furkan KAMACI <fu...@gmail.com>
Authored: Thu May 19 04:13:04 2016 +0300
Committer: Furkan KAMACI <fu...@gmail.com>
Committed: Thu May 19 04:13:04 2016 +0300

----------------------------------------------------------------------
 conf/nutch-default.xml                          |  6 ++-
 .../similarity/cosine/CosineSimilarity.java     |  6 ++-
 .../nutch/scoring/similarity/cosine/Model.java  | 50 +++++++++++++++-----
 .../similarity/util/LuceneTokenizer.java        | 11 +++--
 4 files changed, 53 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 641809f..51b3fd9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1444,8 +1444,10 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
 
  <property>
   <name>scoring.similarity.ngrams</name>
-  <value>1</value>
-  <description>Specifies the 'n' in ngrams</description>
+  <value>1,1</value>
+  <description>Specifies the min 'n' and max 'n' for ngrams as a comma-separated pair, e.g. "2,3".
+    If only one value 'n' is specified, it is used as both the min 'n' and the max 'n'.
+  </description>
 </property>
 
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index 81b1eba..9853b34 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -53,8 +53,10 @@ public class CosineSimilarity implements SimilarityModel{
       }
       String metatags = parse.getData().getParseMeta().get("metatag.keyword");
       String metaDescription = parse.getData().getParseMeta().get("metatag.description");
-      int ngram = conf.getInt("scoring.similarity.ngrams", 1);
-      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram);
+      int[] ngramArr = Model.retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
       if(docVector!=null){
         score = Model.computeCosineSimilarity(docVector);
         LOG.info("Setting score of {} to {}",url, score);

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index 371f241..d8180f2 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -69,9 +69,10 @@ public class Model {
         LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
       }
 
-      //Check if user has specified n for ngram cosine model
-      int ngram = conf.getInt("scoring.similarity.ngrams", 1);
-      LOG.info("Value of ngram: {}",ngram);
+      int[] ngramArr = retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
 
       // TODO : Allow for corpus of documents to be provided as gold standard. 
       String line;
@@ -80,7 +81,7 @@ public class Model {
       while ((line = br.readLine()) != null) {
         sb.append(line);
       }
-      DocVector goldStandard = createDocVector(sb.toString(), ngram);
+      DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
       if(goldStandard!=null)
         docVectors.add(goldStandard);
       else {
@@ -101,15 +102,20 @@ public class Model {
   /**
    * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
    * cycle to create a DocVector of the currently parsed page from the parseText attribute value
-   * @param content
-   * @param ngram
+   * @param content The text to tokenize
+   * @param mingram Value of mingram for tokenizing
+   * @param maxgram Value of maxgram for tokenizing
    */
-  public static DocVector createDocVector(String content, int ngram) {
+  public static DocVector createDocVector(String content, int mingram, int maxgram) {
     LuceneTokenizer tokenizer;
-    
-    if(ngram > 1){
-      LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram);
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram);
+
+    if(mingram > 1 && maxgram > 1){
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+    } else if (mingram > 1) {
+      maxgram = mingram;
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
     }
     else if(stopWords!=null) {
       tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, 
@@ -159,4 +165,26 @@ public class Model {
     // Returning the max score amongst all documents in the corpus
     return maxScore;
   }
+
+  /**
+   * Retrieves mingram and maxgram from configuration
+   * @param conf Configuration to retrieve mingram and maxgram
+   * @return ngram array as mingram at first index and maxgram at second index
+     */
+  public static int[] retrieveNgrams(Configuration conf){
+    int[] ngramArr = new int[2];
+    //Check if user has specified mingram and (optionally) maxgram for ngram cosine model
+    String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
+    //mingram
+    ngramArr[0] = Integer.parseInt(ngramStr[0]);
+    int maxgram;
+    if (ngramStr.length > 1) {
+      //maxgram
+      ngramArr[1] = Integer.parseInt(ngramStr[1]);
+    } else {
+      //maxgram
+      ngramArr[1] = ngramArr[0];
+    }
+    return ngramArr;
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index c95033a..6f6d4d4 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -94,12 +94,13 @@ public class LuceneTokenizer {
    * @param content - The text to tokenize
    * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
    * @param stemFilterType - Type of stemming to perform
-   * @param ngram - Value of ngram for tokenizing
+   * @param mingram - Value of mingram for tokenizing
+   * @param maxgram - Value of maxgram for tokenizing
    */
-  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) {
+  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
-    tokenStream = createNGramTokenStream(content,ngram);
+    tokenStream = createNGramTokenStream(content, mingram, maxgram);
   }
   
   private TokenStream createTokenStream(String content) {
@@ -124,11 +125,11 @@ public class LuceneTokenizer {
     return tokenStream;
   }
 
-  private TokenStream createNGramTokenStream(String content, int ngram) {
+  private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
     tokenStream = new StandardTokenizer(new StringReader(content));
     tokenStream = new LowerCaseFilter(tokenStream);
     tokenStream = applyStemmer(stemFilterType);
-    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram);
+    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
     shingleFilter.setOutputUnigrams(false);
     tokenStream = (TokenStream)shingleFilter;
     return tokenStream;