You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/05/20 01:35:20 UTC
nutch git commit: NUTCH-2263 mingram and maxgram support for Unigram
Cosine Similarity Model is provided.
Repository: nutch
Updated Branches:
refs/heads/master 956538984 -> da252eb7b
NUTCH-2263 mingram and maxgram support for Unigram Cosine Similarity Model is provided.
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/da252eb7
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/da252eb7
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/da252eb7
Branch: refs/heads/master
Commit: da252eb7b3d2d7b7021480db3bec1d82e6fa564d
Parents: 9565389
Author: Furkan KAMACI <fu...@gmail.com>
Authored: Thu May 19 04:13:04 2016 +0300
Committer: Furkan KAMACI <fu...@gmail.com>
Committed: Thu May 19 04:13:04 2016 +0300
----------------------------------------------------------------------
conf/nutch-default.xml | 6 ++-
.../similarity/cosine/CosineSimilarity.java | 6 ++-
.../nutch/scoring/similarity/cosine/Model.java | 50 +++++++++++++++-----
.../similarity/util/LuceneTokenizer.java | 11 +++--
4 files changed, 53 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 641809f..51b3fd9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1444,8 +1444,10 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
<property>
<name>scoring.similarity.ngrams</name>
- <value>1</value>
- <description>Specifies the 'n' in ngrams</description>
+ <value>1,1</value>
+ <description>Specifies the min 'n' and max 'n' in ngrams as comma-separated.
+ If one value is specified as 'n', it will be used for both the min 'n' and max 'n' in ngrams.
+ </description>
</property>
<property>
http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index 81b1eba..9853b34 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -53,8 +53,10 @@ public class CosineSimilarity implements SimilarityModel{
}
String metatags = parse.getData().getParseMeta().get("metatag.keyword");
String metaDescription = parse.getData().getParseMeta().get("metatag.description");
- int ngram = conf.getInt("scoring.similarity.ngrams", 1);
- DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram);
+ int[] ngramArr = Model.retrieveNgrams(conf);
+ int mingram = ngramArr[0];
+ int maxgram = ngramArr[1];
+ DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
if(docVector!=null){
score = Model.computeCosineSimilarity(docVector);
LOG.info("Setting score of {} to {}",url, score);
http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index 371f241..d8180f2 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -69,9 +69,10 @@ public class Model {
LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
}
- //Check if user has specified n for ngram cosine model
- int ngram = conf.getInt("scoring.similarity.ngrams", 1);
- LOG.info("Value of ngram: {}",ngram);
+ int[] ngramArr = retrieveNgrams(conf);
+ int mingram = ngramArr[0];
+ int maxgram = ngramArr[1];
+ LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
// TODO : Allow for corpus of documents to be provided as gold standard.
String line;
@@ -80,7 +81,7 @@ public class Model {
while ((line = br.readLine()) != null) {
sb.append(line);
}
- DocVector goldStandard = createDocVector(sb.toString(), ngram);
+ DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
if(goldStandard!=null)
docVectors.add(goldStandard);
else {
@@ -101,15 +102,20 @@ public class Model {
/**
* Used to create a DocVector from given String text. Used during the parse stage of the crawl
* cycle to create a DocVector of the currently parsed page from the parseText attribute value
- * @param content
- * @param ngram
+ * @param content The text to tokenize
+ * @param mingram Value of mingram for tokenizing
+ * @param maxgram Value of maxgram for tokenizing
*/
- public static DocVector createDocVector(String content, int ngram) {
+ public static DocVector createDocVector(String content, int mingram, int maxgram) {
LuceneTokenizer tokenizer;
-
- if(ngram > 1){
- LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram);
- tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram);
+
+ if(mingram > 1 && maxgram > 1){
+ LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+ } else if (mingram > 1) {
+ maxgram = mingram;
+ LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
}
else if(stopWords!=null) {
tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
@@ -159,4 +165,26 @@ public class Model {
// Returning the max score amongst all documents in the corpus
return maxScore;
}
+
+  /**
+   * Retrieves mingram and maxgram from configuration
+   * @param conf Configuration to retrieve mingram and maxgram
+   * @return ngram array as mingram at first index and maxgram at second index
+   */
+  public static int[] retrieveNgrams(Configuration conf){
+    int[] ngramArr = new int[2];
+    //Check if user has specified mingram or ngram for ngram cosine model
+    String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
+    //mingram; trimmed because parseInt rejects whitespace such as in "1, 2"
+    ngramArr[0] = Integer.parseInt(ngramStr[0].trim());
+    //a single configured value is used for both mingram and maxgram
+    if (ngramStr.length > 1) {
+      //maxgram
+      ngramArr[1] = Integer.parseInt(ngramStr[1].trim());
+    } else {
+      //maxgram
+      ngramArr[1] = ngramArr[0];
+    }
+    return ngramArr;
+  }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index c95033a..6f6d4d4 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -94,12 +94,13 @@ public class LuceneTokenizer {
* @param content - The text to tokenize
* @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
* @param stemFilterType - Type of stemming to perform
- * @param ngram - Value of ngram for tokenizing
+ * @param mingram - Value of mingram for tokenizing
+ * @param maxgram - Value of maxgram for tokenizing
*/
- public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) {
+ public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
this.tokenizer = tokenizer;
this.stemFilterType = stemFilterType;
- tokenStream = createNGramTokenStream(content,ngram);
+ tokenStream = createNGramTokenStream(content, mingram, maxgram);
}
private TokenStream createTokenStream(String content) {
@@ -124,11 +125,11 @@ public class LuceneTokenizer {
return tokenStream;
}
- private TokenStream createNGramTokenStream(String content, int ngram) {
+ private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
tokenStream = new StandardTokenizer(new StringReader(content));
tokenStream = new LowerCaseFilter(tokenStream);
tokenStream = applyStemmer(stemFilterType);
- ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram);
+ ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
shingleFilter.setOutputUnigrams(false);
tokenStream = (TokenStream)shingleFilter;
return tokenStream;