You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by su...@apache.org on 2016/04/04 08:47:11 UTC
[1/2] nutch git commit: Fix for NUTCH-2245 contributed by Bhavya
Sanghavi
Repository: nutch
Updated Branches:
refs/heads/master a9b2491a3 -> b62f43fda
Fix for NUTCH-2245 contributed by Bhavya Sanghavi
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/2c426808
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/2c426808
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/2c426808
Branch: refs/heads/master
Commit: 2c42680823079faf87705df4d0698dcf8b43ef66
Parents: a9b2491
Author: Bhavya Sanghavi <Bh...@Bhavyas-MacBook-Pro.local>
Authored: Wed Mar 23 22:24:40 2016 -0700
Committer: Sujen Shah <su...@apache.org>
Committed: Sun Apr 3 23:31:37 2016 -0700
----------------------------------------------------------------------
conf/nutch-default.xml | 6 +++
.../similarity/cosine/CosineSimilarity.java | 3 +-
.../nutch/scoring/similarity/cosine/Model.java | 22 +++++++---
.../similarity/util/LuceneTokenizer.java | 44 +++++++++++++++-----
4 files changed, 59 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 93503f3..fe031d5 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1442,6 +1442,12 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>
+ <property>
+ <name>scoring.similarity.ngrams</name>
+ <value>1</value>
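+ <description>Specifies the 'n' in n-grams, i.e. the number of consecutive tokens combined into each term by the cosine similarity model. The default value of 1 keeps the unigram (single-token) model; values greater than 1 enable the n-gram model.</description>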
+</property>
+
<property>
<name>cosine.goldstandard.file</name>
<value>goldstandard.txt</value>
http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index d41f5e2..81b1eba 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -53,7 +53,8 @@ public class CosineSimilarity implements SimilarityModel{
}
String metatags = parse.getData().getParseMeta().get("metatag.keyword");
String metaDescription = parse.getData().getParseMeta().get("metatag.description");
- DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags);
+ int ngram = conf.getInt("scoring.similarity.ngrams", 1);
+ DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram);
if(docVector!=null){
score = Model.computeCosineSimilarity(docVector);
LOG.info("Setting score of {} to {}",url, score);
http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index ba0006a..371f241 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -68,6 +68,11 @@ public class Model {
}
LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
}
+
+ //Check if user has specified n for ngram cosine model
+ int ngram = conf.getInt("scoring.similarity.ngrams", 1);
+ LOG.info("Value of ngram: {}",ngram);
+
// TODO : Allow for corpus of documents to be provided as gold standard.
String line;
StringBuilder sb = new StringBuilder();
@@ -75,7 +80,7 @@ public class Model {
while ((line = br.readLine()) != null) {
sb.append(line);
}
- DocVector goldStandard = createDocVector(sb.toString());
+ DocVector goldStandard = createDocVector(sb.toString(), ngram);
if(goldStandard!=null)
docVectors.add(goldStandard);
else {
@@ -97,15 +102,21 @@ public class Model {
* Used to create a DocVector from given String text. Used during the parse stage of the crawl
* cycle to create a DocVector of the currently parsed page from the parseText attribute value
* @param content
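+ * @param ngram the 'n' of the n-gram model; a value greater than 1 selects the n-gram tokenizer, otherwise the unigram tokenizer is used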
*/
- public static DocVector createDocVector(String content) {
+ public static DocVector createDocVector(String content, int ngram) {
LuceneTokenizer tokenizer;
- if(stopWords!=null) {
- tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, stopWords, true,
+
+ if(ngram > 1){
+ LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram);
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram);
+ }
+ else if(stopWords!=null) {
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
StemFilterType.PORTERSTEM_FILTER);
}
else {
- tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true,
+ tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true,
StemFilterType.PORTERSTEM_FILTER);
}
TokenStream tStream = tokenizer.getTokenStream();
@@ -115,6 +126,7 @@ public class Model {
tStream.reset();
while(tStream.incrementToken()) {
String term = charTermAttribute.toString();
+ LOG.debug(term);
if(termVector.containsKey(term)) {
int count = termVector.get(term);
count++;
http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index 3ce0fee..c95033a 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
@@ -36,9 +37,9 @@ public class LuceneTokenizer {
private TokenizerType tokenizer;
private StemFilterType stemFilterType;
private CharArraySet stopSet = null;
-
+
public static enum TokenizerType {CLASSIC, STANDARD}
-
+
/**
* Creates a tokenizer based on param values
* @param content - The text to tokenize
@@ -54,7 +55,7 @@ public class LuceneTokenizer {
}
tokenStream = createTokenStream(content);
}
-
+
/**
* Creates a tokenizer based on param values
* @param content - The text to tokenize
@@ -79,7 +80,7 @@ public class LuceneTokenizer {
}
tokenStream = createTokenStream(content);
}
-
+
/**
* Returns the tokenStream created by the Tokenizer
* @return
@@ -88,6 +89,19 @@ public class LuceneTokenizer {
return tokenStream;
}
+ /**
+ * Creates a tokenizer for the ngram model based on param values
+ * @param content - The text to tokenize
+ * @param tokenizer - the type of tokenizer to use CLASSIC or STANDARD
+ * @param stemFilterType - Type of stemming to perform
+ * @param ngram - Value of ngram for tokenizing
+ */
+ public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) {
+ this.tokenizer = tokenizer;
+ this.stemFilterType = stemFilterType;
+ tokenStream = createNGramTokenStream(content,ngram);
+ }
+
private TokenStream createTokenStream(String content) {
tokenStream = generateTokenStreamFromText(content, tokenizer);
tokenStream = new LowerCaseFilter(tokenStream);
@@ -97,24 +111,34 @@ public class LuceneTokenizer {
tokenStream = applyStemmer(stemFilterType);
return tokenStream;
}
-
+
private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){
switch(tokenizer){
case CLASSIC:
tokenStream = new ClassicTokenizer(new StringReader(content));
break;
-
+
case STANDARD:
tokenStream = new StandardTokenizer(new StringReader(content));
}
return tokenStream;
}
-
+
+ private TokenStream createNGramTokenStream(String content, int ngram) {
+ tokenStream = new StandardTokenizer(new StringReader(content));
+ tokenStream = new LowerCaseFilter(tokenStream);
+ tokenStream = applyStemmer(stemFilterType);
+ ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram);
+ shingleFilter.setOutputUnigrams(false);
+ tokenStream = (TokenStream)shingleFilter;
+ return tokenStream;
+ }
+
private TokenStream applyStopFilter(CharArraySet stopWords) {
tokenStream = new StopFilter(tokenStream, stopWords);
return tokenStream;
}
-
+
private TokenStream applyStemmer(StemFilterType stemFilterType) {
switch(stemFilterType){
case ENGLISHMINIMALSTEM_FILTER:
@@ -123,8 +147,8 @@ public class LuceneTokenizer {
case PORTERSTEM_FILTER:
tokenStream = new PorterStemFilter(tokenStream);
break;
- default:
- break;
+ default:
+ break;
}
return tokenStream;
[2/2] nutch git commit: Add changes record for NUTCH-2245 Developed
the NGram Model on the existing Unigram Cosine Similarity Model contributed
by Bhavya Sanghavi this closes #101
Posted by su...@apache.org.
Add changes record for NUTCH-2245: Developed the NGram Model on the existing Unigram Cosine Similarity Model, contributed by Bhavya Sanghavi. This closes #101
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/b62f43fd
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/b62f43fd
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/b62f43fd
Branch: refs/heads/master
Commit: b62f43fda3648bd7a37da550a180c3da5a1e3986
Parents: 2c42680
Author: Sujen Shah <su...@apache.org>
Authored: Sun Apr 3 23:39:12 2016 -0700
Committer: Sujen Shah <su...@apache.org>
Committed: Sun Apr 3 23:41:15 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/b62f43fd/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index b15b78c..5e061a4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch
Nutch Change Log
+* NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen)
+
* NUTCH-2241 Unstable Selenium plugin in Nutch. Fixed bugs and enhanced configuration (karanjeets via mattmann)
* NUTCH-2213 CommonCrawlDataDumper saves gzipped body in extracted form (jnioche via mattmann)