You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2006/07/28 16:55:46 UTC
[Nutch Wiki] Update of "Stemming" by MatthewHolt
Dear Wiki user,
You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The following page has been changed by MatthewHolt:
http://wiki.apache.org/nutch/Stemming
New page:
[[TableOfContents]]
The following steps need to be taken to implement Stemming in Nutch. Howie Wang is the person credited with doing so for version 0.7.2.
I updated the process for Version 0.8. That can be found below. - Matthew Holt
"I've gotten a couple of questions offlist about stemming
so I thought I'd just post here with my changes. Sorry that
some of the changes are in the main code and not in a plugin. It
seemed that it's more efficient to put in the main analyzer. It
would be nice if later releases could add support for plugging
in a custom stemmer/analyzer."
= Version 0.7.2 =
The first change I made is in NutchDocumentAnalyzer.java.
Import the following classes at the top of the file:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}
Change tokenStream to:
{{{
public TokenStream tokenStream(String field, Reader reader) {
  // Standard Nutch chain first: tokenize, then fold in common-gram handling.
  TokenStream stream = CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
  // Only body content and titles get lower-cased and Porter-stemmed;
  // every other field passes through the standard chain unchanged.
  if (!(field.equals("content") || field.equals("title"))) {
    return stream;
  }
  return new PorterStemFilter(new LowerCaseFilter(stream));
}
}}}
The second change is in CommonGrams.java.
Import the following classes near the top:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}
In optimizePhrase, after this line:
{{{
TokenStream ts = getFilter(new ArrayTokens(phrase), field);
}}}
Add:
{{{
ts = new PorterStemFilter(new LowerCaseFilter(ts));
}}}
And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
Here's the full source for the Java file. You can copy the build.xml
and plugin.xml from query-basic, and alter the names for query-stemmer.
{{{
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package org.apache.nutch.searcher.stemmer;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
import java.io.IOException;
import java.util.HashSet;
import java.io.StringReader;
/** The default query filter. Query terms in the default query field are
* expanded to search the url, anchor and content document fields.*/
/**
 * Query filter that expands terms in the default query field to search the
 * url, anchor, content and title document fields, stemming each term with
 * the Porter stemmer so queries match the stemmed index produced by the
 * modified {@code NutchDocumentAnalyzer}.
 */
public class StemmerQueryFilter implements QueryFilter {

  private static float URL_BOOST = 4.0f;
  private static float ANCHOR_BOOST = 2.0f;
  private static int SLOP = Integer.MAX_VALUE;
  private static float PHRASE_BOOST = 1.0f;

  /** Document fields every default-field clause is expanded into. */
  private static final String[] FIELDS = {"url", "anchor", "content",
                                          "title"};

  /** Per-field boosts, positionally parallel to FIELDS. */
  private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST,
                                               1.0f, 2.0f};

  /** Set the boost factor for url matches, relative to content and anchor
   * matches. */
  public static void setUrlBoost(float boost) { URL_BOOST = boost; }

  /** Set the boost factor for title/anchor matches, relative to url and
   * content matches. */
  public static void setAnchorBoost(float boost) { ANCHOR_BOOST = boost; }

  /** Set the boost factor for sloppy phrase matches relative to unordered
   * term matches. */
  public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }

  /** Set the maximum number of terms permitted between matching terms in a
   * sloppy phrase match. */
  public static void setSlop(int slop) { SLOP = slop; }

  /** Adds per-field term queries and sloppy phrase queries for the input
   * query's default-field clauses, then returns the output query. */
  public BooleanQuery filter(Query input, BooleanQuery output) {
    addTerms(input, output);
    addSloppyPhrases(input, output);
    return output;
  }

  /** Expands each default-field clause into a disjunction over FIELDS,
   * stemming single terms and optimizing phrases via CommonGrams. */
  private static void addTerms(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];
      if (!c.getField().equals(Clause.DEFAULT_FIELD))
        continue; // skip non-default fields
      BooleanQuery out = new BooleanQuery();
      for (int f = 0; f < FIELDS.length; f++) {
        Clause o = c;
        String[] opt;
        // TODO: I'm a little nervous about stemming for all default fields.
        // Should keep an eye on this.
        // BUGFIX: the original had a stray bare token "clauses" on its own
        // line here (a split comment), which did not compile; it has been
        // folded back into the comment below.
        if (c.isPhrase()) { // optimize phrase clauses
          opt = CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f]);
        } else {
          System.out.println("o.getTerm = " + o.getTerm().toString());
          opt = getStemmedWords(o.getTerm().toString());
        }
        // A single surviving word stays a term clause; multiple words become
        // a phrase clause.
        if (opt.length == 1) {
          o = new Clause(new Term(opt[0]), c.isRequired(),
                         c.isProhibited());
        } else {
          o = new Clause(new Phrase(opt), c.isRequired(),
                         c.isProhibited());
        }
        out.add(o.isPhrase()
                ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
                : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
                false, false);
      }
      output.add(out, c.isRequired(), c.isProhibited());
    }
    System.out.println("query = " + output.toString());
  }

  /** Lower-cases and Porter-stems the words of {@code value}, returning one
   * array entry per stemmed word. Falls back to the raw value if tokenizing
   * fails or produces nothing. */
  private static String[] getStemmedWords(String value) {
    StringReader sr = new StringReader(value);
    TokenStream ts = new PorterStemFilter(new LowerCaseTokenizer(sr));
    String stemmedValue = "";
    try {
      Token token = ts.next();
      int count = 0;
      while (token != null) {
        System.out.println("token = " + token.termText());
        System.out.println("type = " + token.type());
        if (count == 0)
          stemmedValue = token.termText();
        else
          stemmedValue = stemmedValue + " " + token.termText();
        token = ts.next();
        count++;
      }
    } catch (Exception e) {
      // Best effort: if analysis fails, search the unstemmed input.
      stemmedValue = value;
    }
    if (stemmedValue.equals("")) {
      stemmedValue = value;
    }
    String[] stemmedValues = stemmedValue.split("\\s+");
    for (int j = 0; j < stemmedValues.length; j++) {
      System.out.println("stemmedValues = " + stemmedValues[j]);
    }
    return stemmedValues;
  }

  /** Adds one sloppy phrase query per field over all non-phrase,
   * non-prohibited default-field terms (only when two or more qualify). */
  private static void addSloppyPhrases(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int f = 0; f < FIELDS.length; f++) {
      PhraseQuery sloppyPhrase = new PhraseQuery();
      sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
      // Anchors use the inter-anchor gap so phrases cannot span anchors.
      sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
                           ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
                           : SLOP);
      int sloppyTerms = 0;
      for (int i = 0; i < clauses.length; i++) {
        Clause c = clauses[i];
        if (!c.getField().equals(Clause.DEFAULT_FIELD))
          continue; // skip non-default fields
        if (c.isPhrase()) // skip exact phrases
          continue;
        if (c.isProhibited()) // skip prohibited terms
          continue;
        sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
        sloppyTerms++;
      }
      if (sloppyTerms > 1)
        output.add(sloppyPhrase, false, false);
    }
  }

  /** Utility to construct a boosted Lucene term query for a Nutch term. */
  private static org.apache.lucene.search.Query
    termQuery(String field, Term term, float boost) {
    TermQuery result = new TermQuery(luceneTerm(field, term));
    result.setBoost(boost);
    return result;
  }

  /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
  private static org.apache.lucene.search.Query
    exactPhrase(Phrase nutchPhrase,
                String field, float boost) {
    Term[] terms = nutchPhrase.getTerms();
    PhraseQuery exactPhrase = new PhraseQuery();
    for (int i = 0; i < terms.length; i++) {
      exactPhrase.add(luceneTerm(field, terms[i]));
    }
    exactPhrase.setBoost(boost);
    return exactPhrase;
  }

  /** Utility to construct a Lucene Term given a Nutch query term and field. */
  private static org.apache.lucene.index.Term luceneTerm(String field,
                                                         Term term) {
    return new org.apache.lucene.index.Term(field, term.toString());
  }
}
}}}
= Version 0.8 =
The first change I made is in NutchDocumentAnalyzer.java.
Import the following classes at the top of the file:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}
Change tokenStream at the bottom of the file to:
{{{
public TokenStream tokenStream(String field, Reader reader) {
  Analyzer analyzer;
  if ("anchor".equals(field)) {
    // Anchors keep the stock anchor analyzer (no stemming).
    analyzer = ANCHOR_ANALYZER;
    // BUGFIX: the original assigned the analyzer here and then fell off the
    // end of the method without returning, which does not compile
    // ("missing return statement"). Return the anchor stream explicitly.
    return analyzer.tokenStream(field, reader);
  }
  analyzer = CONTENT_ANALYZER;
  TokenStream ts = analyzer.tokenStream(field, reader);
  // Only content and title text is lower-cased and Porter-stemmed.
  if (field.equals("content") || field.equals("title")) {
    ts = new LowerCaseFilter(ts);
    return new PorterStemFilter(ts);
  }
  return ts;
}
}}}
The second change is in CommonGrams.java.
Import the following classes near the top:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}
In optimizePhrase, after this line:
{{{
TokenStream ts = getFilter(new ArrayTokens(phrase), field);
}}}
Add:
{{{
ts = new PorterStemFilter(new LowerCaseFilter(ts));
}}}
And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
Here's the full source for the Java file. You can copy the build.xml
and plugin.xml from query-basic, and alter the names for query-stemmer.
{{{
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package org.apache.nutch.searcher.stemmer;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
import java.io.IOException;
import java.util.HashSet;
import java.io.StringReader;
/**
* The default query filter. Query terms in the default query field are expanded
* to search the url, anchor and content document fields.
*/
public class StemmerQueryFilter implements QueryFilter {
// Maximum number of positions permitted between terms in a sloppy phrase;
// Integer.MAX_VALUE effectively makes matches unordered proximity matches.
private static int SLOP = Integer.MAX_VALUE;
// Boost for sloppy phrase matches relative to plain term matches;
// overwritten from configuration in setConf().
private float PHRASE_BOOST = 1.0f;
// Document fields every default-field clause is expanded into.
private static final String[] FIELDS = { "url", "anchor", "content",
"title" , "host" };
// Per-field boosts, positionally parallel to FIELDS; the defaults here are
// replaced by the query.*.boost configuration values in setConf().
private final float[] FIELD_BOOSTS = { 4.0f, 2.0f, 1.0f, 1.5f, 2.0f };
// Nutch/Hadoop configuration injected via setConf(); read by getConf().
private Configuration conf;
/**
* Set the boost factor for url matches, relative to content and anchor
* matches.
*/
public void setUrlBoost(float boost) {
FIELD_BOOSTS[0] = boost;
}
/**
* Set the boost factor for title/anchor matches, relative to url and
* content matches.
*/
public void setAnchorBoost(float boost) {
FIELD_BOOSTS[1] = boost;
}
/**
* Set the boost factor for sloppy phrase matches relative to unordered term
* matches.
*/
public void setPhraseBoost(float boost) {
PHRASE_BOOST = boost;
}
/**
* Set the maximum number of terms permitted between matching terms in a
* sloppy phrase match.
*/
public static void setSlop(int slop) {
SLOP = slop;
}
/**
* Adds per-field term queries and sloppy phrase queries for the input
* query's default-field clauses, then returns the output query.
*/
public BooleanQuery filter(Query input, BooleanQuery output) {
addTerms(input, output);
addSloppyPhrases(input, output);
return output;
}
/**
* Expands each default-field clause into a SHOULD-disjunction over FIELDS,
* stemming single terms via getStemmedWords() and optimizing phrases via
* CommonGrams. Required/prohibited flags carry over to the output clause.
*/
private void addTerms(Query input, BooleanQuery output) {
Clause[] clauses = input.getClauses();
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
if (!c.getField().equals(Clause.DEFAULT_FIELD))
continue; // skip non-default fields
BooleanQuery out = new BooleanQuery();
for (int f = 0; f < FIELDS.length; f++) {
Clause o = c;
String[] opt;
// TODO: I'm a little nervous about stemming for all default
// fields.
// Should keep an eye on this.
if (c.isPhrase()) { // optimize phrase clauses
opt = new CommonGrams(getConf()).optimizePhrase(c
.getPhrase(), FIELDS[f]);
} else {
// Debug trace left in by the original author.
System.out.println("o.getTerm = " + o.getTerm().toString());
opt = getStemmedWords(o.getTerm().toString());
}
// One surviving word stays a term clause; several become a phrase.
if (opt.length == 1) {
o = new Clause(new Term(opt[0]), c.isRequired(), c
.isProhibited(), getConf());
} else {
o = new Clause(new Phrase(opt), c.isRequired(), c
.isProhibited(), getConf());
}
out.add(o.isPhrase() ? exactPhrase(o.getPhrase(), FIELDS[f],
FIELD_BOOSTS[f]) : termQuery(FIELDS[f], o.getTerm(),
FIELD_BOOSTS[f]), BooleanClause.Occur.SHOULD);
}
output.add(out, (c.isProhibited() ? BooleanClause.Occur.MUST_NOT
: (c.isRequired() ? BooleanClause.Occur.MUST
: BooleanClause.Occur.SHOULD)));
}
System.out.println("query = " + output.toString());
}
/**
* Lower-cases and Porter-stems the words of the given value, returning one
* array entry per stemmed word. Falls back to the raw value if analysis
* throws or yields no tokens.
*/
private static String[] getStemmedWords(String value) {
StringReader sr = new StringReader(value);
TokenStream ts = new PorterStemFilter(new LowerCaseTokenizer(sr));
String stemmedValue = "";
try {
Token token = ts.next();
int count = 0;
while (token != null) {
System.out.println("token = " + token.termText());
System.out.println("type = " + token.type());
if (count == 0)
stemmedValue = token.termText();
else
stemmedValue = stemmedValue + " " + token.termText();
token = ts.next();
count++;
}
} catch (Exception e) {
// Best effort: if tokenizing fails, search the unstemmed input.
stemmedValue = value;
}
if (stemmedValue.equals("")) {
stemmedValue = value;
}
String[] stemmedValues = stemmedValue.split("\\s+");
for (int j = 0; j < stemmedValues.length; j++) {
System.out.println("stemmedValues = " + stemmedValues[j]);
}
return stemmedValues;
}
/**
* Adds one sloppy phrase query per field over all non-phrase,
* non-prohibited default-field terms; only emitted when two or more terms
* qualify.
*/
private void addSloppyPhrases(Query input, BooleanQuery output) {
Clause[] clauses = input.getClauses();
for (int f = 0; f < FIELDS.length; f++) {
PhraseQuery sloppyPhrase = new PhraseQuery();
sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
// Anchors use the inter-anchor gap so a phrase cannot span anchors.
sloppyPhrase
.setSlop("anchor".equals(FIELDS[f]) ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
: SLOP);
int sloppyTerms = 0;
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
if (!c.getField().equals(Clause.DEFAULT_FIELD))
continue; // skip non-default fields
if (c.isPhrase()) // skip exact phrases
continue;
if (c.isProhibited()) // skip prohibited terms
continue;
sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
sloppyTerms++;
}
if (sloppyTerms > 1)
output.add(sloppyPhrase, BooleanClause.Occur.SHOULD);
}
}
/**
* Utility to construct a boosted Lucene term query for a Nutch term.
*/
private static org.apache.lucene.search.Query termQuery(String field,
Term term, float boost) {
TermQuery result = new TermQuery(luceneTerm(field, term));
result.setBoost(boost);
return result;
}
/**
* Utility to construct a Lucene exact phrase query for a Nutch phrase.
*/
private static org.apache.lucene.search.Query exactPhrase(
Phrase nutchPhrase, String field, float boost) {
Term[] terms = nutchPhrase.getTerms();
PhraseQuery exactPhrase = new PhraseQuery();
for (int i = 0; i < terms.length; i++) {
exactPhrase.add(luceneTerm(field, terms[i]));
}
exactPhrase.setBoost(boost);
return exactPhrase;
}
/**
* Utility to construct a Lucene Term given a Nutch query term and field.
*/
private static org.apache.lucene.index.Term luceneTerm(String field,
Term term) {
return new org.apache.lucene.index.Term(field, term.toString());
}
/**
* Injects the configuration and loads the per-field and phrase boosts from
* the query.*.boost properties (falling back to the defaults shown).
*/
public void setConf(Configuration conf) {
this.conf = conf;
this.FIELD_BOOSTS[0] = conf.getFloat("query.url.boost", 4.0f);
this.FIELD_BOOSTS[1] = conf.getFloat("query.anchor.boost", 2.0f);
this.FIELD_BOOSTS[2] = conf.getFloat("query.content.boost", 1.0f);
this.FIELD_BOOSTS[3] = conf.getFloat("query.title.boost", 1.5f);
this.FIELD_BOOSTS[4] = conf.getFloat("query.host.boost", 2.0f);
this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
}
public Configuration getConf() {
return this.conf;
}
}
}}}