You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by is...@apache.org on 2010/01/21 18:36:20 UTC
svn commit: r901791 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/classifier/
examples/src/main/java/org/apache/mahout/classifier/bayes/
utils/src/main/java/org/apache/mahout/utils/vectors/text/
Author: isabel
Date: Thu Jan 21 17:36:20 2010
New Revision: 901791
URL: http://svn.apache.org/viewvc?rev=901791&view=rev
Log:
MAHOUT-246 - upgraded to new Lucene TokenStream API
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java Thu Jan 21 17:36:20 2010
@@ -39,9 +39,10 @@
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -219,10 +220,10 @@
writer.write(label);
writer.write('\t'); // edit: Inorder to match Hadoop standard
// TextInputFormat
- Token token = new Token();
- while ((token = ts.next(token)) != null) {
- char[] termBuffer = token.termBuffer();
- int termLen = token.termLength();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ while (ts.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ int termLen = termAtt.termLength();
writer.write(termBuffer, 0, termLen);
writer.write(' ');
}
@@ -244,11 +245,11 @@
Reader reader) throws IOException {
TokenStream ts = analyzer.tokenStream("", reader);
- Token token;
List<String> coll = new ArrayList<String>();
- while ((token = ts.next()) != null) {
- char[] termBuffer = token.termBuffer();
- int termLen = token.termLength();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ while (ts.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ int termLen = termAtt.termLength();
String val = new String(termBuffer, 0, termLen);
coll.add(val);
}
@@ -334,7 +335,7 @@
analyzer = Class.forName((String) cmdLine.getValue(analyzerOpt))
.asSubclass(Analyzer.class).newInstance();
} else {
- analyzer = new StandardAnalyzer();
+ analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
}
Charset charset = Charset.forName("UTF-8");
if (cmdLine.hasOption(charsetOpt)) {
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Thu Jan 21 17:36:20 2010
@@ -34,8 +34,8 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.analysis.WikipediaAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -77,9 +77,9 @@
.replaceAll(""));
TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(
document));
- Token token = new Token();
- while ((token = stream.next(token)) != null) {
- contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+ while (stream.incrementToken()) {
+ contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
}
output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
.replaceAll("_")), new Text(contents.toString()));
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java Thu Jan 21 17:36:20 2010
@@ -19,12 +19,7 @@
import java.io.IOException;
import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-import org.apache.commons.lang.mutable.MutableLong;
-import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -32,11 +27,9 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.mahout.common.parameters.ClassParameter;
/**
* TextVectorizer Term Count Mapper. Tokenizes a text document and outputs
@@ -44,9 +37,9 @@
*/
public class DocumentTokenizerMapper extends MapReduceBase implements
Mapper<Text,Text,Text,Text> {
-
+
private Analyzer analyzer;
- private StringBuilder document = new StringBuilder();
+ private final StringBuilder document = new StringBuilder();
@Override
public void map(Text key,
Text value,
@@ -58,18 +51,17 @@
TermAttribute termAtt =
(TermAttribute) stream.addAttribute(TermAttribute.class);
document.setLength(0);
- String sep = "";
+ String sep = " ";
while (stream.incrementToken()) {
- String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
- if (word != "") {
- document.append(sep).append(word);
- sep = " ";
+ if (termAtt.termLength() > 0) {
+ document.append(sep).append(termAtt.termBuffer(), 0,
+ termAtt.termLength());
}
}
output.collect(key, new Text(document.toString()) );
-
+
}
-
+
@Override
public void configure(JobConf job) {
super.configure(job);
@@ -87,5 +79,5 @@
throw new IllegalStateException(e);
}
}
-
+
}