Posted to commits@mahout.apache.org by is...@apache.org on 2010/01/21 18:36:20 UTC

svn commit: r901791 - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/classifier/ examples/src/main/java/org/apache/mahout/classifier/bayes/ utils/src/main/java/org/apache/mahout/utils/vectors/text/

Author: isabel
Date: Thu Jan 21 17:36:20 2010
New Revision: 901791

URL: http://svn.apache.org/viewvc?rev=901791&view=rev
Log:
MAHOUT-246 - upgraded to new Lucene TokenStream API

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
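
For context, all three files migrate the same pattern: the deprecated
Token-pulling loop (Token token = ts.next(token)) from Lucene 2.4 is
replaced by the attribute-based API introduced in Lucene 2.9, where a
TermAttribute is registered once on the stream and updated in place on
every call to incrementToken(). A minimal, self-contained sketch of the
new loop (the class name and sample text are made up for illustration
and are not part of the patch):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamMigrationSketch {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        TokenStream ts = analyzer.tokenStream("field",
            new StringReader("hello token stream api"));
        // On Lucene 2.9 the cast is required; addAttribute only became
        // generic in 3.0, after which the cast can be dropped.
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          // The attribute instance is reused across tokens; copy the
          // term out if it must outlive this iteration.
          System.out.println(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
        }
        ts.close();
      }
    }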

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/BayesFileFormatter.java Thu Jan 21 17:36:20 2010
@@ -39,9 +39,10 @@
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 import org.apache.mahout.common.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -219,10 +220,10 @@
       writer.write(label);
       writer.write('\t'); // edit: in order to match Hadoop standard
       // TextInputFormat
-      Token token = new Token();
-      while ((token = ts.next(token)) != null) {
-        char[] termBuffer = token.termBuffer();
-        int termLen = token.termLength();
+      TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+      while (ts.incrementToken()) {
+        char[] termBuffer = termAtt.termBuffer();
+        int termLen = termAtt.termLength();
         writer.write(termBuffer, 0, termLen);
         writer.write(' ');
       }
@@ -244,11 +245,11 @@
                                           Reader reader) throws IOException {
     TokenStream ts = analyzer.tokenStream("", reader);
     
-    Token token;
     List<String> coll = new ArrayList<String>();
-    while ((token = ts.next()) != null) {
-      char[] termBuffer = token.termBuffer();
-      int termLen = token.termLength();
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    while (ts.incrementToken()) {
+      char[] termBuffer = termAtt.termBuffer();
+      int termLen = termAtt.termLength();
       String val = new String(termBuffer, 0, termLen);
       coll.add(val);
     }
@@ -334,7 +335,7 @@
         analyzer = Class.forName((String) cmdLine.getValue(analyzerOpt))
             .asSubclass(Analyzer.class).newInstance();
       } else {
-        analyzer = new StandardAnalyzer();
+        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
       }
       Charset charset = Charset.forName("UTF-8");
       if (cmdLine.hasOption(charsetOpt)) {
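
On the StandardAnalyzer change above: as of Lucene 2.9 the no-argument
constructor is deprecated in favor of one taking a Version, which lets
an analyzer emulate the tokenization behavior of an older release so
existing indexes stay compatible. Version.LUCENE_CURRENT always tracks
the newest behavior. A hedged sketch of the alternative (pinning is an
option the patch does not take; which constant to pin would depend on
the index):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.Version;

    public class AnalyzerVersionSketch {
      // Pinning a concrete Version keeps tokenization stable across
      // Lucene upgrades.
      static final StandardAnalyzer PINNED = new StandardAnalyzer(Version.LUCENE_29);
      // LUCENE_CURRENT (as used in the patch) follows whatever release
      // is on the classpath, so behavior can change on upgrade.
      static final StandardAnalyzer LATEST = new StandardAnalyzer(Version.LUCENE_CURRENT);
    }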

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Thu Jan 21 17:36:20 2010
@@ -34,8 +34,8 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.GenericsUtil;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.mahout.analysis.WikipediaAnalyzer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -77,9 +77,9 @@
           .replaceAll(""));
       TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(
           document));
-      Token token = new Token();
-      while ((token = stream.next(token)) != null) {
-        contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
+      TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+      while (stream.incrementToken()) {
+        contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
       }
       output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
           .replaceAll("_")), new Text(contents.toString()));

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java?rev=901791&r1=901790&r2=901791&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentTokenizerMapper.java Thu Jan 21 17:36:20 2010
@@ -19,12 +19,7 @@
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
 
-import org.apache.commons.lang.mutable.MutableLong;
-import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MapReduceBase;
@@ -32,11 +27,9 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.mahout.common.parameters.ClassParameter;
 
 /**
  * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs
@@ -44,9 +37,9 @@
  */
 public class DocumentTokenizerMapper extends MapReduceBase implements
     Mapper<Text,Text,Text,Text> {
-  
+
   private Analyzer analyzer;
-  private StringBuilder document = new StringBuilder();
+  private final StringBuilder document = new StringBuilder();
   @Override
   public void map(Text key,
                   Text value,
@@ -58,18 +51,17 @@
     TermAttribute termAtt =
         (TermAttribute) stream.addAttribute(TermAttribute.class);
     document.setLength(0);
-    String sep = "";
+    String sep = " ";
     while (stream.incrementToken()) {
-      String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
-      if (word != "") {
-        document.append(sep).append(word);
-        sep = " ";
+      if (termAtt.termLength() > 0) {
+        document.append(sep).append(termAtt.termBuffer(), 0,
+            termAtt.termLength());
       }
     }
     output.collect(key, new Text(document.toString()) );
-    
+
   }
-  
+
   @Override
   public void configure(JobConf job) {
     super.configure(job);
@@ -87,5 +79,5 @@
       throw new IllegalStateException(e);
     }
   }
-  
+
 }
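
One behavioral note on the hunk above: with sep fixed at a single space
and appended before every term, the rebuilt document starts with a
leading space, whereas the removed code toggled sep from "" to " "
after the first term. A sketch of that toggling variant adapted to the
attribute API (the class and method names are illustrative, not from
the patch):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class JoinTermsSketch {
      /** Joins all terms of the stream with single spaces, no leading space. */
      static String joinTerms(TokenStream stream) throws IOException {
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        StringBuilder doc = new StringBuilder();
        String sep = "";
        while (stream.incrementToken()) {
          if (termAtt.termLength() > 0) {
            doc.append(sep).append(termAtt.termBuffer(), 0, termAtt.termLength());
            sep = " ";
          }
        }
        return doc.toString();
      }
    }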