You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/07/16 00:07:35 UTC

svn commit: r794433 - /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java

Author: gsingers
Date: Wed Jul 15 22:07:35 2009
New Revision: 794433

URL: http://svn.apache.org/viewvc?rev=794433&view=rev
Log:
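Minor improvement in tokenization: reuse a single Token instance via stream.next(token) instead of calling stream.next() and null-checking inside a while(true) loop.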

Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=794433&r1=794432&r2=794433&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Wed Jul 15 22:07:35 2009
@@ -59,9 +59,8 @@
     if(!country.equals("Unknown")){
       document = StringEscapeUtils.unescapeHtml(document.replaceFirst("<text xml:space=\"preserve\">", "").replaceAll("</text>", ""));
       TokenStream stream = analyzer.tokenStream(country, new StringReader(document));
-      while(true){
-        Token token = stream.next();
-        if(token==null) break;
+      Token token = new Token();
+      while((token = stream.next(token)) != null){
         contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
       }
       output.collect(new Text(country.replace(" ","_")), new Text(contents.toString()));