You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/07/16 00:07:35 UTC
svn commit: r794433 -
/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
Author: gsingers
Date: Wed Jul 15 22:07:35 2009
New Revision: 794433
URL: http://svn.apache.org/viewvc?rev=794433&view=rev
Log:
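Minor improvement in tokenization: replace the while(true)/stream.next() loop with a loop over stream.next(token) that passes in a single pre-allocated Token.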
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=794433&r1=794432&r2=794433&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Wed Jul 15 22:07:35 2009
@@ -59,9 +59,8 @@
if(!country.equals("Unknown")){
document = StringEscapeUtils.unescapeHtml(document.replaceFirst("<text xml:space=\"preserve\">", "").replaceAll("</text>", ""));
TokenStream stream = analyzer.tokenStream(country, new StringReader(document));
- while(true){
- Token token = stream.next();
- if(token==null) break;
+ Token token = new Token();
+ while((token = stream.next(token)) != null){
contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
}
output.collect(new Text(country.replace(" ","_")), new Text(contents.toString()));