You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/07/05 09:18:32 UTC

svn commit: r1142907 - in /mahout/trunk: core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/ core/src/main/java/org/apache/mahout/classifier/sgd/ core/src/main/java/org/apache/mahout/common/nlp/ core/src/main/java/org/apache/mahout/...

Author: srowen
Date: Tue Jul  5 07:18:31 2011
New Revision: 1142907

URL: http://svn.apache.org/viewvc?rev=1142907&view=rev
Log:
Harmonize some splitters/patterns in Bayes to split on space or tab

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
    mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java Tue Jul  5 07:18:31 2011
@@ -47,7 +47,7 @@ public class BayesFeatureMapper extends 
   private static final Logger log = LoggerFactory.getLogger(BayesFeatureMapper.class);
   
   private static final DoubleWritable ONE = new DoubleWritable(1.0);
-  private static final Pattern SPACE_PATTERN = Pattern.compile("[ ]+");
+  private static final Pattern SPACE_TAB = Pattern.compile("[ \t]+");
 
   private int gramSize = 1;
 
@@ -73,7 +73,7 @@ public class BayesFeatureMapper extends 
                   final OutputCollector<StringTuple,DoubleWritable> output,
                   Reporter reporter) throws IOException {
     final String label = key.toString();
-    String[] tokens = SPACE_PATTERN.split(value.toString());
+    String[] tokens = SPACE_TAB.split(value.toString());
     OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
     
     if (gramSize > 1) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Tue Jul  5 07:18:31 2011
@@ -70,7 +70,7 @@ public class CsvRecordFactory implements
 
   // crude CSV value splitter.  This will fail if any double quoted strings have
   // commas inside.  Also, escaped quotes will not be unescaped.  Good enough for now.
-  private final Splitter onComma = Splitter.on(",").trimResults(CharMatcher.is('"'));
+  private final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"'));
 
   private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
           ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
@@ -157,7 +157,7 @@ public class CsvRecordFactory implements
   public void firstLine(String line) {
     // read variable names, build map of name -> column
     final Map<String, Integer> vars = Maps.newHashMap();
-    variableNames = Lists.newArrayList(onComma.split(line));
+    variableNames = Lists.newArrayList(COMMA.split(line));
     int column = 0;
     for (String var : variableNames) {
       vars.put(var, column++);
@@ -226,7 +226,7 @@ public class CsvRecordFactory implements
    */
   @Override
   public int processLine(String line, Vector featureVector) {
-    List<String> values = Lists.newArrayList(onComma.split(line));
+    List<String> values = Lists.newArrayList(COMMA.split(line));
 
     int targetValue = targetDictionary.intern(values.get(target));
     if (targetValue >= maxTargetValue) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java Tue Jul  5 07:18:31 2011
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.common.nlp;
 
+import com.google.common.base.CharMatcher;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
@@ -27,7 +28,7 @@ import java.util.Map;
 
 public class NGrams {
 
-  private static final Splitter SPACE = Splitter.on(' ');
+  private static final Splitter SPACE_TAB = Splitter.on(CharMatcher.anyOf(" \t"));
   
   private final String line;
   private final int gramSize;
@@ -40,7 +41,7 @@ public class NGrams {
   public Map<String,List<String>> generateNGrams() {
     Map<String,List<String>> returnDocument = Maps.newHashMap();
     
-    Iterator<String> tokenizer = SPACE.split(line).iterator();
+    Iterator<String> tokenizer = SPACE_TAB.split(line).iterator();
     List<String> tokens = Lists.newArrayList();
     String labelName = tokenizer.next();
     List<String> previousN1Grams = Lists.newArrayList();
@@ -70,7 +71,7 @@ public class NGrams {
 
     List<String> tokens = Lists.newArrayList();
     List<String> previousN1Grams = Lists.newArrayList();
-    for (String nextToken : SPACE.split(line)) {
+    for (String nextToken : SPACE_TAB.split(line)) {
       
       if (previousN1Grams.size() == gramSize) {
         previousN1Grams.remove(0);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java Tue Jul  5 07:18:31 2011
@@ -35,9 +35,10 @@ import java.util.regex.Pattern;
  */
 public class TextValueEncoder extends FeatureVectorEncoder {
 
-  private static final double LOG_2 = Math.log(2);
+  private static final double LOG_2 = Math.log(2.0);
+
+  private static final Splitter ON_NON_WORD = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
 
-  private final Splitter onNonWord = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
   private FeatureVectorEncoder wordEncoder;
   private final Multiset<String> counts;
 
@@ -110,7 +111,7 @@ public class TextValueEncoder extends Fe
    * @see LuceneTextValueEncoder
    */
   protected Iterable<String> tokenize(CharSequence originalForm) {
-    return onNonWord.split(originalForm);
+    return ON_NON_WORD.split(originalForm);
   }
 
   /**

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java Tue Jul  5 07:18:31 2011
@@ -126,7 +126,7 @@ public abstract class OnlineBaseTest ext
    * @throws IOException If there is an error reading the data
    */
   protected static Matrix readCsv(String resourceName) throws IOException {
-    Splitter onCommas = Splitter.on(",").trimResults(CharMatcher.anyOf(" \""));
+    Splitter onCommas = Splitter.on(',').trimResults(CharMatcher.anyOf(" \""));
 
     Readable isr = new InputStreamReader(Resources.getResource(resourceName).openStream(), Charsets.UTF_8);
     List<String> data = CharStreams.readLines(isr);

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java Tue Jul  5 07:18:31 2011
@@ -59,8 +59,7 @@ import java.util.Random;
  */
 public final class SimpleCsvExamples {
 
-  public static final int SEPARATOR_CHAR = '\t';
-  public static final String SEPARATOR = "\t";
+  public static final char SEPARATOR_CHAR = '\t';
   private static final int FIELDS = 100;
 
   private SimpleCsvExamples() {
@@ -137,8 +136,8 @@ public final class SimpleCsvExamples {
 
 
   private static final class Line {
-    private static final Splitter ON_TABS = Splitter.on(SEPARATOR).trimResults();
-    public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR);
+    private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
+    public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
 
     public static final Random rand = RandomUtils.getRandom();