You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/07/27 07:07:51 UTC

svn commit: r1507575 - in /mahout/trunk: CHANGELOG core/pom.xml core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java

Author: smarthi
Date: Sat Jul 27 05:07:51 2013
New Revision: 1507575

URL: http://svn.apache.org/r1507575
Log:
MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/pom.xml
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sat Jul 27 05:07:51 2013
@@ -12,6 +12,8 @@ Release 0.9 - unreleased
 
   MAHOUT-1290: Issue when running Mahout Recommender Demo (Helder Garay Martins via smarthi)
 
+  MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format (Alex Franchuk via smarthi)
+
   MAHOUT-1275: Dropped bz2 distribution format for source and binaries (sslavic)
 
 Release 0.8 - 2013-07-25

Modified: mahout/trunk/core/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/core/pom.xml (original)
+++ mahout/trunk/core/pom.xml Sat Jul 27 05:07:51 2013
@@ -202,6 +202,12 @@
       <scope>test</scope>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-commons-csv</artifactId>
+      <version>3.5.0</version>
+    </dependency>
+
   </dependencies>
   
   <profiles>

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Sat Jul 27 05:07:51 2013
@@ -17,14 +17,14 @@
 
 package org.apache.mahout.classifier.sgd;
 
-import com.google.common.base.CharMatcher;
 import com.google.common.base.Function;
 import com.google.common.base.Preconditions;
-import com.google.common.base.Splitter;
 import com.google.common.collect.Collections2;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+
+import org.apache.commons.csv.CSVUtils;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
 import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder;
@@ -33,8 +33,10 @@ import org.apache.mahout.vectorizer.enco
 import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
 import org.apache.mahout.vectorizer.encoders.TextValueEncoder;
 
+import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
@@ -68,10 +70,6 @@ import java.util.Set;
 public class CsvRecordFactory implements RecordFactory {
   private static final String INTERCEPT_TERM = "Intercept Term";
 
-  // crude CSV value splitter.  This will fail if any double quoted strings have
-  // commas inside.  Also, escaped quotes will not be unescaped.  Good enough for now.
-  private static final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"'));
-
   private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
           ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
                   .put("continuous", ContinuousValueEncoder.class)
@@ -103,6 +101,29 @@ public class CsvRecordFactory implements
       "Unable to construct type converter... shouldn't be possible";
 
   /**
+   * Parse a single line of csv-formatted text.
+   *
+   * Separated to make changing this functionality for the entire class easier
+   * in the future.
+   * @param line - CSV formatted text
+   * @return List<String>
+   */
+  private List<String> parseCsvLine(String line) {
+    try {
+      return Arrays.asList(CSVUtils.parseLine(line));
+	   }
+	   catch (IOException e) {
+      List<String> list = Lists.newArrayList();
+      list.add(line);
+      return list;
+   	}
+  }
+
+  private List<String> parseCsvLine(CharSequence line) {
+    return parseCsvLine(line.toString());
+  }
+
+  /**
    * Construct a parser for CSV lines that encodes the parsed data in vector form.
    * @param targetName            The name of the target variable.
    * @param typeMap               A map describing the types of the predictor variables.
@@ -166,7 +187,7 @@ public class CsvRecordFactory implements
   public void firstLine(String line) {
     // read variable names, build map of name -> column
     final Map<String, Integer> vars = Maps.newHashMap();
-    variableNames = Lists.newArrayList(COMMA.split(line));
+    variableNames = parseCsvLine(line);
     int column = 0;
     for (String var : variableNames) {
       vars.put(var, column++);
@@ -240,7 +261,7 @@ public class CsvRecordFactory implements
    */
   @Override
   public int processLine(String line, Vector featureVector) {
-    List<String> values = Lists.newArrayList(COMMA.split(line));
+    List<String> values = parseCsvLine(line);
 
     int targetValue = targetDictionary.intern(values.get(target));
     if (targetValue >= maxTargetValue) {
@@ -271,7 +292,7 @@ public class CsvRecordFactory implements
    * @return The value of the target variable.
    */
   public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
-    List<String> values = Lists.newArrayList(COMMA.split(line));
+    List<String> values = parseCsvLine(line);
     int targetValue = -1;
     if (returnTarget) {
       targetValue = targetDictionary.intern(values.get(target));
@@ -293,7 +314,7 @@ public class CsvRecordFactory implements
    * @return the raw target value in the corresponding column of CSV line 
    */
   public String getTargetString(CharSequence line) {
-    List<String> values = Lists.newArrayList(COMMA.split(line));
+    List<String> values = parseCsvLine(line);
     return values.get(target);
 
   }
@@ -318,7 +339,7 @@ public class CsvRecordFactory implements
    * @return the id value of the CSV record
    */
   public String getIdString(CharSequence line) {
-    List<String> values = Lists.newArrayList(COMMA.split(line));
+    List<String> values = parseCsvLine(line);
     return values.get(id);
   }