You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/07/27 07:07:51 UTC
svn commit: r1507575 - in /mahout/trunk: CHANGELOG core/pom.xml
core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
Author: smarthi
Date: Sat Jul 27 05:07:51 2013
New Revision: 1507575
URL: http://svn.apache.org/r1507575
Log:
MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/core/pom.xml
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sat Jul 27 05:07:51 2013
@@ -12,6 +12,8 @@ Release 0.9 - unreleased
MAHOUT-1290: Issue when running Mahout Recommender Demo (Helder Garay Martins via smarthi)
+ MAHOUT-1287: classifier.sgd.CsvRecordFactory incorrectly parses CSV format (Alex Franchuk via smarthi)
+
MAHOUT-1275: Dropped bz2 distribution format for source and binaries (sslavic)
Release 0.8 - 2013-07-25
Modified: mahout/trunk/core/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/core/pom.xml (original)
+++ mahout/trunk/core/pom.xml Sat Jul 27 05:07:51 2013
@@ -202,6 +202,12 @@
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
+ </dependency>
+
</dependencies>
<profiles>
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java?rev=1507575&r1=1507574&r2=1507575&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Sat Jul 27 05:07:51 2013
@@ -17,14 +17,14 @@
package org.apache.mahout.classifier.sgd;
-import com.google.common.base.CharMatcher;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
-import com.google.common.base.Splitter;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
+
+import org.apache.commons.csv.CSVUtils;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder;
@@ -33,8 +33,10 @@ import org.apache.mahout.vectorizer.enco
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
import org.apache.mahout.vectorizer.encoders.TextValueEncoder;
+import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -68,10 +70,6 @@ import java.util.Set;
public class CsvRecordFactory implements RecordFactory {
private static final String INTERCEPT_TERM = "Intercept Term";
- // crude CSV value splitter. This will fail if any double quoted strings have
- // commas inside. Also, escaped quotes will not be unescaped. Good enough for now.
- private static final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"'));
-
private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
.put("continuous", ContinuousValueEncoder.class)
@@ -103,6 +101,29 @@ public class CsvRecordFactory implements
"Unable to construct type converter... shouldn't be possible";
/**
+ * Parse a single line of csv-formatted text.
+ *
+ * Separated to make changing this functionality for the entire class easier
+ * in the future.
+ * @param line - CSV formatted text
+ * @return List<String>
+ */
+ private List<String> parseCsvLine(String line) {
+ try {
+ return Arrays.asList(CSVUtils.parseLine(line));
+ }
+ catch (IOException e) {
+ List<String> list = Lists.newArrayList();
+ list.add(line);
+ return list;
+ }
+ }
+
+ private List<String> parseCsvLine(CharSequence line) {
+ return parseCsvLine(line.toString());
+ }
+
+ /**
* Construct a parser for CSV lines that encodes the parsed data in vector form.
* @param targetName The name of the target variable.
* @param typeMap A map describing the types of the predictor variables.
@@ -166,7 +187,7 @@ public class CsvRecordFactory implements
public void firstLine(String line) {
// read variable names, build map of name -> column
final Map<String, Integer> vars = Maps.newHashMap();
- variableNames = Lists.newArrayList(COMMA.split(line));
+ variableNames = parseCsvLine(line);
int column = 0;
for (String var : variableNames) {
vars.put(var, column++);
@@ -240,7 +261,7 @@ public class CsvRecordFactory implements
*/
@Override
public int processLine(String line, Vector featureVector) {
- List<String> values = Lists.newArrayList(COMMA.split(line));
+ List<String> values = parseCsvLine(line);
int targetValue = targetDictionary.intern(values.get(target));
if (targetValue >= maxTargetValue) {
@@ -271,7 +292,7 @@ public class CsvRecordFactory implements
* @return The value of the target variable.
*/
public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
- List<String> values = Lists.newArrayList(COMMA.split(line));
+ List<String> values = parseCsvLine(line);
int targetValue = -1;
if (returnTarget) {
targetValue = targetDictionary.intern(values.get(target));
@@ -293,7 +314,7 @@ public class CsvRecordFactory implements
* @return the raw target value in the corresponding column of CSV line
*/
public String getTargetString(CharSequence line) {
- List<String> values = Lists.newArrayList(COMMA.split(line));
+ List<String> values = parseCsvLine(line);
return values.get(target);
}
@@ -318,7 +339,7 @@ public class CsvRecordFactory implements
* @return the id value of the CSV record
*/
public String getIdString(CharSequence line) {
- List<String> values = Lists.newArrayList(COMMA.split(line));
+ List<String> values = parseCsvLine(line);
return values.get(id);
}