You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/12/15 19:29:37 UTC
svn commit: r1551035 - in /mahout/trunk: ./
integration/src/main/java/org/apache/mahout/utils/vectors/arff/
integration/src/test/java/org/apache/mahout/utils/vectors/arff/
integration/src/test/resources/
Author: smarthi
Date: Sun Dec 15 18:29:36 2013
New Revision: 1551035
URL: http://svn.apache.org/r1551035
Log:
MAHOUT-1371: Arff loader can misinterpret nominals with integer, real or string
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
mahout/trunk/integration/src/test/resources/date.arff
mahout/trunk/integration/src/test/resources/non-numeric-1.arff
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Dec 15 18:29:36 2013
@@ -2,7 +2,7 @@ Mahout Change Log
Release 0.9 - unreleased
- MAHOUT-1380: Streaming KMeans when executed in Sequential Mode (smarthi)
+ MAHOUT-1380: Streaming KMeans fails when executed in Sequential Mode (smarthi)
MAHOUT-1379: ClusterQualitySummarizer fails with the new T-Digest for clusters with 1 data point (smarthi)
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java Sun Dec 15 18:29:36 2013
@@ -19,9 +19,12 @@ package org.apache.mahout.utils.vectors.
import java.io.BufferedReader;
import java.io.IOException;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.List;
import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
@@ -31,8 +34,8 @@ final class ARFFIterator extends Abstrac
// This pattern will make sure a , inside a string is not a point for split.
// Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string
- private static final Pattern COMMA_PATTERN = Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
+ private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");
private final BufferedReader reader;
private final ARFFModel model;
@@ -64,12 +67,12 @@ final class ARFFIterator extends Abstrac
return endOfData();
}
Vector result;
- if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
- line = line.substring(1, line.indexOf(ARFFModel.ARFF_SPARSE_END));
- String[] splits = COMMA_PATTERN.split(line);
+ Matcher contents = DATA_PATTERN.matcher(line);
+ if (contents.find()) {
+ line = contents.group(1);
+ String[] splits = splitCSV(line);
result = new RandomAccessSparseVector(model.getLabelSize());
for (String split : splits) {
- split = split.trim();
int idIndex = split.indexOf(' ');
int idx = Integer.parseInt(split.substring(0, idIndex).trim());
String data = split.substring(idIndex).trim();
@@ -79,7 +82,7 @@ final class ARFFIterator extends Abstrac
}
} else {
result = new DenseVector(model.getLabelSize());
- String[] splits = COMMA_PATTERN.split(line);
+ String[] splits = splitCSV(line);
for (int i = 0; i < splits.length; i++) {
String split = splits[i];
split = split.trim();
@@ -88,8 +91,54 @@ final class ARFFIterator extends Abstrac
}
}
}
- //result.setLabelBindings(labelBindings);
return result;
}
+  /**
+   * Splits one line of comma-separated values into its trimmed tokens.
+   * <p>
+   * A comma only separates tokens when it appears outside of a quoted section. Both single and
+   * double quotes open a quoted section (ARFF files use either), and a section is closed only by
+   * the same quote character that opened it. The quote characters themselves are kept in the
+   * returned tokens.
+   * <p>
+   * A backslash escapes the character that follows it: the backslash is dropped and the next
+   * character is copied verbatim, so an escaped quote neither opens nor closes a quoted section.
+   * A backslash that is the very last character of the line has nothing to escape and is kept as
+   * a literal backslash.
+   * <p>
+   * Empty tokens between two commas are returned as empty strings. The text after the last comma
+   * is only returned when at least one character follows that comma, so an empty line yields an
+   * empty array.
+   *
+   * @param line the text to split; must not be {@code null}
+   * @return the tokens in order of appearance, each trimmed of surrounding whitespace
+   */
+  public static String[] splitCSV(String line) {
+    StringBuilder sb = new StringBuilder(128);
+    List<String> tokens = Lists.newArrayList();
+    // quote character that opened the current quoted section, or '\0' while outside of quotes
+    char escapeChar = '\0';
+    for (int i = 0; i < line.length(); i++) {
+      char c = line.charAt(i);
+      if (c == '\\' && i + 1 < line.length()) {
+        // escape sequence: skip the backslash and copy the next character untouched
+        i++;
+        sb.append(line.charAt(i));
+      } else if (c == '"' || c == '\'') {
+        if (c == escapeChar) {
+          // the matching quote closes the quoted section
+          escapeChar = '\0';
+        } else if (escapeChar == '\0') {
+          // the first quote opens a quoted section
+          escapeChar = c;
+        }
+        // a different quote inside a quoted section is plain content
+        sb.append(c);
+      } else if (c == ',' && escapeChar == '\0') {
+        tokens.add(sb.toString().trim());
+        sb.setLength(0); // start work on next token
+      } else {
+        // regular character, quoted comma, or a trailing backslash with nothing to escape
+        sb.append(c);
+      }
+    }
+    if (sb.length() > 0) {
+      tokens.add(sb.toString().trim());
+    }
+
+    return tokens.toArray(new String[tokens.size()]);
+  }
+
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Sun Dec 15 18:29:36 2013
@@ -31,7 +31,6 @@ import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Locale;
-import java.util.regex.Pattern;
/**
* Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
@@ -49,9 +48,6 @@ import java.util.regex.Pattern;
*/
public class ARFFVectorIterable implements Iterable<Vector> {
- private static final Pattern COMMA_PATTERN = Pattern.compile(",");
- private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
-
private final BufferedReader buff;
private final ARFFModel model;
@@ -80,44 +76,48 @@ public class ARFFVectorIterable implemen
String line;
while ((line = buff.readLine()) != null) {
line = line.trim();
- String lower = line.toLowerCase(Locale.ENGLISH);
- Integer labelNumInt = labelNumber;
- if (!lower.startsWith(ARFFModel.ARFF_COMMENT)) {
- if (lower.startsWith(ARFFModel.RELATION)) {
- model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
- } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
+ if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
+ Integer labelNumInt = labelNumber;
+ String[] lineParts = line.split("[\\s\\t]+", 2);
+
+ // is it a relation name?
+ if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
+ model.setRelation(ARFFType.removeQuotes(lineParts[1]));
+ }
+ // or an attribute
+ else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
String label;
ARFFType type;
- if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
- label = ARFFType.NUMERIC.getLabel(lower);
+
+ // split the name of the attribute and its description
+ String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
+ if (attrParts.length < 2)
+ throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);
+
+ // label is attribute name
+ label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
+ if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
type = ARFFType.NUMERIC;
- } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
- label = ARFFType.INTEGER.getLabel(lower);
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
type = ARFFType.INTEGER;
- } else if (lower.contains(ARFFType.REAL.getIndicator())) {
- label = ARFFType.REAL.getLabel(lower);
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
type = ARFFType.REAL;
- } else if (lower.contains(ARFFType.STRING.getIndicator())) {
- label = ARFFType.STRING.getLabel(lower);
+ } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
type = ARFFType.STRING;
- } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
- label = ARFFType.NOMINAL.getLabel(lower);
+ } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
type = ARFFType.NOMINAL;
- //@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
- int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
- String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
+ // nominal example:
+ // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica}
+ String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
for (int i = 0; i < classes.length; i++) {
model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
}
- } else if (lower.contains(ARFFType.DATE.getIndicator())) {
- label = ARFFType.DATE.getLabel(lower);
+ } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
type = ARFFType.DATE;
//TODO: DateFormatter map
DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
- String[] split = SPACE_PATTERN.split(line);
- if (split.length >= 4) { //we have a date format
- String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
+ String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
+ if (!formStr.isEmpty()) {
if (formStr.startsWith("\"")) {
formStr = formStr.substring(1, formStr.length() - 1);
}
@@ -126,13 +126,12 @@ public class ARFFVectorIterable implemen
model.addDateFormat(labelNumInt, format);
//@attribute <name> date [<date-format>]
} else {
- throw new UnsupportedOperationException("Invalid attribute: " + line);
+ throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
}
model.addLabel(label, labelNumInt);
model.addType(labelNumInt, type);
labelNumber++;
- } else if (lower.startsWith(ARFFModel.DATA)) {
- //inData = true;
+ } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
break; //skip it
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Sun Dec 15 18:29:36 2013
@@ -124,7 +124,7 @@ public final class ARFFVectorIterableTes
assertEquals(1, nominalMap.size());
Map<String, Integer> noms = nominalMap.get("bar");
assertNotNull("nominals for bar are null", noms);
- assertEquals(2, noms.size());
+ assertEquals(5, noms.size());
Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
assertNotNull("Type map null", integerARFFTypeMap);
assertEquals(5, integerARFFTypeMap.size());
@@ -189,7 +189,7 @@ public final class ARFFVectorIterableTes
assertEquals(1, nominalMap.size());
Map<String,Integer> noms = nominalMap.get("bar");
assertNotNull("nominals for bar are null", noms);
- assertEquals(2, noms.size());
+ assertEquals(5, noms.size());
Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
assertNotNull("Type map null", integerARFFTypeMap);
assertEquals(5, integerARFFTypeMap.size());
Modified: mahout/trunk/integration/src/test/resources/date.arff
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/resources/date.arff?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/resources/date.arff (original)
+++ mahout/trunk/integration/src/test/resources/date.arff Sun Dec 15 18:29:36 2013
@@ -4,7 +4,7 @@
@RELATION MahoutDateTest
@ATTRIBUTE junk NUMERIC
- @ATTRIBUTE date1
+ @ATTRIBUTE date1 date
@ATTRIBUTE date2 date "yyyy.MM.dd G 'at' HH:mm:ss z"
@ATTRIBUTE date3 date "EEE, MMM d, ''yy"
@ATTRIBUTE date4 date "K:mm a, z"
Modified: mahout/trunk/integration/src/test/resources/non-numeric-1.arff
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/resources/non-numeric-1.arff?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/resources/non-numeric-1.arff (original)
+++ mahout/trunk/integration/src/test/resources/non-numeric-1.arff Sun Dec 15 18:29:36 2013
@@ -5,7 +5,7 @@
@ATTRIBUTE junk NUMERIC
@ATTRIBUTE foo NUMERIC
- @ATTRIBUTE bar {c,d}
+ @ATTRIBUTE bar {c,d,'xy, numeric','marc o\'polo', e}
@ATTRIBUTE hockey string
@ATTRIBUTE football date "yyyy-MM-dd"