You are viewing a plain-text version of this content. The canonical link for it is not included in this plain-text copy.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/11/29 20:06:13 UTC
svn commit: r1546624 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/arff/
Author: smarthi
Date: Fri Nov 29 19:06:13 2013
New Revision: 1546624
URL: http://svn.apache.org/r1546624
Log:
MAHOUT-1285: Arff loader can misparse string data as double
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Fri Nov 29 19:06:13 2013
@@ -82,59 +82,59 @@ public class ARFFVectorIterable implemen
line = line.trim();
String lower = line.toLowerCase(Locale.ENGLISH);
Integer labelNumInt = labelNumber;
- if (lower.startsWith(ARFFModel.ARFF_COMMENT)) {
- continue;
- } else if (lower.startsWith(ARFFModel.RELATION)) {
- model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
- } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
- String label;
- ARFFType type;
- if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
- label = ARFFType.NUMERIC.getLabel(lower);
- type = ARFFType.NUMERIC;
- } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
- label = ARFFType.INTEGER.getLabel(lower);
- type = ARFFType.INTEGER;
- } else if (lower.contains(ARFFType.REAL.getIndicator())) {
- label = ARFFType.REAL.getLabel(lower);
- type = ARFFType.REAL;
- } else if (lower.contains(ARFFType.STRING.getIndicator())) {
- label = ARFFType.STRING.getLabel(lower);
- type = ARFFType.STRING;
- } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
- label = ARFFType.NOMINAL.getLabel(lower);
- type = ARFFType.NOMINAL;
- //@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
- int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
- String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
- for (int i = 0; i < classes.length; i++) {
- model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
- }
- } else if (lower.contains(ARFFType.DATE.getIndicator())) {
- label = ARFFType.DATE.getLabel(lower);
- type = ARFFType.DATE;
- //TODO: DateFormatter map
- DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
- int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
- String[] split = SPACE_PATTERN.split(line);
- if (split.length >= 4) { //we have a date format
- String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
- if (formStr.startsWith("\"")) {
- formStr = formStr.substring(1, formStr.length() - 1);
+ if (!lower.startsWith(ARFFModel.ARFF_COMMENT)) {
+ if (lower.startsWith(ARFFModel.RELATION)) {
+ model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
+ } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
+ String label;
+ ARFFType type;
+ if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
+ label = ARFFType.NUMERIC.getLabel(lower);
+ type = ARFFType.NUMERIC;
+ } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
+ label = ARFFType.INTEGER.getLabel(lower);
+ type = ARFFType.INTEGER;
+ } else if (lower.contains(ARFFType.REAL.getIndicator())) {
+ label = ARFFType.REAL.getLabel(lower);
+ type = ARFFType.REAL;
+ } else if (lower.contains(ARFFType.STRING.getIndicator())) {
+ label = ARFFType.STRING.getLabel(lower);
+ type = ARFFType.STRING;
+ } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
+ label = ARFFType.NOMINAL.getLabel(lower);
+ type = ARFFType.NOMINAL;
+ //@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
+ int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
+ String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
+ for (int i = 0; i < classes.length; i++) {
+ model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
+ }
+ } else if (lower.contains(ARFFType.DATE.getIndicator())) {
+ label = ARFFType.DATE.getLabel(lower);
+ type = ARFFType.DATE;
+ //TODO: DateFormatter map
+ DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
+ int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
+ String[] split = SPACE_PATTERN.split(line);
+ if (split.length >= 4) { //we have a date format
+ String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
+ if (formStr.startsWith("\"")) {
+ formStr = formStr.substring(1, formStr.length() - 1);
+ }
+ format = new SimpleDateFormat(formStr, Locale.ENGLISH);
}
- format = new SimpleDateFormat(formStr, Locale.ENGLISH);
+ model.addDateFormat(labelNumInt, format);
+ //@attribute <name> date [<date-format>]
+ } else {
+ throw new UnsupportedOperationException("Invalid attribute: " + line);
}
- model.addDateFormat(labelNumInt, format);
- //@attribute <name> date [<date-format>]
- } else {
- throw new UnsupportedOperationException("Invalid attribute: " + line);
+ model.addLabel(label, labelNumInt);
+ model.addType(labelNumInt, type);
+ labelNumber++;
+ } else if (lower.startsWith(ARFFModel.DATA)) {
+ //inData = true;
+ break; //skip it
}
- model.addLabel(label, labelNumInt);
- model.addType(labelNumInt, type);
- labelNumber++;
- } else if (lower.startsWith(ARFFModel.DATA)) {
- //inData = true;
- break; //skip it
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Fri Nov 29 19:06:13 2013
@@ -20,7 +20,9 @@ package org.apache.mahout.utils.vectors.
import com.google.common.collect.Maps;
import java.text.DateFormat;
+import java.text.NumberFormat;
import java.text.ParseException;
+import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
@@ -143,9 +145,19 @@ public class MapBackedARFFModel implemen
}
protected static double processNumeric(String data) {
- return Double.parseDouble(data);
+ if (isNumeric(data)) {
+ return Double.parseDouble(data);
+ }
+ return 0.0;
}
-
+
+ public static boolean isNumeric(String str) {
+ NumberFormat formatter = NumberFormat.getInstance();
+ ParsePosition parsePosition = new ParsePosition(0);
+ formatter.parse(str, parsePosition);
+ return str.length() == parsePosition.getIndex();
+ }
+
protected double processDate(String data, int idx) {
DateFormat format = dateMap.get(idx);
if (format == null) {
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java Fri Nov 29 19:06:13 2013
@@ -39,4 +39,23 @@ public class MapBackedARFFModelTest exte
Map<String, Integer> windyValues = nominalMap.get(windy);
assertEquals(77, windyValues.get(breezy).intValue());
}
+
+ @Test
+ public void processBadNumeric() {
+ ARFFModel model = new MapBackedARFFModel();
+ model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
+ model.addType(77, ARFFType.REAL);
+ assertTrue(0.0 == model.getValue("b1shkt70694difsmmmdv0ikmoh", 77));
+ }
+
+ @Test
+ public void processGoodNumeric() {
+ ARFFModel model = new MapBackedARFFModel();
+ model.addLabel("1234", 77);
+ model.addType(77, ARFFType.INTEGER);
+ assertTrue(1234 == model.getValue("1234", 77));
+ model.addLabel("131.34", 78);
+ model.addType(78, ARFFType.REAL);
+ assertTrue(131.34 == model.getValue("131.34", 78));
+ }
}