You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/11/29 20:06:13 UTC

svn commit: r1546624 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/arff/

Author: smarthi
Date: Fri Nov 29 19:06:13 2013
New Revision: 1546624

URL: http://svn.apache.org/r1546624
Log:
MAHOUT-1285: Arff loader can misparse string data as double

Modified:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Fri Nov 29 19:06:13 2013
@@ -82,59 +82,59 @@ public class ARFFVectorIterable implemen
       line = line.trim();
       String lower = line.toLowerCase(Locale.ENGLISH);
       Integer labelNumInt = labelNumber;
-      if (lower.startsWith(ARFFModel.ARFF_COMMENT)) {
-        continue;
-      } else if (lower.startsWith(ARFFModel.RELATION)) {
-        model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
-      } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
-        String label;
-        ARFFType type;
-        if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
-          label = ARFFType.NUMERIC.getLabel(lower);
-          type = ARFFType.NUMERIC;
-        } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
-          label = ARFFType.INTEGER.getLabel(lower);
-          type = ARFFType.INTEGER;
-        } else if (lower.contains(ARFFType.REAL.getIndicator())) {
-          label = ARFFType.REAL.getLabel(lower);
-          type = ARFFType.REAL;
-        } else if (lower.contains(ARFFType.STRING.getIndicator())) {
-          label = ARFFType.STRING.getLabel(lower);
-          type = ARFFType.STRING;
-        } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
-          label = ARFFType.NOMINAL.getLabel(lower);
-          type = ARFFType.NOMINAL;
-          //@ATTRIBUTE class        {Iris-setosa,Iris-versicolor,Iris-virginica}
-          int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
-          String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
-          for (int i = 0; i < classes.length; i++) {
-            model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
-          }
-        } else if (lower.contains(ARFFType.DATE.getIndicator())) {
-          label = ARFFType.DATE.getLabel(lower);
-          type = ARFFType.DATE;
-          //TODO: DateFormatter map
-          DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
-          int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
-          String[] split = SPACE_PATTERN.split(line);
-          if (split.length >= 4) { //we have a date format
-            String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
-            if (formStr.startsWith("\"")) {
-              formStr = formStr.substring(1, formStr.length() - 1);
+      if (!lower.startsWith(ARFFModel.ARFF_COMMENT)) {
+        if (lower.startsWith(ARFFModel.RELATION)) {
+          model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
+        } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
+          String label;
+          ARFFType type;
+          if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
+            label = ARFFType.NUMERIC.getLabel(lower);
+            type = ARFFType.NUMERIC;
+          } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
+            label = ARFFType.INTEGER.getLabel(lower);
+            type = ARFFType.INTEGER;
+          } else if (lower.contains(ARFFType.REAL.getIndicator())) {
+            label = ARFFType.REAL.getLabel(lower);
+            type = ARFFType.REAL;
+          } else if (lower.contains(ARFFType.STRING.getIndicator())) {
+            label = ARFFType.STRING.getLabel(lower);
+            type = ARFFType.STRING;
+          } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
+            label = ARFFType.NOMINAL.getLabel(lower);
+            type = ARFFType.NOMINAL;
+            //@ATTRIBUTE class        {Iris-setosa,Iris-versicolor,Iris-virginica}
+            int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
+            String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
+            for (int i = 0; i < classes.length; i++) {
+              model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
+            }
+          } else if (lower.contains(ARFFType.DATE.getIndicator())) {
+            label = ARFFType.DATE.getLabel(lower);
+            type = ARFFType.DATE;
+            //TODO: DateFormatter map
+            DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
+            int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
+            String[] split = SPACE_PATTERN.split(line);
+            if (split.length >= 4) { //we have a date format
+              String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
+              if (formStr.startsWith("\"")) {
+                formStr = formStr.substring(1, formStr.length() - 1);
+              }
+              format = new SimpleDateFormat(formStr, Locale.ENGLISH);
             }
-            format = new SimpleDateFormat(formStr, Locale.ENGLISH);
+            model.addDateFormat(labelNumInt, format);
+            //@attribute <name> date [<date-format>]
+          } else {
+            throw new UnsupportedOperationException("Invalid attribute: " + line);
           }
-          model.addDateFormat(labelNumInt, format);
-          //@attribute <name> date [<date-format>]
-        } else {
-          throw new UnsupportedOperationException("Invalid attribute: " + line);
+          model.addLabel(label, labelNumInt);
+          model.addType(labelNumInt, type);
+          labelNumber++;
+        } else if (lower.startsWith(ARFFModel.DATA)) {
+          //inData = true;
+          break; //skip it
         }
-        model.addLabel(label, labelNumInt);
-        model.addType(labelNumInt, type);
-        labelNumber++;
-      } else if (lower.startsWith(ARFFModel.DATA)) {
-        //inData = true;
-        break; //skip it
       }
     }
 

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Fri Nov 29 19:06:13 2013
@@ -20,7 +20,9 @@ package org.apache.mahout.utils.vectors.
 import com.google.common.collect.Maps;
 
 import java.text.DateFormat;
+import java.text.NumberFormat;
 import java.text.ParseException;
+import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
 import java.util.Collections;
 import java.util.Date;
@@ -143,9 +145,19 @@ public class MapBackedARFFModel implemen
   }
   
   protected static double processNumeric(String data) {
-    return Double.parseDouble(data);
+    if (isNumeric(data)) {
+      return Double.parseDouble(data);
+    }
+    return 0.0;
   }
-  
+
+  public static boolean isNumeric(String str) {
+    NumberFormat formatter = NumberFormat.getInstance();
+    ParsePosition parsePosition = new ParsePosition(0);
+    formatter.parse(str, parsePosition);
+    return str.length() == parsePosition.getIndex();
+  }
+
   protected double processDate(String data, int idx) {
     DateFormat format = dateMap.get(idx);
     if (format == null) {

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java?rev=1546624&r1=1546623&r2=1546624&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java Fri Nov 29 19:06:13 2013
@@ -39,4 +39,23 @@ public class MapBackedARFFModelTest exte
     Map<String, Integer> windyValues = nominalMap.get(windy);
     assertEquals(77, windyValues.get(breezy).intValue());
   }
+
+  @Test
+  public void processBadNumeric() {
+    ARFFModel model = new MapBackedARFFModel();
+    model.addLabel("b1shkt70694difsmmmdv0ikmoh", 77);
+    model.addType(77, ARFFType.REAL);
+    assertTrue(0.0 == model.getValue("b1shkt70694difsmmmdv0ikmoh", 77));
+  }
+
+  @Test
+  public void processGoodNumeric() {
+    ARFFModel model = new MapBackedARFFModel();
+    model.addLabel("1234", 77);
+    model.addType(77, ARFFType.INTEGER);
+    assertTrue(1234 == model.getValue("1234", 77));
+    model.addLabel("131.34", 78);
+    model.addType(78, ARFFType.REAL);
+    assertTrue(131.34 == model.getValue("131.34", 78));
+  }
 }