You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/12/15 19:29:37 UTC

svn commit: r1551035 - in /mahout/trunk: ./ integration/src/main/java/org/apache/mahout/utils/vectors/arff/ integration/src/test/java/org/apache/mahout/utils/vectors/arff/ integration/src/test/resources/

Author: smarthi
Date: Sun Dec 15 18:29:36 2013
New Revision: 1551035

URL: http://svn.apache.org/r1551035
Log:
MAHOUT-1371: Arff loader can misinterpret nominals with integer, real or string

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
    mahout/trunk/integration/src/test/resources/date.arff
    mahout/trunk/integration/src/test/resources/non-numeric-1.arff

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Dec 15 18:29:36 2013
@@ -2,7 +2,7 @@ Mahout Change Log
 
 Release 0.9 - unreleased
 
-  MAHOUT-1380: Streaming KMeans when executed in Sequential Mode (smarthi)
+  MAHOUT-1380: Streaming KMeans fails when executed in Sequential Mode (smarthi)
 
   MAHOUT-1379: ClusterQualitySummarizer fails with the new T-Digest for clusters with 1 data point (smarthi)
 

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java Sun Dec 15 18:29:36 2013
@@ -19,9 +19,12 @@ package org.apache.mahout.utils.vectors.
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.List;
 
 import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -31,8 +34,8 @@ final class ARFFIterator extends Abstrac
 
   // This pattern will make sure a , inside a string is not a point for split.
   // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string
-  private static final Pattern COMMA_PATTERN = Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
   private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
+  private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");
 
   private final BufferedReader reader;
   private final ARFFModel model;
@@ -64,12 +67,12 @@ final class ARFFIterator extends Abstrac
       return endOfData();
     }
     Vector result;
-    if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
-      line = line.substring(1, line.indexOf(ARFFModel.ARFF_SPARSE_END));
-      String[] splits = COMMA_PATTERN.split(line);
+    Matcher contents = DATA_PATTERN.matcher(line);
+    if (contents.find()) {
+      line = contents.group(1);
+      String[] splits = splitCSV(line);
       result = new RandomAccessSparseVector(model.getLabelSize());
       for (String split : splits) {
-        split = split.trim();
         int idIndex = split.indexOf(' ');
         int idx = Integer.parseInt(split.substring(0, idIndex).trim());
         String data = split.substring(idIndex).trim();
@@ -79,7 +82,7 @@ final class ARFFIterator extends Abstrac
       }
     } else {
       result = new DenseVector(model.getLabelSize());
-      String[] splits = COMMA_PATTERN.split(line);
+      String[] splits = splitCSV(line);
       for (int i = 0; i < splits.length; i++) {
         String split = splits[i];
         split = split.trim();
@@ -88,8 +91,54 @@ final class ARFFIterator extends Abstrac
         }
       }
     }
-    //result.setLabelBindings(labelBindings);
     return result;
   }
 
+  /**
+   * Splits a line into comma-separated tokens; a comma inside a single- or double-quoted section does not split.
+   * A backslash is dropped and the character after it is copied verbatim, so an escaped quote never opens or closes
+   * a quoted section. Quote characters stay in the tokens; the last token is added only if the buffer is non-empty.
+   * @param line text to split. NOTE(review): a line ending in a lone backslash makes charAt(i) read past the end.
+   * @return the tokens in order of appearance, each trimmed of leading and trailing whitespace
+   */
+  public static String[] splitCSV(String line) {
+    StringBuilder sb = new StringBuilder(128);
+    List<String> tokens = Lists.newArrayList();
+    char escapeChar = '\0'; // quote char that opened the current quoted section, '\0' while outside quotes
+    for (int i = 0; i < line.length(); i++) {
+      char c = line.charAt(i);
+      if (c == '\\') { // backslash: skip it and copy the following character as-is
+        i++;
+        sb.append(line.charAt(i));
+      }
+      else if (c == '"' || c == '\'') {
+        // a quote equal to the one that opened the section ends the quoted section
+        if (c == escapeChar) {
+          escapeChar = '\0';
+        }
+        else if (escapeChar == '\0') { // outside quotes: this character opens a quoted section
+          escapeChar = c;
+        }
+        sb.append(c); // quote characters are always kept in the token
+      }
+      else if (c == ',') {
+        if (escapeChar == '\0') { // comma outside quotes ends the current token
+          tokens.add(sb.toString().trim());
+          sb.setLength(0); // start work on next token
+        }
+        else {
+          sb.append(c); // comma inside quotes is ordinary text
+        }
+      }
+      else {
+        sb.append(c);
+      }
+    }
+    if (sb.length() > 0) { // flush the last token; nothing is added when the buffer is empty
+      tokens.add(sb.toString().trim());
+    }
+
+    return tokens.toArray(new String[tokens.size()]);
+  }
+
 }

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Sun Dec 15 18:29:36 2013
@@ -31,7 +31,6 @@ import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.util.Iterator;
 import java.util.Locale;
-import java.util.regex.Pattern;
 
 /**
  * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
@@ -49,9 +48,6 @@ import java.util.regex.Pattern;
  */
 public class ARFFVectorIterable implements Iterable<Vector> {
 
-  private static final Pattern COMMA_PATTERN = Pattern.compile(",");
-  private static final Pattern SPACE_PATTERN = Pattern.compile(" ");
-
   private final BufferedReader buff;
   private final ARFFModel model;
 
@@ -80,44 +76,48 @@ public class ARFFVectorIterable implemen
     String line;
     while ((line = buff.readLine()) != null) {
       line = line.trim();
-      String lower = line.toLowerCase(Locale.ENGLISH);
-      Integer labelNumInt = labelNumber;
-      if (!lower.startsWith(ARFFModel.ARFF_COMMENT)) {
-        if (lower.startsWith(ARFFModel.RELATION)) {
-          model.setRelation(ARFFType.removeQuotes(line.substring(ARFFModel.RELATION.length())));
-        } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
+      if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
+        Integer labelNumInt = labelNumber;
+        String[] lineParts = line.split("[\\s\\t]+", 2);
+
+        // is it a relation name?
+        if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
+          model.setRelation(ARFFType.removeQuotes(lineParts[1]));
+        }
+        // or an attribute
+        else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
           String label;
           ARFFType type;
-          if (lower.contains(ARFFType.NUMERIC.getIndicator())) {
-            label = ARFFType.NUMERIC.getLabel(lower);
+
+          // split the name of the attribute and its description
+          String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
+          if (attrParts.length < 2)
+            throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);
+
+          // label is attribute name
+          label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
+          if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
             type = ARFFType.NUMERIC;
-          } else if (lower.contains(ARFFType.INTEGER.getIndicator())) {
-            label = ARFFType.INTEGER.getLabel(lower);
+          } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
             type = ARFFType.INTEGER;
-          } else if (lower.contains(ARFFType.REAL.getIndicator())) {
-            label = ARFFType.REAL.getLabel(lower);
+          } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
             type = ARFFType.REAL;
-          } else if (lower.contains(ARFFType.STRING.getIndicator())) {
-            label = ARFFType.STRING.getLabel(lower);
+          } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
             type = ARFFType.STRING;
-          } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) {
-            label = ARFFType.NOMINAL.getLabel(lower);
+          } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
             type = ARFFType.NOMINAL;
-            //@ATTRIBUTE class        {Iris-setosa,Iris-versicolor,Iris-virginica}
-            int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
-            String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
+            // nominal example:
+            // @ATTRIBUTE class        {Iris-setosa,'Iris versicolor',Iris-virginica}
+            String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
             for (int i = 0; i < classes.length; i++) {
               model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
             }
-          } else if (lower.contains(ARFFType.DATE.getIndicator())) {
-            label = ARFFType.DATE.getLabel(lower);
+          } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
             type = ARFFType.DATE;
             //TODO: DateFormatter map
             DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
-            int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator());
-            String[] split = SPACE_PATTERN.split(line);
-            if (split.length >= 4) { //we have a date format
-              String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
+            String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
+            if (!formStr.isEmpty()) {
               if (formStr.startsWith("\"")) {
                 formStr = formStr.substring(1, formStr.length() - 1);
               }
@@ -126,13 +126,12 @@ public class ARFFVectorIterable implemen
             model.addDateFormat(labelNumInt, format);
             //@attribute <name> date [<date-format>]
           } else {
-            throw new UnsupportedOperationException("Invalid attribute: " + line);
+            throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
           }
           model.addLabel(label, labelNumInt);
           model.addType(labelNumInt, type);
           labelNumber++;
-        } else if (lower.startsWith(ARFFModel.DATA)) {
-          //inData = true;
+        } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
           break; //skip it
         }
       }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Sun Dec 15 18:29:36 2013
@@ -124,7 +124,7 @@ public final class ARFFVectorIterableTes
     assertEquals(1, nominalMap.size());
     Map<String, Integer> noms = nominalMap.get("bar");
     assertNotNull("nominals for bar are null", noms);
-    assertEquals(2, noms.size());
+    assertEquals(5, noms.size());
     Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
     assertNotNull("Type map null", integerARFFTypeMap);
     assertEquals(5, integerARFFTypeMap.size());
@@ -189,7 +189,7 @@ public final class ARFFVectorIterableTes
     assertEquals(1, nominalMap.size());
     Map<String,Integer> noms = nominalMap.get("bar");
     assertNotNull("nominals for bar are null", noms);
-    assertEquals(2, noms.size());
+    assertEquals(5, noms.size());
     Map<Integer,ARFFType> integerARFFTypeMap = model.getTypeMap();
     assertNotNull("Type map null", integerARFFTypeMap);
     assertEquals(5, integerARFFTypeMap.size());

Modified: mahout/trunk/integration/src/test/resources/date.arff
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/resources/date.arff?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/resources/date.arff (original)
+++ mahout/trunk/integration/src/test/resources/date.arff Sun Dec 15 18:29:36 2013
@@ -4,7 +4,7 @@
    @RELATION MahoutDateTest
 
    @ATTRIBUTE junk  NUMERIC
-   @ATTRIBUTE date1
+   @ATTRIBUTE date1   date
    @ATTRIBUTE date2   date "yyyy.MM.dd G 'at' HH:mm:ss z"
    @ATTRIBUTE date3   date "EEE, MMM d, ''yy"
    @ATTRIBUTE date4   date "K:mm a, z"

Modified: mahout/trunk/integration/src/test/resources/non-numeric-1.arff
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/resources/non-numeric-1.arff?rev=1551035&r1=1551034&r2=1551035&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/resources/non-numeric-1.arff (original)
+++ mahout/trunk/integration/src/test/resources/non-numeric-1.arff Sun Dec 15 18:29:36 2013
@@ -5,7 +5,7 @@
 
    @ATTRIBUTE junk  NUMERIC
    @ATTRIBUTE foo  NUMERIC
-   @ATTRIBUTE bar   {c,d}
+   @ATTRIBUTE bar   {c,d,'xy, numeric','marc o\'polo', e}
    @ATTRIBUTE hockey  string
    @ATTRIBUTE football   date "yyyy-MM-dd"