You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/06/22 13:58:14 UTC

svn commit: r1352857 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/arff/

Author: srowen
Date: Fri Jun 22 11:58:13 2012
New Revision: 1352857

URL: http://svn.apache.org/viewvc?rev=1352857&view=rev
Log:
MAHOUT-985 ignore ARFF instance weights, handle missing values ('?') correctly

Modified:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java Fri Jun 22 11:58:13 2012
@@ -32,6 +32,7 @@ final class ARFFIterator extends Abstrac
   // This pattern will make sure a , inside a string is not a point for split.
   // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string
   private static final Pattern COMMA_PATTERN = Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
+  private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
 
   private final BufferedReader reader;
   private final ARFFModel model;
@@ -60,7 +61,7 @@ final class ARFFIterator extends Abstrac
     }
     Vector result;
     if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
-      line = line.substring(1, line.length() - 1);
+      line = line.substring(1, line.indexOf(ARFFModel.ARFF_SPARSE_END));
       String[] splits = COMMA_PATTERN.split(line);
       result = new RandomAccessSparseVector(model.getLabelSize());
       for (String split : splits) {
@@ -68,13 +69,19 @@ final class ARFFIterator extends Abstrac
         int idIndex = split.indexOf(' ');
         int idx = Integer.parseInt(split.substring(0, idIndex).trim());
         String data = split.substring(idIndex).trim();
-        result.setQuick(idx, model.getValue(data, idx));
+        if (!"?".equals(data)) {
+          result.setQuick(idx, model.getValue(data, idx));
+        }
       }
     } else {
       result = new DenseVector(model.getLabelSize());
       String[] splits = COMMA_PATTERN.split(line);
       for (int i = 0; i < splits.length; i++) {
-        result.setQuick(i, model.getValue(splits[i], i));
+        String split = splits[i];
+        split = split.trim();
+        if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) {
+          result.setQuick(i, model.getValue(split, i));
+        }
       }
     }
     //result.setLabelBindings(labelBindings);

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java Fri Jun 22 11:58:13 2012
@@ -28,6 +28,7 @@ import java.util.Map;
  */
 public interface ARFFModel {
   String ARFF_SPARSE = "{"; //indicates the vector is sparse
+  String ARFF_SPARSE_END = "}";
   String ARFF_COMMENT = "%";
   String ATTRIBUTE = "@attribute";
   String DATA = "@data";

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Fri Jun 22 11:58:13 2012
@@ -75,24 +75,36 @@ public final class ARFFVectorIterableTes
   public void testDense() throws Exception {
     ARFFModel model = new MapBackedARFFModel();
     Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_DENSE_ARFF, model);
+    Vector firstVector = iterable.iterator().next();
+    assertEquals(1.0, firstVector.get(0), 0);
+    assertEquals(65.0, firstVector.get(1), 0);
+    assertEquals(1.0, firstVector.get(3), 0);
+    assertEquals(1.0, firstVector.get(4), 0);
+    
     int count = 0;
     for (Vector vector : iterable) {
       assertTrue("Vector is not dense", vector instanceof DenseVector);
       count++;
     }
-    assertEquals(10, count);
+    assertEquals(5, count);
   }
 
   @Test
   public void testSparse() throws Exception {
     ARFFModel model = new MapBackedARFFModel();
     Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_SPARSE_ARFF, model);
+    
+    Vector firstVector = iterable.iterator().next();
+    assertEquals(23.1, firstVector.get(1), 0);
+    assertEquals(3.23, firstVector.get(2), 0);
+    assertEquals(1.2, firstVector.get(3), 0);
+    
     int count = 0;
     for (Vector vector : iterable) {
       assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
       count++;
     }
-    assertEquals(10, count);
+    assertEquals(9, count);
   }
 
   @Test
@@ -212,17 +224,19 @@ public final class ARFFVectorIterableTes
   }
   
   private static final String SAMPLE_DENSE_ARFF = "   % Comments\n" + "   % \n" + "   % Comments go here"
-                                                  + "   % \n" + "   @RELATION Mahout\n" + '\n'
-                                                  + "   @ATTRIBUTE foo  NUMERIC\n"
-                                                  + "   @ATTRIBUTE bar   NUMERIC\n"
-                                                  + "   @ATTRIBUTE hockey  NUMERIC\n"
-                                                  + "   @ATTRIBUTE football   NUMERIC\n" + "  \n" + '\n'
-                                                  + '\n' + "   @DATA\n" + "   23.1,3.23,1.2,0.2\n"
-                                                  + "   2.9,3.0,1.2,0.2\n" + "   2.7,3.2,1.3,0.2\n"
-                                                  + "   2.6,3.1,1.23,0.2\n" + "   23.0,3.6,1.2,0.2\n"
-                                                  + "   23.2,3.9,1.7,0.2\n" + "   2.6,3.2,1.2,0.3\n"
-                                                  + "   23.0,3.2,1.23,0.2\n" + "   2.2,2.9,1.2,0.2\n"
-                                                  + "   2.9,3.1,1.23,0.1\n";
+                                                  + "   % \n" + "   @RELATION golf\n" + '\n'
+                                                  + "   @ATTRIBUTE outlook {sunny,overcast, rain}\n"
+                                                  + "   @ATTRIBUTE temperature   NUMERIC\n"
+                                                  + "   @ATTRIBUTE humidity  NUMERIC\n"
+                                                  + "   @ATTRIBUTE windy {false, true}\n" 
+                                                  + "   @ATTRIBUTE class {dont_play, play}\n" + "  \n" + '\n'  
+                                                  + '\n' + "   @DATA\n" 
+                                                  + "   sunny,    65, ?, false, dont_play, {2} \n"
+                                                  + "   sunny,    80, 90,  true, dont_play\n" 
+                                                  + "   overcast, 83, 78, false, play ,{3} \n"
+                                                  + "   rain,     70, 96, false, play\n" 
+                                                  + "   rain,     68, 80, false, play\n"
+                                                  + "   rain,     65, 70, true, play\n";
   
   private static final String SAMPLE_SPARSE_ARFF = "   % Comments\n" + "   % \n" + "   % Comments go here"
                                                    + "   % \n" + "   @RELATION Mahout\n" + '\n'
@@ -231,8 +245,8 @@ public final class ARFFVectorIterableTes
                                                    + "   @ATTRIBUTE hockey  NUMERIC\n"
                                                    + "   @ATTRIBUTE football   NUMERIC\n"
                                                    + "   @ATTRIBUTE tennis   NUMERIC\n" + "  \n" + '\n'
-                                                   + '\n' + "   @DATA\n" + "   {1 23.1,2 3.23,3 1.2,4 0.2}\n"
-                                                   + "   {0 2.9}\n" + "   {0 2.7,2 3.2,3 1.3,4 0.2}\n"
+                                                   + '\n' + "   @DATA\n" + "   {1 23.1,2 3.23,3 1.2,4 ?} {5}\n"
+                                                   + "   {0 2.9}\n" + "   {0 2.7,2 3.2,3 1.3,4 0.2} {10} \n"
                                                    + "   {1 2.6,2 3.1,3 1.23,4 0.2}\n"
                                                    + "   {1 23.0,2 3.6,3 1.2,4 0.2}\n"
                                                    + "   {0 23.2,1 3.9,3 1.7,4 0.2}\n"