You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/06/22 13:58:14 UTC
svn commit: r1352857 - in /mahout/trunk/integration/src:
main/java/org/apache/mahout/utils/vectors/arff/
test/java/org/apache/mahout/utils/vectors/arff/
Author: srowen
Date: Fri Jun 22 11:58:13 2012
New Revision: 1352857
URL: http://svn.apache.org/viewvc?rev=1352857&view=rev
Log:
MAHOUT-985 ignore ARFF instance weights, handle missing values (?) correctly
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java Fri Jun 22 11:58:13 2012
@@ -32,6 +32,7 @@ final class ARFFIterator extends Abstrac
// This pattern will make sure a , inside a string is not a point for split.
// Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string
private static final Pattern COMMA_PATTERN = Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
+ private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");
private final BufferedReader reader;
private final ARFFModel model;
@@ -60,7 +61,7 @@ final class ARFFIterator extends Abstrac
}
Vector result;
if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
- line = line.substring(1, line.length() - 1);
+ line = line.substring(1, line.indexOf(ARFFModel.ARFF_SPARSE_END));
String[] splits = COMMA_PATTERN.split(line);
result = new RandomAccessSparseVector(model.getLabelSize());
for (String split : splits) {
@@ -68,13 +69,19 @@ final class ARFFIterator extends Abstrac
int idIndex = split.indexOf(' ');
int idx = Integer.parseInt(split.substring(0, idIndex).trim());
String data = split.substring(idIndex).trim();
- result.setQuick(idx, model.getValue(data, idx));
+ if (!"?".equals(data)) {
+ result.setQuick(idx, model.getValue(data, idx));
+ }
}
} else {
result = new DenseVector(model.getLabelSize());
String[] splits = COMMA_PATTERN.split(line);
for (int i = 0; i < splits.length; i++) {
- result.setQuick(i, model.getValue(splits[i], i));
+ String split = splits[i];
+ split = split.trim();
+ if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) {
+ result.setQuick(i, model.getValue(split, i));
+ }
}
}
//result.setLabelBindings(labelBindings);
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java Fri Jun 22 11:58:13 2012
@@ -28,6 +28,7 @@ import java.util.Map;
*/
public interface ARFFModel {
String ARFF_SPARSE = "{"; //indicates the vector is sparse
+ String ARFF_SPARSE_END = "}";
String ARFF_COMMENT = "%";
String ATTRIBUTE = "@attribute";
String DATA = "@data";
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Fri Jun 22 11:58:13 2012
@@ -75,24 +75,36 @@ public final class ARFFVectorIterableTes
public void testDense() throws Exception {
ARFFModel model = new MapBackedARFFModel();
Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_DENSE_ARFF, model);
+ Vector firstVector = iterable.iterator().next();
+ assertEquals(1.0, firstVector.get(0), 0);
+ assertEquals(65.0, firstVector.get(1), 0);
+ assertEquals(1.0, firstVector.get(3), 0);
+ assertEquals(1.0, firstVector.get(4), 0);
+
int count = 0;
for (Vector vector : iterable) {
assertTrue("Vector is not dense", vector instanceof DenseVector);
count++;
}
- assertEquals(10, count);
+ assertEquals(5, count);
}
@Test
public void testSparse() throws Exception {
ARFFModel model = new MapBackedARFFModel();
Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_SPARSE_ARFF, model);
+
+ Vector firstVector = iterable.iterator().next();
+ assertEquals(23.1, firstVector.get(1), 0);
+ assertEquals(3.23, firstVector.get(2), 0);
+ assertEquals(1.2, firstVector.get(3), 0);
+
int count = 0;
for (Vector vector : iterable) {
assertTrue("Vector is not dense", vector instanceof RandomAccessSparseVector);
count++;
}
- assertEquals(10, count);
+ assertEquals(9, count);
}
@Test
@@ -212,17 +224,19 @@ public final class ARFFVectorIterableTes
}
private static final String SAMPLE_DENSE_ARFF = " % Comments\n" + " % \n" + " % Comments go here"
- + " % \n" + " @RELATION Mahout\n" + '\n'
- + " @ATTRIBUTE foo NUMERIC\n"
- + " @ATTRIBUTE bar NUMERIC\n"
- + " @ATTRIBUTE hockey NUMERIC\n"
- + " @ATTRIBUTE football NUMERIC\n" + " \n" + '\n'
- + '\n' + " @DATA\n" + " 23.1,3.23,1.2,0.2\n"
- + " 2.9,3.0,1.2,0.2\n" + " 2.7,3.2,1.3,0.2\n"
- + " 2.6,3.1,1.23,0.2\n" + " 23.0,3.6,1.2,0.2\n"
- + " 23.2,3.9,1.7,0.2\n" + " 2.6,3.2,1.2,0.3\n"
- + " 23.0,3.2,1.23,0.2\n" + " 2.2,2.9,1.2,0.2\n"
- + " 2.9,3.1,1.23,0.1\n";
+ + " % \n" + " @RELATION golf\n" + '\n'
+ + " @ATTRIBUTE outlook {sunny,overcast, rain}\n"
+ + " @ATTRIBUTE temperature NUMERIC\n"
+ + " @ATTRIBUTE humidity NUMERIC\n"
+ + " @ATTRIBUTE windy {false, true}\n"
+ + " @ATTRIBUTE class {dont_play, play}\n" + " \n" + '\n'
+ + '\n' + " @DATA\n"
+ + " sunny, 65, ?, false, dont_play, {2} \n"
+ + " sunny, 80, 90, true, dont_play\n"
+ + " overcast, 83, 78, false, play ,{3} \n"
+ + " rain, 70, 96, false, play\n"
+ + " rain, 68, 80, false, play\n"
+ + " rain, 65, 70, true, play\n";
private static final String SAMPLE_SPARSE_ARFF = " % Comments\n" + " % \n" + " % Comments go here"
+ " % \n" + " @RELATION Mahout\n" + '\n'
@@ -231,8 +245,8 @@ public final class ARFFVectorIterableTes
+ " @ATTRIBUTE hockey NUMERIC\n"
+ " @ATTRIBUTE football NUMERIC\n"
+ " @ATTRIBUTE tennis NUMERIC\n" + " \n" + '\n'
- + '\n' + " @DATA\n" + " {1 23.1,2 3.23,3 1.2,4 0.2}\n"
- + " {0 2.9}\n" + " {0 2.7,2 3.2,3 1.3,4 0.2}\n"
+ + '\n' + " @DATA\n" + " {1 23.1,2 3.23,3 1.2,4 ?} {5}\n"
+ + " {0 2.9}\n" + " {0 2.7,2 3.2,3 1.3,4 0.2} {10} \n"
+ " {1 2.6,2 3.1,3 1.23,4 0.2}\n"
+ " {1 23.0,2 3.6,3 1.2,4 0.2}\n"
+ " {0 23.2,1 3.9,3 1.7,4 0.2}\n"