You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/08/03 15:11:49 UTC
svn commit: r800349 - in /lucene/mahout/trunk/utils/src:
main/java/org/apache/mahout/utils/vectors/arff/
test/java/org/apache/mahout/utils/vectors/arff/
Author: gsingers
Date: Mon Aug 3 13:11:49 2009
New Revision: 800349
URL: http://svn.apache.org/viewvc?rev=800349&view=rev
Log:
MAHOUT-155: Nominals can span files; still need to write out the info in the other maps
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java Mon Aug 3 13:11:49 2009
@@ -46,9 +46,9 @@
*/
Map<String, Integer> getLabelBindings();
- Integer getNominalValue(String nominal);
+ Integer getNominalValue(String label, String nominal);
- void addNominal(String nominal, int idx);
+ void addNominal(String label, String nominal, int idx);
DateFormat getDateFormat(Integer idx);
@@ -70,7 +70,7 @@
double getValue(String data, int idx);
- Map<String, Integer> getNominalMap();
+ Map<String, Map<String, Integer>> getNominalMap();
int getLabelSize();
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java Mon Aug 3 13:11:49 2009
@@ -15,6 +15,8 @@
public String getLabel(String line) {
- return line.substring(ARFFModel.ATTRIBUTE.length(), line.length() - indicator.length()).trim();
+ int idx = line.indexOf(indicator);
+ return line.substring(ARFFModel.ATTRIBUTE.length(),
+ idx).trim();
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Mon Aug 3 13:11:49 2009
@@ -95,25 +95,25 @@
} else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
String label;
if (lower.indexOf(ARFFType.NUMERIC.getIndicator()) != -1) {
- label = ARFFType.NUMERIC.getLabel(line);
+ label = ARFFType.NUMERIC.getLabel(lower);
type = ARFFType.NUMERIC;
} else if (lower.indexOf(ARFFType.STRING.getIndicator()) != -1) {
- label = ARFFType.STRING.getLabel(line);
+ label = ARFFType.STRING.getLabel(lower);
type = ARFFType.STRING;
//TODO: create a map so we know which
} else if (lower.indexOf(ARFFType.NOMINAL.getIndicator()) != -1) {
- label = ARFFType.NOMINAL.getLabel(line);
+ label = ARFFType.NOMINAL.getLabel(lower);
type = ARFFType.NOMINAL;
//@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
String [] classes = line.substring(classIdx + 1, line.length() - 1).split(",");
for (int i = 0; i < classes.length; i++) {
- model.addNominal(classes[i].trim(), i);
+ model.addNominal(label, classes[i].trim(), i);
}
} else if (lower.indexOf(ARFFType.DATE.getIndicator()) != -1) {
- label = ARFFType.DATE.getLabel(line);
+ label = ARFFType.DATE.getLabel(lower);
type = ARFFType.DATE;
//TODO: DateFormatter map
DateFormat format = ARFFModel.DEFAULT_DATE_FORMAT;
@@ -184,7 +184,6 @@
for (int i = 0; i < splits.length; i++) {
String[] data = splits[i].split(" ");//first is index, second is
int idx = Integer.parseInt(data[0]);
-
result.setQuick(idx, model.getValue(data[1], idx));
}
} else {
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java Mon Aug 3 13:11:49 2009
@@ -151,7 +151,8 @@
private static void writeFile(String outWriter, String outDir, File file,
long maxDocs, ARFFModel arffModel) throws IOException {
log.info("Converting File: " + file);
- ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1);
+ ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1,
+ arffModel.getNominalMap());
ARFFVectorIterable iteratable = new ARFFVectorIterable(file, model);
String outFile = outDir + "/" + file.getName() + ".mvc";
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Mon Aug 3 13:11:49 2009
@@ -34,22 +34,25 @@
protected String relation;
private Map<String, Integer> labelBindings;
+ private Map<Integer, String> idxLabel;
private Map<Integer, ARFFType> typeMap; //key is the vector index, value is the type
private Map<Integer, DateFormat> dateMap;
- private Map<String, Integer> nominalMap;
+ private Map<String, Map<String, Integer>> nominalMap;
private Map<String, Long> words;
public MapBackedARFFModel() {
- this(new HashMap<String, Long>(), 1);
+ this(new HashMap<String, Long>(), 1, new HashMap<String, Map<String, Integer>>());
}
- public MapBackedARFFModel(Map<String, Long> words, long wordCount) {
+ public MapBackedARFFModel(Map<String, Long> words, long wordCount, Map<String, Map<String, Integer>> nominalMap) {
this.words = words;
this.wordCount = wordCount;
labelBindings = new HashMap<String, Integer>();
+ idxLabel = new HashMap<Integer, String>();
typeMap = new HashMap<Integer, ARFFType>();
dateMap = new HashMap<Integer, DateFormat>();
- nominalMap = new HashMap<String, Integer>();
+ this.nominalMap = nominalMap;
+
}
public String getRelation() {
@@ -87,7 +90,8 @@
break;
}
case NOMINAL: {
- result = processNominal(data);
+ String label = idxLabel.get(idx);
+ result = processNominal(label, data);
break;
}
@@ -96,13 +100,20 @@
return result;
}
- protected double processNominal(String data) {
+ protected double processNominal(String label, String data) {
double result;
- Integer ord = nominalMap.get(data);
- if (ord == null) {
- throw new RuntimeException("Invalid nominal: " + data);
+ Map<String, Integer> classes = nominalMap.get(label);
+ if (classes != null) {
+ Integer ord = classes.get(data);
+ if (ord != null) {
+ result = ord;
+ } else {
+ throw new RuntimeException("Invalid nominal: " + data + " for label: " + label);
+ }
+ } else {
+ throw new RuntimeException("Invalid nominal label: " + label + " Data: " + data);
}
- result = ord;
+
return result;
}
@@ -147,11 +158,11 @@
}
/**
- * The vector attributes (labels in Mahout speak)
+ * The vector attributes (labels in Mahout speak), unmodifiable
* @return the map
*/
public Map<String, Integer> getLabelBindings() {
- return labelBindings;
+ return Collections.unmodifiableMap(labelBindings);
}
/**
@@ -171,11 +182,11 @@
}
/**
- * Map nominals to ids
+ * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
* @return the map
*/
- public Map<String, Integer> getNominalMap() {
- return Collections.unmodifiableMap(nominalMap);
+ public Map<String, Map<String, Integer>> getNominalMap() {
+ return nominalMap;
}
/**
@@ -186,12 +197,17 @@
return words;
}
- public Integer getNominalValue(String nominal){
- return nominalMap.get(nominal);
+ public Integer getNominalValue(String label, String nominal){
+ return nominalMap.get(label).get(nominal);
}
- public void addNominal(String nominal, int idx) {
- nominalMap.put(nominal, idx);
+ public void addNominal(String label, String nominal, int idx) {
+ Map<String, Integer> noms = nominalMap.get(label);
+ if (noms == null) {
+ noms = new HashMap<String, Integer>();
+ nominalMap.put(label, noms);
+ }
+ noms.put(nominal, idx);
}
public DateFormat getDateFormat(Integer idx){
@@ -208,6 +224,7 @@
public void addLabel(String label, Integer idx) {
labelBindings.put(label, idx);
+ idxLabel.put(idx, label);
}
public ARFFType getARFFType(Integer idx){
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Mon Aug 3 13:11:49 2009
@@ -22,9 +22,9 @@
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.strings.StringUtil;
+import java.text.DateFormat;
import java.util.Iterator;
import java.util.Map;
-import java.text.DateFormat;
/**
@@ -50,6 +50,7 @@
assertTrue(iterable.getModel().getRelation() + " is not equal to " + "Mahout", iterable.getModel().getRelation().equals("Mahout") == true);
Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
assertNotNull(bindings);
+ assertTrue("bindings Size: " + bindings.size() + " is not: " + 5, bindings.size() == 5);
Iterator<Vector> iter = iterable.iterator();
assertTrue(iter.hasNext());
Vector next = iter.next();
@@ -97,32 +98,73 @@
}
public void testNonNumeric() throws Exception {
- try {
- MapBackedARFFModel model = new MapBackedARFFModel();
- ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
- int count = 0;
- for (Vector vector : iterable) {
- assertTrue("Vector is not dense", vector instanceof SparseVector);
- count++;
- }
- assertTrue(count + " does not equal: " + 10, count == 10);
- Map<String, Integer> nominalMap = iterable.getModel().getNominalMap();
- assertNotNull(nominalMap);
- assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 2, nominalMap.size() == 2);
- Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
- assertNotNull("Type map null", integerARFFTypeMap);
- assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
- Map<String, Long> words = model.getWords();
- assertNotNull("words null", words);
- assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
- System.out.println("Words: " + words);
- Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
- assertNotNull("date format null", integerDateFormatMap);
- assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
- } catch (UnsupportedOperationException e) {
+ MapBackedARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof SparseVector);
+ count++;
}
+ assertTrue(count + " does not equal: " + 10, count == 10);
+ Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
+ assertNotNull(nominalMap);
+ assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 1, nominalMap.size() == 1);
+ Map<String, Integer> noms = nominalMap.get("bar");
+ assertNotNull("nominals for bar are null", noms);
+ assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
+ Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+ assertNotNull("Type map null", integerARFFTypeMap);
+ assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
+ Map<String, Long> words = model.getWords();
+ assertNotNull("words null", words);
+ assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
+ System.out.println("Words: " + words);
+ Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+ assertNotNull("date format null", integerDateFormatMap);
+ assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
+
+ }
+ public void testMultipleNoms() throws Exception {
+ MapBackedARFFModel model = new MapBackedARFFModel();
+ ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
+ int count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof SparseVector);
+ count++;
+ }
+ assertTrue(count + " does not equal: " + 10, count == 10);
+ Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
+ assertNotNull(nominalMap);
+ assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 1, nominalMap.size() == 1);
+ Map<String, Integer> noms = nominalMap.get("bar");
+ assertNotNull("nominals for bar are null", noms);
+ assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
+ Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+ assertNotNull("Type map null", integerARFFTypeMap);
+ assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
+ Map<String, Long> words = model.getWords();
+ assertNotNull("words null", words);
+ assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
+ System.out.println("Words: " + words);
+ Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+ assertNotNull("date format null", integerDateFormatMap);
+ assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
+ model = new MapBackedARFFModel(model.getWords(), model.getWordCount(),
+ model.getNominalMap());
+ iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF2, model);
+ count = 0;
+ for (Vector vector : iterable) {
+ assertTrue("Vector is not dense", vector instanceof SparseVector);
+ count++;
+ }
+ nominalMap = model.getNominalMap();
+ assertNotNull(nominalMap);
+ assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 2, nominalMap.size() == 2);
+ noms = nominalMap.get("test");
+ assertNotNull("nominals for bar are null", noms);
+ assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
}
@@ -203,4 +245,30 @@
" {3 howe,4 1983-06-23}\n" +
" {0 2.2,2 d,3 messier,4 2008-11-23}\n" +
" {2 c,3 roy,4 1973-10-13}\n";
+
+ public static final String NON_NUMERIC_ARFF2 = " % Comments\n" +
+ " % \n" +
+ " % Comments go here" +
+ " % \n" +
+ " @RELATION Mahout\n" +
+ "\n" +
+ " @ATTRIBUTE junk NUMERIC\n" +
+ " @ATTRIBUTE foo NUMERIC\n" +
+ " @ATTRIBUTE test {f,z}\n" +
+ " @ATTRIBUTE hockey string\n" +
+ " @ATTRIBUTE football date \"yyyy-MM-dd\"\n" +
+ " \n" +
+ "\n" +
+ "\n" +
+ " @DATA\n" +
+ " {2 f,3 gretzky,4 1973-10-23}\n" +
+ " {1 2.9,2 z,3 orr,4 1973-11-23}\n" +
+ " {2 f,3 bossy,4 1981-10-23}\n" +
+ " {1 2.6,2 f,3 lefleur,4 1989-10-23}\n" +
+ " {3 esposito,4 1973-04-23}\n" +
+ " {1 23.2,2 z,3 chelios,4 1999-2-23}\n" +
+ " {3 richard,4 1973-10-12}\n" +
+ " {3 howe,4 1983-06-23}\n" +
+ " {0 2.2,2 f,3 messier,4 2008-11-23}\n" +
+ " {2 f,3 roy,4 1973-10-13}\n";
}