You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/08/03 15:11:49 UTC

svn commit: r800349 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/arff/

Author: gsingers
Date: Mon Aug  3 13:11:49 2009
New Revision: 800349

URL: http://svn.apache.org/viewvc?rev=800349&view=rev
Log:
MAHOUT-155: Nominals can span files, still need to write out info in other maps

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java Mon Aug  3 13:11:49 2009
@@ -46,9 +46,9 @@
    */
   Map<String, Integer> getLabelBindings();
 
-  Integer getNominalValue(String nominal);
+  Integer getNominalValue(String label, String nominal);
 
-  void addNominal(String nominal, int idx);
+  void addNominal(String label, String nominal, int idx);
 
   DateFormat getDateFormat(Integer idx);
 
@@ -70,7 +70,7 @@
 
   double getValue(String data, int idx);
 
-  Map<String, Integer> getNominalMap();
+  Map<String, Map<String, Integer>> getNominalMap();
 
   int getLabelSize();
 

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java Mon Aug  3 13:11:49 2009
@@ -15,6 +15,8 @@
   
 
   public String getLabel(String line) {
-    return line.substring(ARFFModel.ATTRIBUTE.length(), line.length() - indicator.length()).trim();
+    int idx = line.indexOf(indicator);
+    return line.substring(ARFFModel.ATTRIBUTE.length(),
+            idx).trim();
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Mon Aug  3 13:11:49 2009
@@ -95,25 +95,25 @@
       } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
         String label;
         if (lower.indexOf(ARFFType.NUMERIC.getIndicator()) != -1) {
-          label = ARFFType.NUMERIC.getLabel(line);
+          label = ARFFType.NUMERIC.getLabel(lower);
           type = ARFFType.NUMERIC;
         } else if (lower.indexOf(ARFFType.STRING.getIndicator()) != -1) {
-          label = ARFFType.STRING.getLabel(line);
+          label = ARFFType.STRING.getLabel(lower);
           type = ARFFType.STRING;
           //TODO: create a map so we know which
 
         } else if (lower.indexOf(ARFFType.NOMINAL.getIndicator()) != -1) {
-          label = ARFFType.NOMINAL.getLabel(line);
+          label = ARFFType.NOMINAL.getLabel(lower);
           type = ARFFType.NOMINAL;
           //@ATTRIBUTE class        {Iris-setosa,Iris-versicolor,Iris-virginica}
           int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
           String [] classes = line.substring(classIdx + 1, line.length() - 1).split(",");
           for (int i = 0; i < classes.length; i++) {
-            model.addNominal(classes[i].trim(), i);
+            model.addNominal(label, classes[i].trim(), i);
           }
 
         } else if (lower.indexOf(ARFFType.DATE.getIndicator()) != -1) {
-          label = ARFFType.DATE.getLabel(line);
+          label = ARFFType.DATE.getLabel(lower);
           type = ARFFType.DATE;
           //TODO: DateFormatter map
           DateFormat format = ARFFModel.DEFAULT_DATE_FORMAT;
@@ -184,7 +184,6 @@
         for (int i = 0; i < splits.length; i++) {
           String[] data = splits[i].split(" ");//first is index, second is
           int idx = Integer.parseInt(data[0]);
-
           result.setQuick(idx, model.getValue(data[1], idx));
         }
       } else {

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java Mon Aug  3 13:11:49 2009
@@ -151,7 +151,8 @@
   private static void writeFile(String outWriter, String outDir, File file,
                                 long maxDocs, ARFFModel arffModel) throws IOException {
     log.info("Converting File: " + file);
-    ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1);
+    ARFFModel model = new MapBackedARFFModel(arffModel.getWords(), arffModel.getWordCount() + 1,
+            arffModel.getNominalMap());
     ARFFVectorIterable iteratable = new ARFFVectorIterable(file, model);
     String outFile = outDir + "/" + file.getName() + ".mvc";
 

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Mon Aug  3 13:11:49 2009
@@ -34,22 +34,25 @@
   protected String relation;
 
   private Map<String, Integer> labelBindings;
+  private Map<Integer, String> idxLabel;
   private Map<Integer, ARFFType> typeMap; //key is the vector index, value is the type
   private Map<Integer, DateFormat> dateMap;
-  private Map<String, Integer> nominalMap;
+  private Map<String, Map<String, Integer>> nominalMap;
   private Map<String, Long> words;
 
   public MapBackedARFFModel() {
-    this(new HashMap<String, Long>(), 1);    
+    this(new HashMap<String, Long>(), 1, new HashMap<String, Map<String, Integer>>());
   }
 
-  public MapBackedARFFModel(Map<String, Long> words, long wordCount) {
+  public MapBackedARFFModel(Map<String, Long> words, long wordCount, Map<String, Map<String, Integer>> nominalMap) {
     this.words = words;
     this.wordCount = wordCount;
     labelBindings = new HashMap<String, Integer>();
+    idxLabel = new HashMap<Integer, String>();
     typeMap = new HashMap<Integer, ARFFType>();
     dateMap = new HashMap<Integer, DateFormat>();
-    nominalMap = new HashMap<String, Integer>();
+    this.nominalMap = nominalMap;
+
   }
 
   public String getRelation() {
@@ -87,7 +90,8 @@
         break;
       }
       case NOMINAL: {
-        result = processNominal(data);
+        String label = idxLabel.get(idx);
+        result = processNominal(label, data);
         break;
       }
 
@@ -96,13 +100,20 @@
     return result;
   }
 
-  protected double processNominal(String data) {
+  protected double processNominal(String label, String data) {
     double result;
-    Integer ord = nominalMap.get(data);
-    if (ord == null) {
-      throw new RuntimeException("Invalid nominal: " + data);
+    Map<String, Integer> classes = nominalMap.get(label);
+    if (classes != null) {
+      Integer ord = classes.get(data);
+      if (ord != null) {
+        result = ord;
+      } else {
+        throw new RuntimeException("Invalid nominal: " + data + " for label: " + label);
+      }
+    } else {
+      throw new RuntimeException("Invalid nominal label: " + label + " Data: " + data);
     }
-    result = ord;
+
     return result;
   }
 
@@ -147,11 +158,11 @@
   }
 
   /**
-   * The vector attributes (labels in Mahout speak)
+   * The vector attributes (labels in Mahout speak), unmodifiable
    * @return the map
    */
   public Map<String, Integer> getLabelBindings() {
-    return labelBindings;
+    return Collections.unmodifiableMap(labelBindings);
   }
 
   /**
@@ -171,11 +182,11 @@
   }
 
   /**
-   * Map nominals to ids
+   * Map nominals to ids.  Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
    * @return the map
    */
-  public Map<String, Integer> getNominalMap() {
-    return Collections.unmodifiableMap(nominalMap);
+  public Map<String, Map<String, Integer>> getNominalMap() {
+    return nominalMap;
   }
 
   /**
@@ -186,12 +197,17 @@
     return words;
   }
 
-  public Integer getNominalValue(String nominal){
-    return nominalMap.get(nominal);
+  public Integer getNominalValue(String label, String nominal){ // null when the label or the nominal is unknown
+    return nominalMap.containsKey(label) ? nominalMap.get(label).get(nominal) : null;
   }
 
-  public void addNominal(String nominal, int idx) {
-    nominalMap.put(nominal, idx);
+  public void addNominal(String label, String nominal, int idx) {
+    Map<String, Integer> noms = nominalMap.get(label);
+    if (noms == null) {
+      noms = new HashMap<String, Integer>();
+      nominalMap.put(label, noms);
+    }
+    noms.put(nominal, idx);
   }
 
   public DateFormat getDateFormat(Integer idx){
@@ -208,6 +224,7 @@
 
   public void addLabel(String label, Integer idx) {
     labelBindings.put(label, idx);
+    idxLabel.put(idx, label);
   }
 
   public ARFFType getARFFType(Integer idx){

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=800349&r1=800348&r2=800349&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Mon Aug  3 13:11:49 2009
@@ -22,9 +22,9 @@
 import org.apache.mahout.matrix.Vector;
 import org.apache.mahout.utils.strings.StringUtil;
 
+import java.text.DateFormat;
 import java.util.Iterator;
 import java.util.Map;
-import java.text.DateFormat;
 
 
 /**
@@ -50,6 +50,7 @@
     assertTrue(iterable.getModel().getRelation() + " is not equal to " + "Mahout", iterable.getModel().getRelation().equals("Mahout") == true);
     Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
     assertNotNull(bindings);
+    assertTrue("bindings Size: " + bindings.size() + " is not: " + 5, bindings.size() == 5);
     Iterator<Vector> iter = iterable.iterator();
     assertTrue(iter.hasNext());
     Vector next = iter.next();
@@ -97,32 +98,73 @@
   }
 
   public void testNonNumeric() throws Exception {
-    try {
-      MapBackedARFFModel model = new MapBackedARFFModel();
-      ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
-      int count = 0;
-      for (Vector vector : iterable) {
-        assertTrue("Vector is not dense", vector instanceof SparseVector);
-        count++;
-      }
-      assertTrue(count + " does not equal: " + 10, count == 10);
-      Map<String, Integer> nominalMap = iterable.getModel().getNominalMap();
-      assertNotNull(nominalMap);
-      assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 2, nominalMap.size() == 2);
-      Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
-      assertNotNull("Type map null", integerARFFTypeMap);
-      assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
-      Map<String, Long> words = model.getWords();
-      assertNotNull("words null", words);
-      assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
-      System.out.println("Words: " + words);
-      Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
-      assertNotNull("date format null", integerDateFormatMap);
-      assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
-    } catch (UnsupportedOperationException e) {
 
+    MapBackedARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
+    int count = 0;
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not sparse", vector instanceof SparseVector);
+      count++;
     }
+    assertTrue(count + " does not equal: " + 10, count == 10);
+    Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap(); // outer key is the attribute label
+    assertNotNull(nominalMap);
+    assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 1, nominalMap.size() == 1);
+    Map<String, Integer> noms = nominalMap.get("bar");
+    assertNotNull("nominals for bar are null", noms);
+    assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
+    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+    assertNotNull("Type map null", integerARFFTypeMap);
+    assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
+    Map<String, Long> words = model.getWords();
+    assertNotNull("words null", words);
+    assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
+    System.out.println("Words: " + words);
+    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+    assertNotNull("date format null", integerDateFormatMap);
+    assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
+
+  }
 
+  public void testMultipleNoms() throws Exception { // nominal definitions must accumulate across two ARFF inputs
+    MapBackedARFFModel model = new MapBackedARFFModel();
+    ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
+    int count = 0;
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not sparse", vector instanceof SparseVector);
+      count++;
+    }
+    assertTrue(count + " does not equal: " + 10, count == 10);
+    Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
+    assertNotNull(nominalMap);
+    assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 1, nominalMap.size() == 1);
+    Map<String, Integer> noms = nominalMap.get("bar");
+    assertNotNull("nominals for bar are null", noms);
+    assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
+    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
+    assertNotNull("Type map null", integerARFFTypeMap);
+    assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5);
+    Map<String, Long> words = model.getWords();
+    assertNotNull("words null", words);
+    assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10);
+    System.out.println("Words: " + words);
+    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
+    assertNotNull("date format null", integerDateFormatMap);
+    assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1);
+    model = new MapBackedARFFModel(model.getWords(), model.getWordCount(), // second model reuses the first model's words and nominal map
+            model.getNominalMap());
+    iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF2, model);
+    count = 0;
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not sparse", vector instanceof SparseVector);
+      count++;
+    }
+    nominalMap = model.getNominalMap();
+    assertNotNull(nominalMap);
+    assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 2, nominalMap.size() == 2); // "bar" from the first input plus "test" from the second
+    noms = nominalMap.get("test");
+    assertNotNull("nominals for test are null", noms);
+    assertTrue("noms Size: " + noms.size() + " is not: " + 2, noms.size() == 2);
   }
 
 
@@ -203,4 +245,30 @@
           "   {3 howe,4 1983-06-23}\n" +
           "   {0 2.2,2 d,3 messier,4 2008-11-23}\n" +
           "   {2 c,3 roy,4 1973-10-13}\n";
+
+  public static final String NON_NUMERIC_ARFF2 = "   % Comments\n" + // second fixture: nominal attribute "test" with values f and z
+          "   % \n" +
+          "   % Comments go here\n" +
+          "   % \n" +
+          "   @RELATION Mahout\n" +
+          "\n" +
+          "   @ATTRIBUTE junk  NUMERIC\n" +
+          "   @ATTRIBUTE foo  NUMERIC\n" +
+          "   @ATTRIBUTE test   {f,z}\n" +
+          "   @ATTRIBUTE hockey  string\n" +
+          "   @ATTRIBUTE football   date \"yyyy-MM-dd\"\n" +
+          "  \n" +
+          "\n" +
+          "\n" +
+          "   @DATA\n" +
+          "   {2 f,3 gretzky,4 1973-10-23}\n" +
+          "   {1 2.9,2 z,3 orr,4 1973-11-23}\n" +
+          "   {2 f,3 bossy,4 1981-10-23}\n" +
+          "   {1 2.6,2 f,3 lefleur,4 1989-10-23}\n" +
+          "   {3 esposito,4 1973-04-23}\n" +
+          "   {1 23.2,2 z,3 chelios,4 1999-2-23}\n" +
+          "   {3 richard,4 1973-10-12}\n" +
+          "   {3 howe,4 1983-06-23}\n" +
+          "   {0 2.2,2 f,3 messier,4 2008-11-23}\n" +
+          "   {2 f,3 roy,4 1973-10-13}\n";
 }