You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ad...@apache.org on 2009/10/11 09:02:21 UTC

svn commit: r824033 - /lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java

Author: adeneche
Date: Sun Oct 11 07:02:20 2009
New Revision: 824033

URL: http://svn.apache.org/viewvc?rev=824033&view=rev
Log:
MAHOUT-133 - CDInfosToolTest failure

Modified:
    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java

Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java?rev=824033&r1=824032&r2=824033&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java (original)
+++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java Sun Oct 11 07:02:20 2009
@@ -24,6 +24,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.mahout.ga.watchmaker.cd.tool.DescriptionUtils.Range;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.commons.lang.ArrayUtils;
 
 import java.io.BufferedWriter;
 import java.io.IOException;
@@ -36,6 +37,8 @@
 
 public class CDInfosToolTest extends TestCase {
 
+  /** max number of distinct values for any nominal attribute */
+  private static final int MAX_NOMINAL_VALUES = 50;
   private Random rng;
 
   @Override
@@ -63,6 +66,14 @@
     return new Descriptors(descriptors);
   }
 
+  /**
+   * generate random descriptions given the attribute descriptors.<br> -
+   * numerical attributes: generate random min and max values<br> - nominal
+   * attributes: generate a random list of values
+   *
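+   * @param descriptors attribute descriptors (numerical, nominal or ignored)
+   * @return one array per attribute: {min, max} for a numerical attribute, the distinct values for a nominal one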
+   */
   private Object[][] randomDescriptions(Descriptors descriptors) {
     int nbattrs = descriptors.size();
     Object[][] descriptions = new Object[nbattrs][];
@@ -79,7 +90,7 @@
         descriptions[index] = new Double[] { min, max };
       } else if (descriptors.isNominal(index)) {
         // categorical attribute
-        int nbvalues = rng.nextInt(50) + 1;
+        int nbvalues = rng.nextInt(MAX_NOMINAL_VALUES) + 1;
         descriptions[index] = new Object[nbvalues];
         for (int vindex = 0; vindex < nbvalues; vindex++) {
           descriptions[index][vindex] = "val_" + index + '_' + vindex;
@@ -92,15 +103,25 @@
 
   private void randomDataset(FileSystem fs, Path input, Descriptors descriptors,
       Object[][] descriptions) throws IOException {
+    boolean[][] appeared = new boolean[descriptions.length][];
+    for (int desc = 0; desc < descriptors.size(); desc++) {
+      // appeared is used only by nominal attributes
+      if (descriptors.isNominal(desc)) {
+        appeared[desc] = new boolean[descriptions[desc].length];
+      }
+    }
+
     int nbfiles = rng.nextInt(20) + 1;
 
     for (int floop = 0; floop < nbfiles; floop++) {
       FSDataOutputStream out = fs.create(new Path(input, "file." + floop));
       BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
 
-      int nblines = rng.nextInt(200) + 1;
+      // make sure we have enough room to allow all nominal values to appear in the data
+      int nblines = rng.nextInt(200) + MAX_NOMINAL_VALUES;
+
       for (int line = 0; line < nblines; line++) {
-        writer.write(randomLine(descriptors, descriptions));
+        writer.write(randomLine(descriptors, descriptions, appeared));
         writer.newLine();
       }
 
@@ -108,7 +129,18 @@
     }
   }
 
-  private String randomLine(Descriptors descriptors, Object[][] descriptions) {
+  /**
+   * generates a random line using the given information
+   *
+   * @param descriptors attributes descriptions
+   * @param descriptions detailed attributes descriptions:<br> - min and max
+   *        values for numerical attributes<br> - all distinct values for
+   *        nominal attributes
+   * @param appeared used to make sure that each nominal attribute's value
+   *        appears at least once in the dataset
+   * @return the generated line
+   */
+  private String randomLine(Descriptors descriptors, Object[][] descriptions, boolean[][] appeared) {
     StringBuilder buffer = new StringBuilder();
 
     for (int index = 0; index < descriptors.size(); index++) {
@@ -122,9 +154,21 @@
       } else if (descriptors.isNominal(index)) {
         // categorical attribute
         int nbvalues = descriptions[index].length;
-        int vindex = rng.nextInt(nbvalues);
+        // choose a random value
+        int vindex;
+        if (ArrayUtils.contains(appeared[index], false)) {
+          // if some values never appeared in the dataset, start with them
+          do {
+            vindex = rng.nextInt(nbvalues);
+          } while (appeared[index][vindex]);
+        } else {
+          // choose any value
+          vindex = rng.nextInt(nbvalues);
+        }
 
         buffer.append(descriptions[index][vindex]);
+
+        appeared[index][vindex] = true;
       } else {
         // ignored attribute (any value is correct)
         buffer.append('I');
@@ -149,60 +193,63 @@
   }
 
   public void testGatherInfos() throws Exception {
-    int maxattr = 100; // max number of attributes
-    int nbattrs = rng.nextInt(maxattr) + 1;
-
-    // random descriptors
-    double numRate = rng.nextDouble();
-    double catRate = rng.nextDouble() * (1.0 - numRate);
-    Descriptors descriptors = randomDescriptors(nbattrs, numRate, catRate);
-
-    // random descriptions
-    Object[][] descriptions = randomDescriptions(descriptors);
-
-    // random dataset
-    Path inpath = new Path("input");
-    FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
-    if (fs.exists(inpath)) {
-      fs.delete(inpath, true);
-    }
-
-    randomDataset(fs, inpath, descriptors, descriptions);
-
-    // Start the tool
-    List<String> result = new ArrayList<String>();
-    CDInfosTool.gatherInfos(descriptors, inpath, result);
-
-    // check the results
-    Collection<String> target = new ArrayList<String>();
-
-    assertEquals(nbNonIgnored(descriptors), result.size());
-    int rindex = 0;
-    for (int index = 0; index < nbattrs; index++) {
-      if (descriptors.isIgnored(index)) {
-        continue;
-      } 
-
-      String description = result.get(rindex++);
+    int n = 1; // put a greater value when you search for some nasty bug
+    for (int nloop = 0; nloop < n; nloop++) {
+      int maxattr = 100; // max number of attributes
+      int nbattrs = rng.nextInt(maxattr) + 1;
+
+      // random descriptors
+      double numRate = rng.nextDouble();
+      double catRate = rng.nextDouble() * (1.0 - numRate);
+      Descriptors descriptors = randomDescriptors(nbattrs, numRate, catRate);
+
+      // random descriptions
+      Object[][] descriptions = randomDescriptions(descriptors);
+
+      // random dataset
+      Path inpath = new Path("input");
+      FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
+      if (fs.exists(inpath)) {
+        fs.delete(inpath, true);
+      }
+
+      randomDataset(fs, inpath, descriptors, descriptions);
+
+      // Start the tool
+      List<String> result = new ArrayList<String>();
+      CDInfosTool.gatherInfos(descriptors, inpath, result);
+
+      // check the results
+      Collection<String> target = new ArrayList<String>();
+
+      assertEquals(nbNonIgnored(descriptors), result.size());
+      int rindex = 0;
+      for (int index = 0; index < nbattrs; index++) {
+        if (descriptors.isIgnored(index)) {
+          continue;
+        }
 
-      if (descriptors.isNumerical(index)) {
-        // numerical attribute
-        double min = (Double) descriptions[index][0];
-        double max = (Double) descriptions[index][1];
-        Range range = DescriptionUtils.extractNumericalRange(description);
+        String description = result.get(rindex++);
 
-        assertTrue("bad min value for attribute (" + index + ')',
-            min <= range.min);
-        assertTrue("bad max value for attribute (" + index + ')',
-            max >= range.max);
-      } else if (descriptors.isNominal(index)) {
-        // categorical attribute
-        Object[] values = descriptions[index];
-        target.clear();
-        DescriptionUtils.extractNominalValues(description, target);
+        if (descriptors.isNumerical(index)) {
+          // numerical attribute
+          double min = (Double) descriptions[index][0];
+          double max = (Double) descriptions[index][1];
+          Range range = DescriptionUtils.extractNumericalRange(description);
+
+          assertTrue("bad min value for attribute (" + index + ')',
+                  min <= range.min);
+          assertTrue("bad max value for attribute (" + index + ')',
+                  max >= range.max);
+        } else if (descriptors.isNominal(index)) {
+          // categorical attribute
+          Object[] values = descriptions[index];
+          target.clear();
+          DescriptionUtils.extractNominalValues(description, target);
 
-        assertEquals(values.length, target.size());
-        assertTrue(target.containsAll(Arrays.asList(values)));
+          assertEquals(values.length, target.size());
+          assertTrue(target.containsAll(Arrays.asList(values)));
+        }
       }
     }
   }