You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ad...@apache.org on 2009/10/11 09:02:21 UTC
svn commit: r824033 -
/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
Author: adeneche
Date: Sun Oct 11 07:02:20 2009
New Revision: 824033
URL: http://svn.apache.org/viewvc?rev=824033&view=rev
Log:
MAHOUT-133 - CDInfosToolTest failure
Modified:
lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java?rev=824033&r1=824032&r2=824033&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java (original)
+++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java Sun Oct 11 07:02:20 2009
@@ -24,6 +24,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.mahout.ga.watchmaker.cd.tool.DescriptionUtils.Range;
import org.apache.mahout.common.RandomUtils;
+import org.apache.commons.lang.ArrayUtils;
import java.io.BufferedWriter;
import java.io.IOException;
@@ -36,6 +37,8 @@
public class CDInfosToolTest extends TestCase {
+ /** max number of distinct values for any nominal attribute */
+ private static final int MAX_NOMINAL_VALUES = 50;
private Random rng;
@Override
@@ -63,6 +66,14 @@
return new Descriptors(descriptors);
}
+ /**
+ * generate random descriptions given the attributes descriptors.<br> -
+ * numerical attributes: generate random min and max values<br> - nominal
+ * attributes: generate a random list of values
+ *
+ * @param descriptors
+ * @return
+ */
private Object[][] randomDescriptions(Descriptors descriptors) {
int nbattrs = descriptors.size();
Object[][] descriptions = new Object[nbattrs][];
@@ -79,7 +90,7 @@
descriptions[index] = new Double[] { min, max };
} else if (descriptors.isNominal(index)) {
// categorical attribute
- int nbvalues = rng.nextInt(50) + 1;
+ int nbvalues = rng.nextInt(MAX_NOMINAL_VALUES) + 1;
descriptions[index] = new Object[nbvalues];
for (int vindex = 0; vindex < nbvalues; vindex++) {
descriptions[index][vindex] = "val_" + index + '_' + vindex;
@@ -92,15 +103,25 @@
private void randomDataset(FileSystem fs, Path input, Descriptors descriptors,
Object[][] descriptions) throws IOException {
+ boolean[][] appeared = new boolean[descriptions.length][];
+ for (int desc = 0; desc < descriptors.size(); desc++) {
+ // appeared is used only by nominal attributes
+ if (descriptors.isNominal(desc)) {
+ appeared[desc] = new boolean[descriptions[desc].length];
+ }
+ }
+
int nbfiles = rng.nextInt(20) + 1;
for (int floop = 0; floop < nbfiles; floop++) {
FSDataOutputStream out = fs.create(new Path(input, "file." + floop));
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
- int nblines = rng.nextInt(200) + 1;
+ // make sure we have enough room to allow all nominal values to appear in the data
+ int nblines = rng.nextInt(200) + MAX_NOMINAL_VALUES;
+
for (int line = 0; line < nblines; line++) {
- writer.write(randomLine(descriptors, descriptions));
+ writer.write(randomLine(descriptors, descriptions, appeared));
writer.newLine();
}
@@ -108,7 +129,18 @@
}
}
- private String randomLine(Descriptors descriptors, Object[][] descriptions) {
+ /**
+ * generates a random line using the given information
+ *
+ * @param descriptors attributes descriptions
+ * @param descriptions detailed attributes descriptions:<br> - min and max
+ * values for numerical attributes<br> - all distinct values for
+ * nominal attributes
+ * @param appeared used to make sure that each nominal attribute's value
+ * appears at least once in the dataset
+ * @return
+ */
+ private String randomLine(Descriptors descriptors, Object[][] descriptions, boolean[][] appeared) {
StringBuilder buffer = new StringBuilder();
for (int index = 0; index < descriptors.size(); index++) {
@@ -122,9 +154,21 @@
} else if (descriptors.isNominal(index)) {
// categorical attribute
int nbvalues = descriptions[index].length;
- int vindex = rng.nextInt(nbvalues);
+ // choose a random value
+ int vindex;
+ if (ArrayUtils.contains(appeared[index], false)) {
+ // if some values never appeared in the dataset, start with them
+ do {
+ vindex = rng.nextInt(nbvalues);
+ } while (appeared[index][vindex]);
+ } else {
+ // choose any value
+ vindex = rng.nextInt(nbvalues);
+ }
buffer.append(descriptions[index][vindex]);
+
+ appeared[index][vindex] = true;
} else {
// ignored attribute (any value is correct)
buffer.append('I');
@@ -149,60 +193,63 @@
}
public void testGatherInfos() throws Exception {
- int maxattr = 100; // max number of attributes
- int nbattrs = rng.nextInt(maxattr) + 1;
-
- // random descriptors
- double numRate = rng.nextDouble();
- double catRate = rng.nextDouble() * (1.0 - numRate);
- Descriptors descriptors = randomDescriptors(nbattrs, numRate, catRate);
-
- // random descriptions
- Object[][] descriptions = randomDescriptions(descriptors);
-
- // random dataset
- Path inpath = new Path("input");
- FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
- if (fs.exists(inpath)) {
- fs.delete(inpath, true);
- }
-
- randomDataset(fs, inpath, descriptors, descriptions);
-
- // Start the tool
- List<String> result = new ArrayList<String>();
- CDInfosTool.gatherInfos(descriptors, inpath, result);
-
- // check the results
- Collection<String> target = new ArrayList<String>();
-
- assertEquals(nbNonIgnored(descriptors), result.size());
- int rindex = 0;
- for (int index = 0; index < nbattrs; index++) {
- if (descriptors.isIgnored(index)) {
- continue;
- }
-
- String description = result.get(rindex++);
+ int n = 1; // put a greater value when you search for some nasty bug
+ for (int nloop = 0; nloop < n; nloop++) {
+ int maxattr = 100; // max number of attributes
+ int nbattrs = rng.nextInt(maxattr) + 1;
+
+ // random descriptors
+ double numRate = rng.nextDouble();
+ double catRate = rng.nextDouble() * (1.0 - numRate);
+ Descriptors descriptors = randomDescriptors(nbattrs, numRate, catRate);
+
+ // random descriptions
+ Object[][] descriptions = randomDescriptions(descriptors);
+
+ // random dataset
+ Path inpath = new Path("input");
+ FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
+ if (fs.exists(inpath)) {
+ fs.delete(inpath, true);
+ }
+
+ randomDataset(fs, inpath, descriptors, descriptions);
+
+ // Start the tool
+ List<String> result = new ArrayList<String>();
+ CDInfosTool.gatherInfos(descriptors, inpath, result);
+
+ // check the results
+ Collection<String> target = new ArrayList<String>();
+
+ assertEquals(nbNonIgnored(descriptors), result.size());
+ int rindex = 0;
+ for (int index = 0; index < nbattrs; index++) {
+ if (descriptors.isIgnored(index)) {
+ continue;
+ }
- if (descriptors.isNumerical(index)) {
- // numerical attribute
- double min = (Double) descriptions[index][0];
- double max = (Double) descriptions[index][1];
- Range range = DescriptionUtils.extractNumericalRange(description);
+ String description = result.get(rindex++);
- assertTrue("bad min value for attribute (" + index + ')',
- min <= range.min);
- assertTrue("bad max value for attribute (" + index + ')',
- max >= range.max);
- } else if (descriptors.isNominal(index)) {
- // categorical attribute
- Object[] values = descriptions[index];
- target.clear();
- DescriptionUtils.extractNominalValues(description, target);
+ if (descriptors.isNumerical(index)) {
+ // numerical attribute
+ double min = (Double) descriptions[index][0];
+ double max = (Double) descriptions[index][1];
+ Range range = DescriptionUtils.extractNumericalRange(description);
+
+ assertTrue("bad min value for attribute (" + index + ')',
+ min <= range.min);
+ assertTrue("bad max value for attribute (" + index + ')',
+ max >= range.max);
+ } else if (descriptors.isNominal(index)) {
+ // categorical attribute
+ Object[] values = descriptions[index];
+ target.clear();
+ DescriptionUtils.extractNominalValues(description, target);
- assertEquals(values.length, target.size());
- assertTrue(target.containsAll(Arrays.asList(values)));
+ assertEquals(values.length, target.size());
+ assertTrue(target.containsAll(Arrays.asList(values)));
+ }
}
}
}