You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by td...@apache.org on 2013/02/05 02:23:56 UTC
[1/5] DRILL-33 - Synthetic Log Generator
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/f04a0fd2/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/TermGeneratorTest.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/TermGeneratorTest.java b/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/TermGeneratorTest.java
new file mode 100644
index 0000000..1bf089d
--- /dev/null
+++ b/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/TermGeneratorTest.java
@@ -0,0 +1,80 @@
+package org.apache.drill.synth;
+
+import com.google.common.base.Function;
+import com.google.common.collect.*;
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.mahout.math.stats.LogLikelihood;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Statistical sanity checks for {@link TermGenerator}: samples are drawn with
+ * fixed parameters (1, 0.8) and the resulting word frequencies are inspected.
+ */
+public class TermGeneratorTest {
+
+  private static final WordGenerator WORDS = new WordGenerator("word-frequency-seed", "other-words");
+
+  /** Draws {@code n} terms from {@code generator} and tallies them in a multiset. */
+  private static Multiset<String> tally(TermGenerator generator, int n) {
+    Multiset<String> tallies = HashMultiset.create();
+    for (int drawn = 0; drawn < n; drawn++) {
+      tallies.add(generator.sample());
+    }
+    return tallies;
+  }
+
+  /**
+   * Draws 10000 terms and checks the shape of the frequency table: some words
+   * repeat, at least one word occurs exactly once, and the most frequent word
+   * (as well as "the") occurs more than 300 times.
+   */
+  @Test
+  public void generateTerms() {
+    Multiset<String> counts = tally(new TermGenerator(WORDS, 1, 0.8), 10000);
+
+    assertEquals(10000, counts.size());
+    assertTrue("Should have some common words", counts.elementSet().size() < 10000);
+    // One frequency per distinct word.
+    List<Integer> frequencies = Lists.newArrayList();
+    for (Multiset.Entry<String> entry : counts.entrySet()) {
+      frequencies.add(entry.getCount());
+    }
+    assertEquals(1, Ordering.natural().min(frequencies).intValue());
+    assertTrue(Ordering.natural().max(frequencies) > 300);
+    assertTrue(counts.count("the") > 300);
+  }
+
+  /**
+   * Compares two independently constructed generators sample by sample. For each
+   * word seen by the first generator, the root log-likelihood ratio of its
+   * counts in the two samples is mapped through the cumulative probability of a
+   * default {@link NormalDistribution}; every tenth value of the sorted result
+   * is printed next to its rank fraction. Nothing is asserted here; the output
+   * is for manual inspection.
+   */
+  @Test
+  public void distinctVocabularies() {
+    // Each generator is sampled before the next one is constructed.
+    Multiset<String> first = tally(new TermGenerator(WORDS, 1, 0.8), 50000);
+    Multiset<String> second = tally(new TermGenerator(WORDS, 1, 0.8), 50000);
+
+    NormalDistribution normal = new NormalDistribution();
+    List<Double> unsorted = Lists.newArrayList();
+    for (String word : first.elementSet()) {
+      int a = first.count(word);
+      int b = second.count(word);
+      double z = LogLikelihood.rootLogLikelihoodRatio(a, 50000 - a, b, 50000 - b);
+      unsorted.add(normal.cumulativeProbability(z));
+    }
+    List<Double> scores = Ordering.natural().sortedCopy(unsorted);
+    int n = scores.size();
+    for (int rank = 0; rank < n; rank += 10) {
+      System.out.printf("%.6f\t%.6f\n", (double) rank / n, scores.get(rank));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/f04a0fd2/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/WordGeneratorTest.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/WordGeneratorTest.java b/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/WordGeneratorTest.java
new file mode 100644
index 0000000..f085f94
--- /dev/null
+++ b/sandbox/prototype/contrib/synth-log/src/test/java/org/apache/drill/synth/WordGeneratorTest.java
@@ -0,0 +1,22 @@
+package org.apache.drill.synth;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/** Checks which indexes of {@link WordGenerator#getString} yield words with a "-digits" suffix. */
+public class WordGeneratorTest {
+  // String.matches() must match the WHOLE word, so the leading ".*" is required to detect a suffix.
+  private static final String NUMBERED_SUFFIX = ".*-[0-9]+";
+  @Test
+  public void checkRealWords() {
+    WordGenerator words = new WordGenerator("word-frequency-seed", "other-words");
+    for (int i = 0; i < 20000; i++) {  // low indexes: no numeric suffix expected
+      assertFalse("index " + i, words.getString(i).matches(NUMBERED_SUFFIX));
+    }
+    for (int i = 200000; i < 201000; i++) {  // high indexes: numeric suffix expected
+      assertTrue("index " + i, words.getString(i).matches(NUMBERED_SUFFIX));
+    }
+  }
+}