You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by td...@apache.org on 2013/07/12 04:40:22 UTC

git commit: DRILL-148 - Fix problems due to bit rot. Mahout 0.8 has migrated a bit from where it started.

Updated Branches:
  refs/heads/master 97eb07ac8 -> 5052b64d9


DRILL-148 - Fix problems due to bit rot.  Mahout 0.8 has migrated a bit from where it started.

Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/5052b64d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/5052b64d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/5052b64d

Branch: refs/heads/master
Commit: 5052b64d9953857575f8f40995b8da05160e5457
Parents: 97eb07a
Author: Ted Dunning <td...@apache.org>
Authored: Thu Jul 11 19:35:13 2013 -0700
Committer: Ted Dunning <td...@apache.org>
Committed: Thu Jul 11 19:35:13 2013 -0700

----------------------------------------------------------------------
 .../apache/drill/synth/ChineseRestaurant.java   | 118 +++++++++++++++++++
 .../java/org/apache/drill/synth/LongTail.java   |   9 +-
 2 files changed, 121 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/5052b64d/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
new file mode 100644
index 0000000..0288071
--- /dev/null
+++ b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.synth;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.apache.mahout.math.random.Sampler;
+
+import java.util.Random;
+
+/**
+ *
+ * Generates samples from a generalized Chinese restaurant process (or Pitman-Yor process).
+ *
+ * The fraction of distinct values drawn exactly once will asymptotically be equal to the discount parameter
+ * as the total number of draws T increases without bound.  The number of unique values sampled will
+ * increase as O(alpha * log T) if discount = 0 or O(alpha * T^discount) for discount > 0.
+ */
+public final class ChineseRestaurant implements Sampler<Integer> {
+    private final double alpha;
+    private double weight = 0; // running sum of all entries in weights
+    private double discount = 0;
+    private final DoubleArrayList weights = new DoubleArrayList(); // weights.get(j) is the count recorded for value j
+    private final Random rand = RandomUtils.getRandom();
+
+    /**
+     * Constructs a Dirichlet process sampler.  This is done by setting discount = 0.
+     * @param alpha  The strength parameter for the Dirichlet process.  Must be positive.
+     */
+    public ChineseRestaurant(double alpha) {
+        this(alpha, 0);
+    }
+
+    /**
+     * Constructs a Pitman-Yor sampler.
+     *
+     * @param alpha     The strength parameter that drives the number of unique values as a function of draws.  Must be positive.
+     * @param discount  The discount parameter that drives the percentage of values that occur once in a large sample.  Must be in [0, 1].
+     */
+    public ChineseRestaurant(double alpha, double discount) {
+        Preconditions.checkArgument(alpha > 0);
+        Preconditions.checkArgument(discount >= 0 && discount <= 1);
+        this.alpha = alpha;
+        this.discount = discount;
+    }
+
+    public Integer sample() {
+        double u = rand.nextDouble() * (alpha + weight);
+        for (int j = 0; j < weights.size(); j++) {
+            // select existing value j with probability (w_j - d) / (alpha + w)
+            if (u < weights.get(j) - discount) {
+                weights.set(j, weights.get(j) + 1);
+                weight++;
+                return j;
+            } else {
+                u -= weights.get(j) - discount;
+            }
+        }
+
+        // if no existing item selected, pick new item with probability (alpha + d*t) / (alpha + w)
+        // where t is the number of distinct values recorded so far (weights.size())
+        weights.add(1);
+        weight++;
+        return weights.size() - 1;
+    }
+
+    /**
+     * @return the number of unique values that have been returned.
+     */
+    public int size() {
+        return weights.size();
+    }
+
+    /**
+     * @return the number of draws so far (the running total weight, truncated to an int).
+     */
+    public int count() {
+        return (int) weight;
+    }
+
+    /**
+     * @param j Which value to test.  Must be non-negative.
+     * @return  The number of times that j has been returned so far, or 0 if j is beyond the values recorded so far.
+     */
+    public int count(int j) {
+        Preconditions.checkArgument(j >= 0);
+
+        if (j < weights.size()) {
+            return (int) weights.get(j);
+        } else {
+            return 0;
+        }
+    }
+
+    public void setCount(int term, double count) {
+        while (weights.size() <= term) { // pad with zero counts until index term exists
+            weights.add(0);
+        }
+        weight += (count - weights.get(term)); // adjust the running total by the change in this term's count
+        weights.set(term, count);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/5052b64d/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
index 1e0d2ef..1a46e52 100644
--- a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
+++ b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
@@ -1,17 +1,14 @@
 package org.apache.drill.synth;
 
 import com.google.common.collect.Lists;
-import org.apache.mahout.math.random.ChineseRestaurant;
 import org.apache.mahout.math.random.Sampler;
 
 import java.util.List;
 
 /**
- * Created with IntelliJ IDEA.
- * User: tdunning
- * Date: 2/2/13
- * Time: 6:05 PM
- * To change this template use File | Settings | File Templates.
+ * Samples from a set of things based on a long-tailed distribution.  This converts
+ * the ChineseRestaurant distribution from a distribution over integers into a distribution
+ * over more plausible looking things like words.
  */
 public abstract class LongTail<T> implements Sampler<T> {
     private ChineseRestaurant base;