You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by td...@apache.org on 2013/07/12 04:40:22 UTC
git commit: DRILL-148 - Fix problems due to bit rot. Mahout 0.8 has
migrated a bit from where it started.
Updated Branches:
refs/heads/master 97eb07ac8 -> 5052b64d9
DRILL-148 - Fix problems due to bit rot. Mahout 0.8 has migrated a bit from where it started.
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/5052b64d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/5052b64d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/5052b64d
Branch: refs/heads/master
Commit: 5052b64d9953857575f8f40995b8da05160e5457
Parents: 97eb07a
Author: Ted Dunning <td...@apache.org>
Authored: Thu Jul 11 19:35:13 2013 -0700
Committer: Ted Dunning <td...@apache.org>
Committed: Thu Jul 11 19:35:13 2013 -0700
----------------------------------------------------------------------
.../apache/drill/synth/ChineseRestaurant.java | 118 +++++++++++++++++++
.../java/org/apache/drill/synth/LongTail.java | 9 +-
2 files changed, 121 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/5052b64d/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
new file mode 100644
index 0000000..0288071
--- /dev/null
+++ b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/ChineseRestaurant.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.synth;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.apache.mahout.math.random.Sampler;
+
+import java.util.Random;
+
+/**
+ *
+ * Generates samples from a generalized Chinese restaurant process (or Pitman-Yor process).
+ *
+ * The fraction of unique values drawn exactly once will asymptotically be equal to the discount parameter
+ * as the total number of draws T increases without bound. The number of unique values sampled will
+ * increase as O(alpha * log T) if discount = 0 or O(alpha * T^discount) for discount > 0.
+ */
+public final class ChineseRestaurant implements Sampler<Integer> {
+ private final double alpha;
+ private double weight = 0; // running total of all entries in weights; incremented once per draw
+ private double discount = 0;
+ private final DoubleArrayList weights = new DoubleArrayList(); // weights.get(j) is the current weight (draw count unless overridden) of value j
+ private final Random rand = RandomUtils.getRandom();
+
+ /**
+ * Constructs a Dirichlet process sampler. This is done by setting discount = 0.
+ * @param alpha The strength parameter for the Dirichlet process. Must be greater than 0.
+ */
+ public ChineseRestaurant(double alpha) {
+ this(alpha, 0);
+ }
+
+ /**
+ * Constructs a Pitman-Yor sampler. Rejects alpha <= 0 and any discount outside [0, 1].
+ *
+ * @param alpha The strength parameter that drives the number of unique values as a function of draws.
+ * @param discount The discount parameter that drives the percentage of values that occur once in a large sample.
+ */
+ public ChineseRestaurant(double alpha, double discount) {
+ Preconditions.checkArgument(alpha > 0);
+ Preconditions.checkArgument(discount >= 0 && discount <= 1);
+ this.alpha = alpha;
+ this.discount = discount;
+ }
+
+ public Integer sample() { // returns the index of an already-seen value, or the next unused index for a new value
+ double u = rand.nextDouble() * (alpha + weight); // scale the uniform draw to the total mass alpha + w
+ for (int j = 0; j < weights.size(); j++) {
+ // select existing options with probability (w_j - d) / (alpha + w)
+ if (u < weights.get(j) - discount) {
+ weights.set(j, weights.get(j) + 1);
+ weight++;
+ return j;
+ } else {
+ u -= weights.get(j) - discount; // skip past the mass owned by value j
+ }
+ }
+
+ // if no existing item selected, pick new item with probability (alpha + d*t) / (alpha + w)
+ // where t is the number of pre-existing values, i.e. weights.size() before the add below
+ weights.add(1);
+ weight++;
+ return weights.size() - 1;
+ }
+
+ /**
+ * @return the number of unique values that have been returned.
+ */
+ public int size() {
+ return weights.size();
+ }
+
+ /**
+ * @return the number of draws so far (the total weight truncated to int; setCount also changes it).
+ */
+ public int count() {
+ return (int) weight;
+ }
+
+ /**
+ * @param j Which value to test. Must be non-negative.
+ * @return The number of times that j has been returned so far, or 0 if j has not been seen.
+ */
+ public int count(int j) {
+ Preconditions.checkArgument(j >= 0);
+
+ if (j < weights.size()) {
+ return (int) weights.get(j);
+ } else {
+ return 0;
+ }
+ }
+
+ public void setCount(int term, double count) { // overrides the weight of value term, growing the list as needed
+ while (weights.size() <= term) {
+ weights.add(0); // pad any not-yet-seen values up to term with weight 0
+ }
+ weight += (count - weights.get(term)); // keep the running total in sync with the replaced entry
+ weights.set(term, count);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/5052b64d/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
----------------------------------------------------------------------
diff --git a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
index 1e0d2ef..1a46e52 100644
--- a/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
+++ b/sandbox/prototype/contrib/synth-log/src/main/java/org/apache/drill/synth/LongTail.java
@@ -1,17 +1,14 @@
package org.apache.drill.synth;
import com.google.common.collect.Lists;
-import org.apache.mahout.math.random.ChineseRestaurant;
import org.apache.mahout.math.random.Sampler;
import java.util.List;
/**
- * Created with IntelliJ IDEA.
- * User: tdunning
- * Date: 2/2/13
- * Time: 6:05 PM
- * To change this template use File | Settings | File Templates.
+ * Samples from a set of things based on a long-tailed distribution. This converts
+ * the ChineseRestaurant distribution from a distribution over integers into a distribution
+ * over more plausible looking things like words.
*/
public abstract class LongTail<T> implements Sampler<T> {
private ChineseRestaurant base;