You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/05/14 19:01:14 UTC
[2/2] git commit: DATAFU-37 Adding LSH implementation.
DATAFU-37 Adding LSH implementation.
https://issues.apache.org/jira/browse/DATAFU-37
Signed-off-by: Matt Hayes <mh...@linkedin.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/544e5af0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/544e5af0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/544e5af0
Branch: refs/heads/master
Commit: 544e5af08c81671e8f14832fc3657de696cd313b
Parents: e15390f
Author: cstella <ce...@gmail.com>
Authored: Tue May 13 12:04:08 2014 -0400
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Wed May 14 10:00:28 2014 -0700
----------------------------------------------------------------------
.../datafu/pig/hash/lsh/CosineDistanceHash.java | 185 +++++++
.../java/datafu/pig/hash/lsh/L1PStableHash.java | 198 ++++++++
.../java/datafu/pig/hash/lsh/L2PStableHash.java | 202 ++++++++
.../java/datafu/pig/hash/lsh/LSHFamily.java | 66 +++
.../main/java/datafu/pig/hash/lsh/LSHFunc.java | 149 ++++++
.../java/datafu/pig/hash/lsh/RepeatingLSH.java | 82 ++++
.../pig/hash/lsh/cosine/HyperplaneLSH.java | 88 ++++
.../pig/hash/lsh/cosine/package-info.java | 24 +
.../datafu/pig/hash/lsh/interfaces/LSH.java | 71 +++
.../pig/hash/lsh/interfaces/LSHCreator.java | 117 +++++
.../datafu/pig/hash/lsh/interfaces/Sampler.java | 39 ++
.../pig/hash/lsh/interfaces/package-info.java | 23 +
.../java/datafu/pig/hash/lsh/metric/Cosine.java | 68 +++
.../java/datafu/pig/hash/lsh/metric/L1.java | 57 +++
.../java/datafu/pig/hash/lsh/metric/L2.java | 55 +++
.../datafu/pig/hash/lsh/metric/MetricUDF.java | 170 +++++++
.../pig/hash/lsh/metric/package-info.java | 24 +
.../AbstractStableDistributionFunction.java | 119 +++++
.../datafu/pig/hash/lsh/p_stable/L1LSH.java | 60 +++
.../datafu/pig/hash/lsh/p_stable/L2LSH.java | 61 +++
.../pig/hash/lsh/p_stable/package-info.java | 27 +
.../java/datafu/pig/hash/lsh/package-info.java | 23 +
.../datafu/pig/hash/lsh/util/DataTypeUtil.java | 178 +++++++
.../datafu/pig/hash/lsh/util/package-info.java | 23 +
.../datafu/test/pig/hash/lsh/LSHPigTest.java | 492 +++++++++++++++++++
.../java/datafu/test/pig/hash/lsh/LSHTest.java | 268 ++++++++++
26 files changed, 2869 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
new file mode 100644
index 0000000..1664362
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.cosine.HyperplaneLSH;
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data.
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ *
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ * into the same buckets. Each LSH maps a vector onto one side or the other of a random hyperplane, thereby producing a single
+ * bit as the hash value. Multiple, independent, hashes can be run on the same input and aggregated together to form a more
+ * broad domain than a single bit.
+ *
+ * For more information, see Charikar, Moses S.. (2002). "Similarity Estimation Techniques from Rounding Algorithms". Proceedings of the 34th Annual ACM Symposium on Theory of Computing 2002.
+ *
+ *
+ */
+public class CosineDistanceHash extends LSHFunc
+{
+ int dim;
+ long seed;
+ int repeat;
+ int numHashes;
+
+ /**
+ * Locality sensitive hash that maps vectors onto 0,1 in such a way that colliding
+ * vectors are "near" one another according to cosine similarity with high probability.
+ *
+ * <p>
+ * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+ * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+ *
+ * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+ * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+ * (i.e. the number of vectors within a distance according to cosine similarity).
+ *
+ * <p>
+ * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+ * and find the nearest neighbors of the query points:
+ * <pre>
+ * -- Create a CosineDistanceHash of
+ * -- 3 dimensional data
+ * -- 1500 internal hashes (being combined into one hash)
+ * -- family of 5 hashes
+ * -- with a seed of 0
+ *
+ * -- This creates a bag of tuples:
+ * -- lsh_id:Integer the family ID (in this case, 0-4)
+ * -- hash:Long the hash
+ *
+ * define LSH datafu.pig.hash.lsh.CosineDistanceHash('3', '1500', '5', '0');
+ * define METRIC datafu.pig.hash.lsh.metric.L2();
+ *
+ * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+ *
+ * --hash the input points
+ * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+ *
+ * -- the hash family ID and the hash should group the input points into partitions
+ * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+ *
+ * -- take in the query points and hash them
+ * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+ * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+ * ;
+ *
+ * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+ * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+ *
+ * -- Now, use the appropriate METRIC UDF (in this case Cosine distance) to find the first point within
+ * -- a parameterized threshold (in this case, .001). It takes:
+ * -- query_pt:Tuple the query point
+ * -- threshold:Double the threshold, so that if the distance between the query point and a point
+ * -- in the partition is less than this threshold, it returns the point (and stops searching)
+ * -- partition:Bag The bag of tuples in the partition.
+ *
+ * tuples from
+ * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+ * , METRIC(query_pt, .001, PTS_HASHED) as neighbor
+ * ;
+ * describe NEAR_NEIGHBORS;
+ * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,lsh::lsh_id: int
+ * -- ,lsh::hash: long
+ * -- )
+ * -- }
+ *
+ * -- project out the query and the matching point
+ * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+ * generate query_pt as query_pt, neighbor.pt as matching_pts;
+ * };
+ *
+ * -- Filter out the hashes which resulted in no matches
+ * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+ *
+ * -- group by the query
+ * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+ * describe NEIGHBORS_GRP;
+ *
+ * -- Generate the query, the number of matches and the bag of matching points
+ * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+ * MATCHING_PTS = foreach NOT_NULL generate FLATTEN(matching_pts);
+ * DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+ * generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+ * };
+ * describe NEIGHBOR_CNT;
+ * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,long
+ * -- ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+ * -- }
+ * -- }
+ * STORE NEIGHBOR_CNT INTO 'neighbors';
+ * </pre>
+ *
+ *
+ *
+ * @param sDim Dimension of the vectors
+ * @param sRepeat Number of internal repetitions
+ * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+ * @param sSeed Seed to use when constructing LSH family
+ */
+ public CosineDistanceHash(String sDim, String sRepeat, String sNumHashes, String sSeed)
+ {
+ super(sSeed);
+ dim = Integer.parseInt(sDim);
+ repeat = Integer.parseInt(sRepeat);
+ numHashes = Integer.parseInt(sNumHashes);
+ }
+
+ public CosineDistanceHash(String sDim, String sRepeat, String sNumHashes)
+ {
+ this(sDim, sRepeat, sNumHashes, null);
+ }
+
+ @Override
+ protected LSHCreator createLSHCreator() {
+ return new LSHCreator(dim, numHashes, repeat, getSeed())
+ {
+
+ @Override
+ protected LSH constructLSH(RandomGenerator rg) throws MathException {
+ return new HyperplaneLSH(dim, rg );
+ }
+
+ };
+ }
+
+ @Override
+ protected int getDimension() {
+ return dim;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
new file mode 100644
index 0000000..2dd42cd
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.p_stable.L1LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data.
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ *
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to the {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1</a>}
+ * distance metric into the same buckets. This implementation uses a 1-stable distribution (a Cauchy distribution) in order
+ * to accomplish this.
+ *
+ * For more information, see Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ *
+ */
+public class L1PStableHash extends LSHFunc
+{
+ int dim;
+ double w;
+
+ int repeat;
+ int numHashes;
+
+ /**
+ * Locality sensitive hash that maps vectors onto a long in such a way that colliding
+ * vectors are "near" one another according to cosine similarity with high probability.
+ *
+ * <p>
+ * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+ * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+ *
+ * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+ * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+ * (i.e. the number of vectors within a distance according to cosine similarity).
+ *
+ * This UDF, indeed all p-stable LSH functions, are parameterized with a quantization parameter (w or r in the literature
+ * , depending on where you look). Consider the following excerpt from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ *
+ * <pre>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
+ * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+ * and in this way decrease the number of projections we need to make.
+ * </pre>
+ *
+ * In the literature, the quantization parameter (or width of the projection) is found empirically given a sample of
+ * the data and the likely threshold of for the metric. Tuning this parameter is very important for the performance
+ * of this algorithm.
+ *
+ * <p>
+ * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+ * and find the nearest neighbors of the query points:
+ * <pre>
+ * -- Create a L1PStableHash of
+ * -- 3 dimensional data
+ * -- projection width of 150
+ * -- 1 internal hashes
+ * -- family of 5 hashes
+ * -- with a seed of 0
+ *
+ * -- This creates a bag of tuples:
+ * -- lsh_id:Integer the family ID (in this case, 0-4)
+ * -- hash:Long the hash
+ *
+ * define LSH datafu.pig.hash.lsh.L1PStableHash('3', '150', '1', '5', '0');
+ * define METRIC datafu.pig.hash.lsh.metric.L1();
+ *
+ * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+ *
+ * --hash the input points
+ * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+ *
+ * -- the hash family ID and the hash should group the input points into partitions
+ * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+ *
+ * -- take in the query points and hash them
+ * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+ * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+ * ;
+ *
+ * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+ * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+ *
+ * -- Now, use the appropriate METRIC UDF (in this case L1 (aka city block) distance) to find the first point within
+ * -- a parameterized threshold (in this case, 1000). It takes:
+ * -- query_pt:Tuple the query point
+ * -- threshold:Double the threshold, so that if the distance between the query point and a point
+ * -- in the partition is less than this threshold, it returns the point (and stops searching)
+ * -- partition:Bag The bag of tuples in the partition.
+ *
+ *
+ * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+ * , METRIC(query_pt, 1000, PTS_HASHED) as neighbor
+ * ;
+ * describe NEAR_NEIGHBORS;
+ * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,lsh::lsh_id: int
+ * -- ,lsh::hash: long
+ * -- )
+ * -- }
+ *
+ * -- project out the query and the matching point
+ * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+ * generate query_pt as query_pt, neighbor.pt as matching_pts;
+ * };
+ *
+ * -- Filter out the hashes which resulted in no matches
+ * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+ *
+ * -- group by the query
+ * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+ * describe NEIGHBORS_GRP;
+ *
+ * -- Generate the query, the number of matches and the bag of matching points
+ * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+ * DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+ * generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+ * };
+ * describe NEIGHBOR_CNT;
+ * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,long
+ * -- ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+ * -- }
+ * -- }
+ * STORE NEIGHBOR_CNT INTO 'neighbors';
+ * </pre>
+ *
+ *
+ *
+ * @param sDim Dimension of the vectors
+ * @param sW A double representing the quantization parameter (also known as the projection width)
+ * @param sRepeat Number of internal repetitions (generally this should be 1 as the p-stable hashes have a larger range than one bit)
+ * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+ * @param sSeed Seed to use when constructing LSH family
+ */
+ public L1PStableHash(String sDim, String sW, String sRepeat, String sNumHashes, String sSeed)
+ {
+ super(sSeed);
+ dim = Integer.parseInt(sDim);
+ w = Double.parseDouble(sW);
+ repeat = Integer.parseInt(sRepeat);
+ numHashes = Integer.parseInt(sNumHashes);
+ }
+
+ public L1PStableHash(String sDim, String sW, String sRepeat, String sNumHashes)
+ {
+ this(sDim, sW, sRepeat, sNumHashes, null);
+ }
+
+ @Override
+ protected LSHCreator createLSHCreator() {
+ return new LSHCreator(dim, numHashes, repeat, getSeed())
+ {
+
+ @Override
+ protected LSH constructLSH(RandomGenerator rg) throws MathException {
+ return new L1LSH(dim, w, rg );
+ }
+
+ };
+
+ }
+ @Override
+ protected int getDimension() {
+ return dim;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
new file mode 100644
index 0000000..588f199
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.p_stable.L2LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data.
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ *
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to the {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2</a>}
+ * distance metric into the same buckets. This implementation uses a 2-stable distribution (a Gaussian distribution) in order
+ * to accomplish this.
+ *
+ * For more information, see Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ *
+ */
+public class L2PStableHash extends LSHFunc{
+ int dim;
+ double w;
+ long seed;
+ int repeat;
+ int numHashes;
+
+ /**
+ * Locality sensitive hash that maps vectors onto a long in such a way that colliding
+ * vectors are "near" one another according to cosine similarity with high probability.
+ *
+ * <p>
+ * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+ * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+ *
+ * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+ * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+ * (i.e. the number of vectors within a distance according to cosine similarity).
+ *
+ * This UDF, indeed all p-stable LSH functions are parameterized with a quantization parameter (w or r in the literature
+ * , depending on where you look). Consider the following excerpt from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ *
+ * <pre>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
+ * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+ * and in this way decrease the number of projections we need to make.
+ * </pre>
+ *
+ * In the literature, the quantization parameter (or width of the projection) is found empirically given a sample of
+ * the data and the likely threshold of for the metric. Tuning this parameter is very important for the performance
+ * of this algorithm.
+ *
+ * <p>
+ * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+ * and find the nearest neighbors of the query points:
+ * <pre>
+ * -- Create a L2PStableHash of
+ * -- 3 dimensional data
+ * -- projection width of 200
+ * -- 1 internal hashes
+ * -- family of 5 hashes
+ * -- with a seed of 0
+ *
+ * -- This creates a bag of tuples:
+ * -- lsh_id:Integer the family ID (in this case, 0-4)
+ * -- hash:Long the hash
+ *
+ * define LSH datafu.pig.hash.lsh.L2PStableHash('3', '200', '1', '5', '0');
+ * define METRIC datafu.pig.hash.lsh.metric.L2();
+ *
+ * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+ *
+ * --hash the input points
+ * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+ *
+ * -- the hash family ID and the hash should group the input points into partitions
+ * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+ *
+ * -- take in the query points and hash them
+ * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+ * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+ * , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+ * ;
+ *
+ * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+ * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+ *
+ * -- Now, use the appropriate METRIC UDF (in this case L2 (aka Euclidean) distance) to find the first point within
+ * -- a parameterized threshold (in this case, 1000). It takes:
+ * -- query_pt:Tuple the query point
+ * -- threshold:Double the threshold, so that if the distance between the query point and a point
+ * -- in the partition is less than this threshold, it returns the point (and stops searching)
+ * -- partition:Bag The bag of tuples in the partition.
+ *
+ * tuples from
+ * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+ * , METRIC(query_pt, 1000, PTS_HASHED) as neighbor
+ * ;
+ * describe NEAR_NEIGHBORS;
+ * describe NEAR_NEIGHBORS;
+ * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,lsh::lsh_id: int
+ * -- ,lsh::hash: long
+ * -- )
+ * -- }
+ *
+ * -- project out the query and the matching point
+ * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+ * generate query_pt as query_pt, neighbor.pt as matching_pts;
+ * };
+ *
+ * -- Filter out the hashes which resulted in no matches
+ * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+ *
+ * -- group by the query
+ * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+ * describe NEIGHBORS_GRP;
+ *
+ * -- Generate the query, the number of matches and the bag of matching points
+ * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+ * MATCHING_PTS = foreach NOT_NULL generate FLATTEN(matching_pts);
+ * DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+ * generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+ * };
+ * describe NEIGHBOR_CNT;
+ * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+ * -- ,long
+ * -- ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+ * -- }
+ * -- }
+ * STORE NEIGHBOR_CNT INTO 'neighbors';
+ * </pre>
+ *
+ *
+ *
+ * @param sDim Dimension of the vectors
+ * @param sW A double representing the quantization parameter (also known as the projection width)
+ * @param sRepeat Number of internal repetitions (generally this should be 1 as the p-stable hashes have a larger range than one bit)
+ * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+ * @param sSeed Seed to use when constructing LSH family
+ */
+ public L2PStableHash(String sDim, String sW, String sRepeat, String sNumHashes, String sSeed)
+ {
+ super(sSeed);
+ dim = Integer.parseInt(sDim);
+ w = Double.parseDouble(sW);
+ repeat = Integer.parseInt(sRepeat);
+ numHashes = Integer.parseInt(sNumHashes);
+ }
+
+ public L2PStableHash(String sDim, String sW, String sRepeat, String sNumHashes)
+ {
+ this(sDim, sW, sRepeat, sNumHashes, null);
+ }
+
+
+ @Override
+ protected LSHCreator createLSHCreator() {
+ return new LSHCreator(dim, numHashes, repeat, getSeed())
+ {
+
+ @Override
+ protected LSH constructLSH(RandomGenerator rg) throws MathException {
+ return new L2LSH(dim, w, rg );
+ }
+
+ };
+ }
+
+
+ @Override
+ protected int getDimension() {
+ return dim;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
new file mode 100644
index 0000000..394109c
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.util.List;
+
+import org.apache.commons.math.linear.RealVector;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * A family of k locality sensitive hashes. For a given point, k hashes will be computed.
+ * @author cstella
+ *
+ */
+public class LSHFamily {
+
+ private List<LSH> hashes;
+ /**
+ * Construct a family of hashes
+ * @param hashes Hashes which will be applied in turn to a given point
+ */
+ public LSHFamily(List<LSH> hashes)
+ {
+ this.hashes = hashes;
+ }
+
+ /**
+ * Compute the family of k-hashes for a vector.
+ *
+ * @param vector
+ * @return An iterable of hashes
+ */
+ public Iterable<Long> apply(final RealVector vector) {
+ return Iterables.transform(hashes, new Function<LSH, Long>()
+ {
+ @Override
+ public Long apply(LSH lsh)
+ {
+ return lsh.apply(vector);
+ }
+ }
+ );
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
new file mode 100644
index 0000000..a7ab070
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+import org.apache.pig.impl.util.UDFContext;
+
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.util.DataTypeUtil;
+
+/**
+ * The base UDF for locality sensitive hashing.
+ * @author cstella
+ *
+ */
+public abstract class LSHFunc extends EvalFunc<DataBag>
+{
+ protected LSHFamily lsh = null;
+ protected Long seed;
+ private TupleFactory mTupleFactory = TupleFactory.getInstance();
+ private BagFactory mBagFactory = BagFactory.getInstance();
+ private LSHCreator lshCreator = null;
+ protected abstract LSHCreator createLSHCreator();
+ protected abstract int getDimension();
+
+ public LSHFunc(String sSeed)
+ {
+ if(sSeed == null)
+ {
+ seed = null;
+ }
+ else
+ {
+ seed = Long.parseLong(sSeed);
+ }
+ }
+ /**
+ * The expectation is that the input is a Tuple of numeric data representing a real vector.
+ *
+ * @return DataBag containing tuples which have the lsh_id (the hash ID in the family) and the hash
+ */
+ @Override
+ public DataBag exec(Tuple t) throws IOException
+ {
+ if (lsh == null) {
+ try {
+ lshCreator = createLSHCreator();
+ lsh = lshCreator.constructFamily(lshCreator.createGenerator());
+ } catch (MathException e) {
+ throw new RuntimeException("Unable to construct LSH!", e);
+ }
+ }
+
+ RealVector r = null;
+ try {
+ r = DataTypeUtil.INSTANCE.convert(t, lshCreator.getDim());
+ } catch (ExecException e) {
+ throw new IllegalStateException("Unable to convert tuple: "
+ + t.toString() + " to RealVector");
+ }
+ DataBag ret = mBagFactory.newDefaultBag();
+
+ int idx = 0;
+ for(Long hash : lsh.apply(r))
+ {
+ Tuple out = mTupleFactory.newTuple(2);
+ out.set(0, idx++);
+ out.set(1, hash);
+ ret.add(out);
+ }
+ return ret;
+ }
+
+ protected long getSeed()
+ {
+ if(seed == null)
+ {
+ UDFContext context = UDFContext.getUDFContext();
+ return Long.parseLong(context.getUDFProperties(this.getClass()).getProperty("seed"));
+ }
+ else
+ {
+ return seed;
+ }
+ }
+
+ /**
+ * The schema returned here is a bag containing pairs.
+ * The pairs are the lsh_id (or the ID of the hash) and the hash value.
+ */
+ public Schema outputSchema(Schema input) {
+ try {
+ validateInputSchema(input);
+ long randomSeed = new Random().nextLong();
+ UDFContext context = UDFContext.getUDFContext();
+ context.getUDFProperties(this.getClass()).setProperty("seed", "" + randomSeed);
+ Schema bagSchema = new Schema();
+ bagSchema.add(new Schema.FieldSchema("lsh_id", DataType.INTEGER));
+ bagSchema.add(new Schema.FieldSchema("hash", DataType.LONG));
+ return new Schema(new Schema.FieldSchema("lsh", bagSchema, DataType.BAG));
+ }catch (Exception e){
+ throw new RuntimeException("Unable to create output schema", e);
+ }
+ }
+ /**
+ * Validate the input schema to ensure that our input is consistent and that we fail fast.
+ * @param input
+ * @throws FrontendException
+ */
+ private void validateInputSchema(Schema input) throws FrontendException
+ {
+ FieldSchema vectorSchema = input.getField(0);
+ if(!DataTypeUtil.isValidVector(vectorSchema, getDimension()))
+ {
+ throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
new file mode 100644
index 0000000..7f59e83
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.util.List;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.ArrayRealVector;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomData;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * A composite hash which takes multiple hashes and composes them. This is useful
+ * when a locality sensitive hash has a range that is very narrow and would want a wider
+ * range for the hash.
+ *
+ *
+ * @author cstella
+ *
+ */
+public class RepeatingLSH extends LSH
+{
+ private List<LSH> lshList;
+ private RealVector randomVec;
+
+ public RepeatingLSH(List<LSH> lshList) throws MathException
+ {
+ super(lshList.get(0).getDim(), lshList.get(0).getRandomGenerator());
+ this.lshList = lshList;
+ RandomGenerator rg = lshList.get(0).getRandomGenerator();
+ RandomData rd = new RandomDataImpl(rg);
+ /*
+ * Compute a random vector of lshList.size() with each component taken from U(0,10)
+ */
+ randomVec = new ArrayRealVector(lshList.size());
+ for(int i = 0; i < randomVec.getDimension();++i)
+ {
+ randomVec.setEntry(i, rd.nextUniform(0, 10.0));
+ }
+ }
+
+
+ /**
+ * Compute the aggregated hash of a vector. This is done by taking the output of the k hashes as a
+ * vector and computing the dot product with a vector of uniformly distributed components between 0 and 10.
+ *
+ * So, consider a_{0,k} such that a_i ~ U(0,10) and hash functions h_{0,k} treated as 2 k-dimensional vectors,
+ * we return the dot product of h(v) and a.
+ *
+ */
+ public long apply(RealVector vector) {
+ long res = 0;
+
+ for(int i = 0;i < lshList.size();++i)
+ {
+ res += randomVec.getEntry(i)* lshList.get(i).apply(vector);
+ }
+ return res;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
new file mode 100644
index 0000000..deff98e
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.cosine;
+
+import org.apache.commons.math.linear.ArrayRealVector;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomGenerator;
+import org.apache.commons.math.random.UnitSphereRandomVectorGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data.
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ *
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ * into the same buckets. Each LSH maps a vector onto one side or the other of a random hyperplane, thereby producing a single
+ * bit as the hash value. Multiple, independent, hashes can be run on the same input and aggregated together to form a more
+ * broad domain than a single bit.
+ *
+ * For more information, see Charikar, Moses S.. (2002). "Similarity Estimation Techniques from Rounding Algorithms". Proceedings of the 34th Annual ACM Symposium on Theory of Computing 2002.
+ *
+ *
+ */
+public class HyperplaneLSH extends LSH
+{
+
+
+ private RealVector r;
+
+ /**
+ * Locality sensitive hash that maps vectors onto 0,1 in such a way that colliding
+ * vectors are "near" one another according to cosine similarity with high probability.
+ *
+ * <p>
+ * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+ * This repetition is accomplished by wrapping instances of the LSH in a LSHFamily, which does the combination.
+ *
+ * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+ * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+ * (i.e. the number of vectors within a distance according to cosine similarity).
+ */
+ public HyperplaneLSH(int dim, RandomGenerator rg)
+ {
+ super(dim, rg);
+
+ UnitSphereRandomVectorGenerator generator = new UnitSphereRandomVectorGenerator(dim, rg);
+ //compute our vector representing a hyperplane of dimension dim by taking a random vector
+ //located on the unit sphere
+ double[] normalVector = generator.nextVector();
+ r = new ArrayRealVector(normalVector);
+ }
+
+
+ /**
+ * Compute which side of the hyperplane that the parameter is on.
+ *
+ * @param vector The vector to test.
+ * @return one if the dot product with the hyperplane is positive, 0 if negative.
+ */
+ public long apply(RealVector vector)
+ {
+ return r.dotProduct(vector) >= 0?1:0;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
new file mode 100644
index 0000000..2e44920
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}
+ * for {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ */
+package datafu.pig.hash.lsh.cosine;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
new file mode 100644
index 0000000..69d4043
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomGenerator;
+
+/**
+ * An abstract class representing a locality sensitive hash. From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data.
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * @author cstella
+ *
+ */
+public abstract class LSH
+{
+ protected RandomGenerator rg;
+ protected int dim;
+
+ /**
+ * Construct a locality sensitive hash. Note, one may pass a pre-seeded generator.
+ *
+ * @param dim The dimension of the vectors which are to be hashed
+ * @param rg The random generator to use internally.
+ */
+ public LSH(int dim, RandomGenerator rg)
+ {
+ this.dim = dim;
+ this.rg = rg;
+ }
+
+ /**
+ *
+ * @return The random generator from this LSH
+ */
+ public RandomGenerator getRandomGenerator() { return rg;}
+ /**
+ *
+ * @return The dimension of the vectors which this LSh supports
+ */
+ public int getDim() { return dim; }
+
+ /**
+ * Hash a vector.
+ *
+ * @param vector A vector to be hashed
+ * @return A hash which collides with vectors close according to some metric (implementation dependent).
+ */
+ public abstract long apply(RealVector vector);
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
new file mode 100644
index 0000000..e3dc5b4
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.JDKRandomGenerator;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.LSHFamily;
+import datafu.pig.hash.lsh.RepeatingLSH;
+
+/**
+ * Create a Locality sensitive hash.
+ *
+ * @author cstella
+ *
+ */
+public abstract class LSHCreator
+{
+ private int dim;
+ private int numHashes;
+ private int numInternalRepetitions;
+ private long seed;
+
+ /**
+ * Create a LSHCreator
+ *
+ * @param dim The dimension of the vectors which the LSH will hash
+ * @param numHashes The number of locality sensitive hashes to create
+ * @param numInternalRepetitions Each locality sensitive hash is a composite of numInternalRepetitions LSHes (this is done to increase range of the LSH)
+ * @param seed The seed to use
+ */
+ public LSHCreator(int dim, int numHashes, int numInternalRepetitions, long seed)
+ {
+ this.dim = dim;
+ this.numHashes = numHashes;
+ this.numInternalRepetitions = numInternalRepetitions;
+ this.seed = seed;
+ }
+ protected abstract LSH constructLSH(RandomGenerator rg) throws MathException;
+ public int getDim() { return dim;}
+ public RandomGenerator createGenerator()
+ {
+ RandomGenerator rg = new JDKRandomGenerator();
+ rg.setSeed(seed);
+ return rg;
+ }
+
+ /**
+ *
+ * @return The number of locality sensitive hashes to create
+ */
+ public int getNumHashes()
+ {
+ return numHashes;
+ }
+ /**
+ * Each locality sensitive hash is a composite of numInternalRepetitions LSHes (this is done to increase range of the LSH)
+ * @return The number of internal repetitions
+ */
+ public int getNumInternalRepetitions()
+ {
+ return numInternalRepetitions;
+ }
+
+
+ /**
+ *
+ * @param rg The random generator to use when constructing the family
+ * @return The family of locality sensitive hashes
+ * @throws MathException
+ */
+ public LSHFamily constructFamily(RandomGenerator rg) throws MathException
+ {
+ List<LSH> hashes = new ArrayList<LSH>();
+ for(int i = 0;i < getNumHashes();++i)
+ {
+ LSH lsh = null;
+ if(getNumInternalRepetitions() == 1)
+ {
+ // in the situation of 1, we don't do a composite of 1..we just pass the raw LSH back
+ lsh = constructLSH(rg);
+ }
+ else
+ {
+ List<LSH> lshFamily = new ArrayList<LSH>();
+ for(int j = 0;j < getNumInternalRepetitions();++j)
+ {
+ lshFamily.add(constructLSH(rg));
+ }
+ lsh = new RepeatingLSH(lshFamily);
+ }
+ hashes.add(lsh);
+ }
+ return new LSHFamily(hashes);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
new file mode 100644
index 0000000..0c57a0d
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+
+/**
+ * A helper interface to sample from a distribution specified by a RandomDataImpl
+ * @author cstella
+ *
+ */
+public interface Sampler {
+ /**
+ * Generate a sample
+ *
+ * @param randomData The distribution used
+ * @return A sample
+ * @throws MathException
+ */
+ public double sample(RandomDataImpl randomData) throws MathException ;
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
new file mode 100644
index 0000000..5357473
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Interfaces used in the implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}.
+ */
+package datafu.pig.hash.lsh.interfaces;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
new file mode 100644
index 0000000..22222a1
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * It returns one of the tuples of the bag of vectors. The metric used is
+ * {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>},
+ * which technically does not form a metric, but I'm stretching the definition here.
+ *
+ * @see datafu.pig.hash.lsh.CosineDistanceHash CosineDistanceHash for an example
+ * @author cstella
+ *
+ */
+public class Cosine extends MetricUDF {
+
+ /**
+ * Create a new Cosine Metric UDF with a given dimension.
+ *
+ * @param sDim
+ */
+ public Cosine(String sDim) {
+ super(sDim);
+ }
+
+ /**
+ * Cosine similarity.
+ * @param v1
+ * @param v2
+ * @return The cosine of the angle between the vectors
+ */
+ public static double distance(RealVector v1, RealVector v2) {
+ return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
+ }
+
+ /**
+ * Cosine similarity.
+ * @param v1
+ * @param v2
+ * @return Roughly the cosine of the angle between the vectors
+ */
+ @Override
+ protected double dist(RealVector v1, RealVector v2) {
+ return distance(v1, v2);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
new file mode 100644
index 0000000..311c2ed
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Taxicab_geometry" target="_blank">L1 distance</a>},
+ * distance between two vectors. This is otherwise known as
+ * the manhattan distance, taxicab distance or city block distance.
+ *
+ * @see datafu.pig.hash.lsh.L1PStableHash L1PStableHash for an example
+ * @author cstella
+ *
+ */
+public class L1 extends MetricUDF {
+
+ /**
+ * Create a new L1 Metric UDF with a given dimension.
+ *
+ * @param sDim
+ */
+ public L1(String sDim) {
+ super(sDim);
+
+ }
+
+ public static double distance(RealVector v1, RealVector v2) {
+ return v1.getL1Distance(v2);
+ }
+
+ @Override
+ protected double dist(RealVector v1, RealVector v2) {
+ return distance(v1, v2);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
new file mode 100644
index 0000000..d01f9fb
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>},
+ * distance between two vectors. This is otherwise known as
+ * the Euclidean distance.
+ *
+ * @see datafu.pig.hash.lsh.L2PStableHash L2PStableHash for an example
+ * @author cstella
+ *
+ */
+public class L2 extends MetricUDF {
+
+ /**
+ * Create a new L2 Metric UDF with a given dimension.
+ *
+ * @param sDim
+ */
+ public L2(String sDim) {
+ super(sDim);
+
+ }
+
+ public static double distance(RealVector v1, RealVector v2) {
+ return v1.getDistance(v2);
+ }
+
+ @Override
+ protected double dist(RealVector v1, RealVector v2) {
+ return distance(v1, v2);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
new file mode 100644
index 0000000..da00a60
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import java.io.IOException;
+
+import org.apache.commons.math.linear.RealVector;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+
+import datafu.pig.hash.lsh.util.DataTypeUtil;
+
+/**
+ * A base UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+ *
+ * @see datafu.pig.hash.lsh.CosineDistanceHash
+ * @author cstella
+ *
+ */
+public abstract class MetricUDF extends EvalFunc<Tuple>
+{
+ protected int dim;
+
+ /**
+ * Create a new Metric UDF with a given dimension.
+ *
+ * @param sDim
+ */
+ public MetricUDF(String sDim)
+ {
+ dim = Integer.parseInt(sDim);
+ }
+
+ /**
+ * The distance metric used. Given v1 and v2, compute the distance between those vectors.
+ * @param v1 vector
+ * @param v2 vector
+ * @return the distance between v1 and v2
+ */
+ protected abstract double dist(RealVector v1, RealVector v2);
+
+ /**
+ * This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors.
+ * Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value
+ * in the case of sparse vectors.
+ *
+ * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+ *
+ * @see datafu.pig.hash.lsh.CosineDistanceHash
+ */
+ @Override
+ public Tuple exec(Tuple input) throws IOException {
+ Object firstElement = input.get(0);
+ double distanceRange = ((Number)input.get(1)).doubleValue();
+ DataBag vectorBag = (DataBag)input.get(2);
+ RealVector referenceVector = null;
+ if(firstElement instanceof Tuple)
+ {
+ //in which case the first element is a non-sparse tuple
+ referenceVector = DataTypeUtil.INSTANCE.convert((Tuple)firstElement, dim);
+ }
+ else {
+ //in which case the first element is a bag, representing a sparse tuple
+ referenceVector = DataTypeUtil.INSTANCE.convert(input, dim);
+ }
+
+ for(Tuple vecTuple : vectorBag )
+ {
+ Object vectorObj = vecTuple.get(0);
+ RealVector v2 = null;
+ if(vectorObj instanceof Tuple)
+ {
+ v2 = DataTypeUtil.INSTANCE.convert((Tuple)vecTuple.get(0), referenceVector.getDimension());
+ }
+ else
+ {
+ v2 = DataTypeUtil.INSTANCE.convert(vecTuple, referenceVector.getDimension());
+ }
+ double dist = dist(referenceVector, v2);
+ if(dist < distanceRange)
+ {
+ return vecTuple;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Create the output schema, based on the input schema.
+ *
+ * @return the output schema, which is a tuple matching the schema of the third input field.
+ */
+ public Schema outputSchema(Schema input) {
+ try{
+ validateInputSchema(input);
+ FieldSchema fieldSchema = input.getField(2);
+ return fieldSchema.schema;
+
+ }catch (Exception e){
+ throw new RuntimeException("Unable to create output schema", e);
+ }
+ }
+
+ /**
+ * Validate the input schema to ensure that our input is consistent and that we fail fast.
+ * @param input
+ * @throws FrontendException
+ */
+ private void validateInputSchema(Schema input) throws FrontendException
+ {
+ {
+ FieldSchema vectorSchema = input.getField(0);
+ if(!DataTypeUtil.isValidVector(vectorSchema, dim))
+ {
+ throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
+ }
+ }
+
+ {
+ FieldSchema distanceSchema = input.getField(1);
+ if(distanceSchema.type != DataType.DOUBLE
+ && distanceSchema.type != DataType.INTEGER
+ && distanceSchema.type != DataType.LONG
+ )
+ {
+ throw new FrontendException("Invalid distance element: Expected a number, but found " + distanceSchema);
+ }
+ }
+
+ {
+ FieldSchema pointsSchema = input.getField(2);
+ if( pointsSchema.type != DataType.BAG)
+ {
+ throw new FrontendException("Invalid points element: Expected a bag, but found " + pointsSchema);
+ }
+ FieldSchema tupleInBag = pointsSchema.schema.getField(0);
+ FieldSchema vectorInTuple = tupleInBag.schema.getField(0);
+ if(!DataTypeUtil.isValidVector(vectorInTuple, dim))
+ {
+ throw new FrontendException("Invalid points element: Expected a bag of vectors, but found " + vectorInTuple.schema);
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
new file mode 100644
index 0000000..df8dbc7
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * UDFs for different {@link <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a>} (and some similarity functions)
+ * used with Locality Sensitive Hashing.
+ */
+package datafu.pig.hash.lsh.metric;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
new file mode 100644
index 0000000..0f3ba94
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * This is the base-class for all p-stable based locality sensitive hashes. p-stable locality sensitive
+ * hashes are defined by a few parameters: a dimension, d , a vector taken from a
+ * {@link <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>}
+ * (where k is 1 or 2) and a width of projection, w.
+ * <p>
+ * All p-stable LSH functions are parameterized with a quantization parameter (w or r in
+ * the literature , depending on where you look). Consider the following excerpt
+ * from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004).
+ * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
+ * Proceedings of the Symposium on Computational Geometry.
+ *
+ * <pre>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
+ * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+ * and in this way decrease the number of projections we need to make.
+ * </pre>
+ *
+ * In the literature, the quantization parameter (or width of the projection) is
+ * found empirically given a sample of the data and the likely threshold for
+ * the metric. Tuning this parameter is very important for the performance of
+ * this algorithm. For more information, see Datar, M.; Immorlica, N.; Indyk,
+ * P.; Mirrokni, V.S. (2004).
+ * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
+ * Proceedings of the Symposium on Computational Geometry.
+ *
+ * @author cstella
+ *
+ */
+public abstract class AbstractStableDistributionFunction extends LSH
+{
+
+ private double[] a;
+ private double b;
+ double w;
+
+
+ /**
+ * Constructs a new instance.
+ * @param dim The dimension of the vectors to be hashed
+ * @param w A double representing the quantization parameter (also known as the projection width)
+ * @param rand The random generator used
+ * @throws MathException
+ */
+ public AbstractStableDistributionFunction(int dim, double w, RandomGenerator rand) throws MathException
+ {
+ super(dim, rand);
+ reset(dim, w);
+ }
+
+ public void reset(int dim, double w) throws MathException
+ {
+ RandomDataImpl dataSampler = new RandomDataImpl(rg);
+ Sampler sampler = getSampler();
+ this.a = new double[dim];
+ this.dim = dim;
+ this.w = w;
+ for(int i = 0;i < dim;++i)
+ {
+ a[i] = sampler.sample(dataSampler);
+ }
+ b = dataSampler.nextUniform(0, w);
+ }
+
+
+ /**
+ * The sampler determines the metric which this LSH is associated with.
+ * A 1-stable sample will yield a LSH which corresponds to a L1 metric; likewise for 2-stable and L2.
+ * @return The sampler to use.
+ */
+ protected abstract Sampler
+ getSampler();
+
+ /**
+ * Compute the LSH for a given vector.
+ */
+ public long apply(RealVector vector)
+ {
+ /*
+ * The hash is just floor(<v, a>/w)
+ */
+ double ret = b;
+
+ for(int i = 0;i < dim;++i)
+ {
+ ret += vector.getEntry(i)*a[i];
+ }
+ return (long)Math.floor(ret/w);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
new file mode 100644
index 0000000..79bf7e5
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * A locality sensitive hash associated with the L1 metric. This uses a 1-stable distribution
+ * to construct the hash.
+ *
+ * @author cstella
+ *
+ */
+public class L1LSH extends AbstractStableDistributionFunction implements Sampler
+{
+ /**
+ * Constructs a new instance.
+ * @throws MathException
+ */
+ public L1LSH(int dim, double d, RandomGenerator rand) throws MathException {
+ super(dim, d, rand);
+ }
+
+ /**
+ * Draw a sample s ~ Cauchy(0,1), which is 1-stable.
+ *
+ * @return a sample from a cauchy distribution with median 0 and scale 1
+ */
+ public double sample(RandomDataImpl randomData) throws MathException {
+
+ return randomData.nextCauchy(0, 1);
+
+ }
+ @Override
+ protected Sampler getSampler() {
+ return this;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
new file mode 100644
index 0000000..d18b189
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * A locality sensitive hash associated with the L2 metric. This uses a 2-stable distribution
+ * to construct the hash.
+ *
+ * @author cstella
+ *
+ */
+public class L2LSH extends AbstractStableDistributionFunction implements Sampler {
+
+ /**
+ * Constructs a new instance.
+ * @throws MathException
+ */
+ public L2LSH(int dim, double w, RandomGenerator rand) throws MathException {
+ super(dim, w, rand);
+ }
+
+ /**
+ * Draw a sample s ~ Gaussian(0,1), which is 2-stable.
+ *
+ * @return a sample from a Gaussian distribution with mu of 0 and sigma of 1
+ */
+ public double sample(RandomDataImpl randomData)
+ {
+ return randomData.nextGaussian(0,1);
+ }
+
+ @Override
+ protected Sampler getSampler() {
+ return this;
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
new file mode 100644
index 0000000..ec9c313
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} for
+ * {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>}.
+ *
+ * See Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ *
+ */
+package datafu.pig.hash.lsh.p_stable;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
new file mode 100644
index 0000000..045ed0d
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * UDFs for {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}
+ */
+package datafu.pig.hash.lsh;