You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/05/14 19:01:14 UTC

[2/2] git commit: DATAFU-37 Adding LSH implementation.

DATAFU-37 Adding LSH implementation.

https://issues.apache.org/jira/browse/DATAFU-37

Signed-off-by: Matt Hayes <mh...@linkedin.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/544e5af0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/544e5af0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/544e5af0

Branch: refs/heads/master
Commit: 544e5af08c81671e8f14832fc3657de696cd313b
Parents: e15390f
Author: cstella <ce...@gmail.com>
Authored: Tue May 13 12:04:08 2014 -0400
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Wed May 14 10:00:28 2014 -0700

----------------------------------------------------------------------
 .../datafu/pig/hash/lsh/CosineDistanceHash.java | 185 +++++++
 .../java/datafu/pig/hash/lsh/L1PStableHash.java | 198 ++++++++
 .../java/datafu/pig/hash/lsh/L2PStableHash.java | 202 ++++++++
 .../java/datafu/pig/hash/lsh/LSHFamily.java     |  66 +++
 .../main/java/datafu/pig/hash/lsh/LSHFunc.java  | 149 ++++++
 .../java/datafu/pig/hash/lsh/RepeatingLSH.java  |  82 ++++
 .../pig/hash/lsh/cosine/HyperplaneLSH.java      |  88 ++++
 .../pig/hash/lsh/cosine/package-info.java       |  24 +
 .../datafu/pig/hash/lsh/interfaces/LSH.java     |  71 +++
 .../pig/hash/lsh/interfaces/LSHCreator.java     | 117 +++++
 .../datafu/pig/hash/lsh/interfaces/Sampler.java |  39 ++
 .../pig/hash/lsh/interfaces/package-info.java   |  23 +
 .../java/datafu/pig/hash/lsh/metric/Cosine.java |  68 +++
 .../java/datafu/pig/hash/lsh/metric/L1.java     |  57 +++
 .../java/datafu/pig/hash/lsh/metric/L2.java     |  55 +++
 .../datafu/pig/hash/lsh/metric/MetricUDF.java   | 170 +++++++
 .../pig/hash/lsh/metric/package-info.java       |  24 +
 .../AbstractStableDistributionFunction.java     | 119 +++++
 .../datafu/pig/hash/lsh/p_stable/L1LSH.java     |  60 +++
 .../datafu/pig/hash/lsh/p_stable/L2LSH.java     |  61 +++
 .../pig/hash/lsh/p_stable/package-info.java     |  27 +
 .../java/datafu/pig/hash/lsh/package-info.java  |  23 +
 .../datafu/pig/hash/lsh/util/DataTypeUtil.java  | 178 +++++++
 .../datafu/pig/hash/lsh/util/package-info.java  |  23 +
 .../datafu/test/pig/hash/lsh/LSHPigTest.java    | 492 +++++++++++++++++++
 .../java/datafu/test/pig/hash/lsh/LSHTest.java  | 268 ++++++++++
 26 files changed, 2869 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
new file mode 100644
index 0000000..1664362
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/CosineDistanceHash.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.cosine.HyperplaneLSH;
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data. 
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability 
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * 
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ * into the same buckets.  Each LSH maps a vector onto one side or the other of a random hyperplane, thereby producing a single
+ * bit as the hash value.  Multiple, independent, hashes can be run on the same input and aggregated together to form a more
+ * broad domain than a single bit.
+ * 
+ * For more information, see Charikar, Moses S.. (2002). "Similarity Estimation Techniques from Rounding Algorithms". Proceedings of the 34th Annual ACM Symposium on Theory of Computing 2002.
+ * 
+ * 
+ */
+public class CosineDistanceHash extends LSHFunc
+{
+    int dim;
+    long seed;
+    int repeat;
+    int numHashes;
+    
+    /**
+     * Locality sensitive hash that maps vectors onto 0,1 in such a way that colliding
+     * vectors are "near" one another according to cosine similarity with high probability.  
+     * 
+     * <p>
+     * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+     * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+     * 
+     * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+     * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+     * (i.e. the number of vectors within a distance according to cosine similarity).
+     * 
+     * <p>
+     * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+     * and find the nearest neighbors of the query points:
+     * <pre>
+     * -- Create a CosineDistanceHash of 
+     * --   3 dimensional data
+     * --   1500 internal hashes (being combined into one hash)
+     * --   family of 5 hashes
+     * --   with a seed of 0
+     * 
+     * -- This creates a bag of tuples:
+     * --   lsh_id:Integer the family ID (in this case, 0-4)
+     * --   hash:Long the hash 
+     * 
+     * define LSH datafu.pig.hash.lsh.CosineDistanceHash('3', '1500', '5', '0');
+     * define METRIC datafu.pig.hash.lsh.metric.L2();
+     *
+     * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+     * 
+     * --hash the input points
+     * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+     *                    , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+     * 
+     * -- the hash family ID and the hash should group the input points into partitions
+     * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+     * 
+     * -- take in the query points and hash them
+     * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+     * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+     *                        , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+     *                        ;
+     * 
+     * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+     * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+     * 
+     * -- Now, use the appropriate METRIC UDF (in this case Cosine distance) to find the first point within
+     * -- a parameterized threshold (in this case, .001).  It takes:
+     * --   query_pt:Tuple the query point
+     * --   threshold:Double the threshold, so that if the distance between the query point and a point
+     * --                    in the partition is less than this threshold, it returns the point (and stops searching)
+     * --   partition:Bag The bag of tuples in the partition.
+     * 
+     * tuples from 
+     * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+     *                                                 , METRIC(query_pt, .001, PTS_HASHED) as neighbor
+     *                                                 ;
+     * describe NEAR_NEIGHBORS;
+     * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+     * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+     * --            ,lsh::lsh_id: int
+     * --            ,lsh::hash: long
+     * --            )
+     * -- }
+     * 
+     * -- project out the query and the matching point
+     * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+     *  generate query_pt as query_pt, neighbor.pt as matching_pts;
+     * };
+     * 
+     * -- Filter out the hashes which resulted in no matches
+     * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+     * 
+     * -- group by the query
+     * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+     * describe NEIGHBORS_GRP;
+     * 
+     * -- Generate the query, the number of matches and the bag of matching points
+     * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+     *    MATCHING_PTS = foreach NOT_NULL generate FLATTEN(matching_pts);
+     *    DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+     *    generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+     * };
+     * describe NEIGHBOR_CNT;
+     * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+     * --               ,long
+     * --               ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+     * --                              }
+     * --               }
+     * STORE NEIGHBOR_CNT INTO 'neighbors';
+     * </pre>
+     * 
+     * 
+     * 
+     * @param sDim  Dimension of the vectors
+     * @param sRepeat Number of internal repetitions
+     * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+     * @param sSeed Seed to use when constructing LSH family
+     */
+    public CosineDistanceHash(String sDim, String sRepeat, String sNumHashes, String sSeed)
+    {
+      super(sSeed);
+      dim = Integer.parseInt(sDim);
+      repeat = Integer.parseInt(sRepeat);
+      numHashes = Integer.parseInt(sNumHashes);
+    }
+    
+    public CosineDistanceHash(String sDim, String sRepeat, String sNumHashes)
+    {
+       this(sDim, sRepeat, sNumHashes, null);
+    }
+
+    @Override
+  protected LSHCreator createLSHCreator() {
+    return new LSHCreator(dim, numHashes, repeat, getSeed())
+    {
+
+      @Override
+      protected LSH constructLSH(RandomGenerator rg) throws MathException {
+        return new HyperplaneLSH(dim, rg );
+      }
+      
+    };
+  }
+
+  @Override
+  protected int getDimension() {
+    return dim;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
new file mode 100644
index 0000000..2dd42cd
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L1PStableHash.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.p_stable.L1LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data. 
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability 
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * 
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to the {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1</a>}
+ * distance metric into the same buckets.  This implementation uses a 1-stable distribution (a Cauchy distribution) in order
+ * to accomplish this.
+ * 
+ * For more information, see Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ * 
+ */
+public class L1PStableHash extends LSHFunc
+{
+    int dim;
+    double w;
+   
+    int repeat;
+    int numHashes;
+    
+    /**
+     * Locality sensitive hash that maps vectors onto a long in such a way that colliding
+     * vectors are "near" one another according to cosine similarity with high probability.  
+     * 
+     * <p>
+     * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+     * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+     * 
+     * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+     * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+     * (i.e. the number of vectors within a distance according to cosine similarity).
+     * 
+     * This UDF, indeed all p-stable LSH functions, are parameterized with a quantization parameter (w or r in the literature
+     * , depending on where you look).  Consider the following excerpt from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+     * 
+     * <pre>
+     * Decreasing the width of the projection (w) decreases the probability of collision for any two points. 
+     * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+     * and in this way decrease the number of projections we need to make. 
+     * </pre>
+     * 
+     * In the literature, the quantization parameter (or width of the projection) is found empirically given a sample of
+     * the data and the likely threshold of for the metric.  Tuning this parameter is very important for the performance
+     * of this algorithm.
+     * 
+     * <p>
+     * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+     * and find the nearest neighbors of the query points:
+     * <pre>
+     * -- Create a L1PStableHash of 
+     * --   3 dimensional data
+     * --   projection width of 150
+     * --   1 internal hashes 
+     * --   family of 5 hashes
+     * --   with a seed of 0
+     * 
+     * -- This creates a bag of tuples:
+     * --   lsh_id:Integer the family ID (in this case, 0-4)
+     * --   hash:Long the hash 
+     * 
+     * define LSH datafu.pig.hash.lsh.L1PStableHash('3', '150', '1', '5', '0');
+     * define METRIC datafu.pig.hash.lsh.metric.L1();
+     *
+     * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+     * 
+     * --hash the input points
+     * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+     *                    , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+     * 
+     * -- the hash family ID and the hash should group the input points into partitions
+     * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+     * 
+     * -- take in the query points and hash them
+     * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+     * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+     *                        , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+     *                        ;
+     * 
+     * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+     * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+     * 
+     * -- Now, use the appropriate METRIC UDF (in this case L1 (aka city block) distance) to find the first point within
+     * -- a parameterized threshold (in this case, 1000).  It takes:
+     * --   query_pt:Tuple the query point
+     * --   threshold:Double the threshold, so that if the distance between the query point and a point
+     * --                    in the partition is less than this threshold, it returns the point (and stops searching)
+     * --   partition:Bag The bag of tuples in the partition.
+     * 
+     *  
+     * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+     *                                                 , METRIC(query_pt, 1000, PTS_HASHED) as neighbor
+     *                                                 ;
+     * describe NEAR_NEIGHBORS;
+     * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+     * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+     * --            ,lsh::lsh_id: int
+     * --            ,lsh::hash: long
+     * --            )
+     * -- }
+     * 
+     * -- project out the query and the matching point
+     * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+     *  generate query_pt as query_pt, neighbor.pt as matching_pts;
+     * };
+     * 
+     * -- Filter out the hashes which resulted in no matches
+     * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+     * 
+     * -- group by the query
+     * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+     * describe NEIGHBORS_GRP;
+     * 
+     * -- Generate the query, the number of matches and the bag of matching points
+     * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+     *    DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+     *    generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+     * };
+     * describe NEIGHBOR_CNT;
+     * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+     * --               ,long
+     * --               ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+     * --                              }
+     * --               }
+     * STORE NEIGHBOR_CNT INTO 'neighbors';
+     * </pre>
+     * 
+     * 
+     * 
+     * @param sDim  Dimension of the vectors
+     * @param sW A double representing the quantization parameter (also known as the projection width)
+     * @param sRepeat Number of internal repetitions (generally this should be 1 as the p-stable hashes have a larger range than one bit)
+     * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+     * @param sSeed Seed to use when constructing LSH family
+     */
+    public L1PStableHash(String sDim, String sW, String sRepeat, String sNumHashes, String sSeed)
+    {
+      super(sSeed);
+      dim = Integer.parseInt(sDim);
+      w = Double.parseDouble(sW);
+      repeat = Integer.parseInt(sRepeat); 
+      numHashes = Integer.parseInt(sNumHashes);
+    }
+    
+    public L1PStableHash(String sDim, String sW, String sRepeat, String sNumHashes)
+    {
+       this(sDim, sW, sRepeat, sNumHashes, null);
+    }
+
+    @Override
+    protected LSHCreator createLSHCreator() {
+      return new LSHCreator(dim, numHashes, repeat, getSeed())
+      {
+  
+        @Override
+        protected LSH constructLSH(RandomGenerator rg) throws MathException {
+          return new L1LSH(dim, w, rg );
+        }
+        
+      };
+    
+    }
+    @Override
+    protected int getDimension() {
+      return dim;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
new file mode 100644
index 0000000..588f199
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/L2PStableHash.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.p_stable.L2LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data. 
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability 
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * 
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to the {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2</a>}
+ * distance metric into the same buckets.  This implementation uses a 2-stable distribution (a Gaussian distribution) in order
+ * to accomplish this.
+ * 
+ * For more information, see Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ * 
+ */
+public class L2PStableHash extends LSHFunc{
+  int dim;
+    double w;
+    long seed;
+    int repeat;
+    int numHashes;
+    
+    /**
+     * Locality sensitive hash that maps vectors onto a long in such a way that colliding
+     * vectors are "near" one another according to cosine similarity with high probability.  
+     * 
+     * <p>
+     * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+     * The number of functions which you want to internally repeat is specified by the sRepeat parameter.
+     * 
+     * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+     * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+     * (i.e. the number of vectors within a distance according to cosine similarity).
+     * 
+     * This UDF, indeed all p-stable LSH functions are parameterized with a quantization parameter (w or r in the literature
+     * , depending on where you look).  Consider the following excerpt from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+     * 
+     * <pre>
+     * Decreasing the width of the projection (w) decreases the probability of collision for any two points. 
+     * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+     * and in this way decrease the number of projections we need to make. 
+     * </pre>
+     * 
+     * In the literature, the quantization parameter (or width of the projection) is found empirically given a sample of
+     * the data and the likely threshold of for the metric.  Tuning this parameter is very important for the performance
+     * of this algorithm.
+     * 
+     * <p>
+     * Consider the following example where we input some 3-dimensional points and a set of 3-dimensional queries
+     * and find the nearest neighbors of the query points:
+     * <pre>
+     * -- Create a L2PStableHash of 
+     * --   3 dimensional data
+     * --   projection width of 200
+     * --   1 internal hashes 
+     * --   family of 5 hashes
+     * --   with a seed of 0
+     * 
+     * -- This creates a bag of tuples:
+     * --   lsh_id:Integer the family ID (in this case, 0-4)
+     * --   hash:Long the hash 
+     * 
+     * define LSH datafu.pig.hash.lsh.L2PStableHash('3', '200', '1', '5', '0');
+     * define METRIC datafu.pig.hash.lsh.metric.L2();
+     *
+     * PTS = LOAD 'input' AS (dim1:double, dim2:double, dim3:double);
+     * 
+     * --hash the input points
+     * PTS_HASHED = foreach PTS generate TOTUPLE(dim1, dim2, dim3) as pt
+     *                    , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)));
+     * 
+     * -- the hash family ID and the hash should group the input points into partitions
+     * PARTITIONS = group PTS_HASHED by (lsh_id, hash);
+     * 
+     * -- take in the query points and hash them
+     * QUERIES = LOAD 'queries' as (dim1:double, dim2:double, dim3:double);
+     * QUERIES_HASHED = foreach QUERIES generate TOTUPLE(dim1, dim2, dim3) as query_pt
+     *                        , FLATTEN(LSH(TOTUPLE(dim1, dim2, dim3)))
+     *                        ;
+     * 
+     * -- join the hashed query points with the (presumably larger) list of input data split by partitions
+     * QUERIES_W_PARTS = join QUERIES_HASHED by (lsh_id, hash), PARTITIONS by (group.$0, group.$1);
+     * 
+     * -- Now, use the appropriate METRIC UDF (in this case L2 (aka Euclidean) distance) to find the first point within
+     * -- a parameterized threshold (in this case, 1000).  It takes:
+     * --   query_pt:Tuple the query point
+     * --   threshold:Double the threshold, so that if the distance between the query point and a point
+     * --                    in the partition is less than this threshold, it returns the point (and stops searching)
+     * --   partition:Bag The bag of tuples in the partition.
+     * 
+     * tuples from 
+     * NEAR_NEIGHBORS = foreach QUERIES_W_PARTS generate query_pt as query_pt
+     *                                                 , METRIC(query_pt, 1000, PTS_HASHED) as neighbor
+     *                                                 ;
+     * describe NEAR_NEIGHBORS;
+     * describe NEAR_NEIGHBORS;
+     * -- {query_pt: (dim1: double,dim2: double,dim3: double)
+     * -- ,neighbor: (pt: (dim1: double,dim2: double,dim3: double)
+     * --            ,lsh::lsh_id: int
+     * --            ,lsh::hash: long
+     * --            )
+     * -- }
+     * 
+     * -- project out the query and the matching point
+     * NEIGHBORS_PROJ = foreach NEAR_NEIGHBORS {
+     *  generate query_pt as query_pt, neighbor.pt as matching_pts;
+     * };
+     * 
+     * -- Filter out the hashes which resulted in no matches
+     * NOT_NULL = filter NEIGHBORS_PROJ by SIZE(matching_pts) > 0;
+     * 
+     * -- group by the query
+     * NEIGHBORS_GRP = group NOT_NULL by query_pt;
+     * describe NEIGHBORS_GRP;
+     * 
+     * -- Generate the query, the number of matches and the bag of matching points
+     * NEIGHBOR_CNT = foreach NEIGHBORS_GRP{
+     *    MATCHING_PTS = foreach NOT_NULL generate FLATTEN(matching_pts);
+     *    DIST_MATCHING_PTS = DISTINCT MATCHING_PTS;
+     *    generate group as query_pt, COUNT(NOT_NULL), DIST_MATCHING_PTS;
+     * };
+     * describe NEIGHBOR_CNT;
+     * -- NEIGHBOR_CNT: {query_pt: (dim1: double,dim2: double,dim3: double)
+     * --               ,long
+     * --               ,DIST_MATCHING_PTS: { (matching_pts::dim1: double,matching_pts::dim2: double,matching_pts::dim3: double)
+     * --                              }
+     * --               }
+     * STORE NEIGHBOR_CNT INTO 'neighbors';
+     * </pre>
+     * 
+     * 
+     * 
+     * @param sDim  Dimension of the vectors
+     * @param sW A double representing the quantization parameter (also known as the projection width)
+     * @param sRepeat Number of internal repetitions (generally this should be 1 as the p-stable hashes have a larger range than one bit)
+     * @param sNumHashes Size of the hash family (if you're looking for k near neighbors, this is the k)
+     * @param sSeed Seed to use when constructing LSH family
+     */
+    public L2PStableHash(String sDim, String sW, String sRepeat, String sNumHashes, String sSeed)
+    {
+      super(sSeed);
+      dim = Integer.parseInt(sDim);
+      w = Double.parseDouble(sW);
+      repeat = Integer.parseInt(sRepeat);  
+      numHashes = Integer.parseInt(sNumHashes);
+    }
+    
+    public L2PStableHash(String sDim, String sW, String sRepeat, String sNumHashes)
+    {
+       this(sDim, sW, sRepeat, sNumHashes, null);
+    }
+
+
+  @Override
+  protected LSHCreator createLSHCreator() {
+    return new LSHCreator(dim, numHashes, repeat, getSeed())
+    {
+
+      @Override
+      protected LSH constructLSH(RandomGenerator rg) throws MathException {
+        return new L2LSH(dim, w, rg );
+      }
+      
+    };
+  }
+
+
+  @Override
+  protected int getDimension() {
+    return dim;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
new file mode 100644
index 0000000..394109c
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFamily.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.util.List;
+
+import org.apache.commons.math.linear.RealVector;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * A family of k locality sensitive hashes.  For a given point, k hashes will be computed.
+ * @author cstella
+ *
+ */
+public class LSHFamily {
+
+  private List<LSH> hashes;
+  /**
+   * Construct a family of hashes
+   * @param hashes Hashes which will be applied in turn to a given point
+   */
+  public LSHFamily(List<LSH> hashes) 
+  {
+    this.hashes = hashes;
+  }
+
+  /**
+   * Compute the family of k-hashes for a vector.
+   * 
+   * @param vector
+   * @return An iterable of hashes
+   */
+  public Iterable<Long> apply(final RealVector vector) {
+    return Iterables.transform(hashes, new Function<LSH, Long>()
+        {
+          @Override
+          public Long apply(LSH lsh)
+          {
+            return lsh.apply(vector);
+          }
+        }
+                 );
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
new file mode 100644
index 0000000..a7ab070
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/LSHFunc.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+import org.apache.pig.impl.util.UDFContext;
+
+import datafu.pig.hash.lsh.interfaces.LSHCreator;
+import datafu.pig.hash.lsh.util.DataTypeUtil;
+
+/**
+ * The base UDF for locality sensitive hashing.  
+ * @author cstella
+ *
+ */
+public abstract class LSHFunc extends EvalFunc<DataBag>
+{
+  protected LSHFamily lsh = null;
+  protected Long seed;
+  private TupleFactory mTupleFactory = TupleFactory.getInstance();
+  private BagFactory mBagFactory = BagFactory.getInstance();
+  private LSHCreator lshCreator = null;
+  protected abstract LSHCreator createLSHCreator();
+  protected abstract int getDimension();
+  
+  public LSHFunc(String sSeed)
+  {
+    if(sSeed == null)
+    {
+      seed = null;
+    }
+    else
+    {
+      seed = Long.parseLong(sSeed);
+    }
+  }
+    /**
+     * The expectation is that the input is a Tuple of numeric data representing a real vector.
+   * 
+   * @return DataBag containing tuples which have the lsh_id (the hash ID in the family) and the hash
+     */
+  @Override
+  public DataBag exec(Tuple t) throws IOException
+  {
+    if (lsh == null) {
+      try {
+        lshCreator = createLSHCreator();
+        lsh = lshCreator.constructFamily(lshCreator.createGenerator());
+      } catch (MathException e) {
+        throw new RuntimeException("Unable to construct LSH!", e);
+      }
+    }
+
+    RealVector r = null;
+    try {
+      r = DataTypeUtil.INSTANCE.convert(t, lshCreator.getDim());
+    } catch (ExecException e) {
+      throw new IllegalStateException("Unable to convert tuple: "
+          + t.toString() + " to RealVector");
+    }
+    DataBag ret = mBagFactory.newDefaultBag();
+    
+    int idx = 0;
+    for(Long hash : lsh.apply(r))
+    {
+      Tuple out = mTupleFactory.newTuple(2);
+      out.set(0, idx++);
+      out.set(1,  hash);
+      ret.add(out);
+    }
+    return ret;
+  }
+  
+  protected long getSeed()
+  {
+    if(seed == null)
+    {
+      UDFContext context = UDFContext.getUDFContext();
+      return Long.parseLong(context.getUDFProperties(this.getClass()).getProperty("seed"));
+    }
+    else
+    {
+      return seed;
+    }
+  }
+  
+  /**
+   * The schema returned here is a bag containing pairs.  
+   * The pairs are the lsh_id (or the ID of the hash) and the hash value.
+   */
+   public Schema outputSchema(Schema input) {
+         try {
+           validateInputSchema(input);
+           long randomSeed = new Random().nextLong();
+           UDFContext context = UDFContext.getUDFContext();
+           context.getUDFProperties(this.getClass()).setProperty("seed", "" + randomSeed);
+           Schema bagSchema = new Schema();
+           bagSchema.add(new Schema.FieldSchema("lsh_id", DataType.INTEGER));
+           bagSchema.add(new Schema.FieldSchema("hash", DataType.LONG));
+           return new Schema(new Schema.FieldSchema("lsh", bagSchema, DataType.BAG));
+         }catch (Exception e){
+            throw new RuntimeException("Unable to create output schema", e);
+         }
+   }
+   /**
+    * Validate the input schema to ensure that our input is consistent and that we fail fast.
+    * @param input
+    * @throws FrontendException
+    */
+   private void validateInputSchema(Schema input) throws FrontendException
+   {
+     FieldSchema vectorSchema = input.getField(0);
+     if(!DataTypeUtil.isValidVector(vectorSchema, getDimension()))
+     {
+       throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
+     }
+   }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
new file mode 100644
index 0000000..7f59e83
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/RepeatingLSH.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh;
+
+import java.util.List;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.ArrayRealVector;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomData;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * A composite hash which takes multiple hashes and composes them.  This is useful
+ * when a locality sensitive hash has a range that is very narrow and would want a wider
+ * range for the hash.
+ * 
+ * 
+ * @author cstella
+ *
+ */
+public class RepeatingLSH extends LSH
+{
+  private List<LSH> lshList;
+  private RealVector randomVec;
+  
+  public RepeatingLSH(List<LSH> lshList) throws MathException
+  {
+    super(lshList.get(0).getDim(), lshList.get(0).getRandomGenerator());
+    this.lshList = lshList;
+    RandomGenerator rg = lshList.get(0).getRandomGenerator();
+    RandomData rd = new RandomDataImpl(rg);
+    /*
+     * Compute a random vector of lshList.size() with each component taken from U(0,10)
+     */
+    randomVec = new ArrayRealVector(lshList.size());
+    for(int i = 0; i < randomVec.getDimension();++i)
+    {
+      randomVec.setEntry(i, rd.nextUniform(0, 10.0));
+    }
+  }
+  
+
+  /**
+   * Compute the aggregated hash of a vector.  This is done by taking the output of the k hashes as a
+   * vector and computing the dot product with a vector of uniformly distributed components between 0 and 10.
+   * 
+   * So, consider a_{0,k} such that a_i ~ U(0,10) and hash functions h_{0,k} treated as 2 k-dimensional vectors,
+   * we return the dot product of h(v) and a.
+   * 
+   */
+  public long apply(RealVector vector) {
+    long res = 0;
+    
+    for(int i = 0;i < lshList.size();++i)
+    {
+      res += randomVec.getEntry(i)* lshList.get(i).apply(vector);
+    }
+    return res;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
new file mode 100644
index 0000000..deff98e
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/HyperplaneLSH.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.cosine;
+
+import org.apache.commons.math.linear.ArrayRealVector;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomGenerator;
+import org.apache.commons.math.random.UnitSphereRandomVectorGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+
+/**
+ * From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data. 
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability 
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * 
+ * In particular, this implementation implements a locality sensitive hashing scheme which maps high-dimensional vectors which are
+ * close together (with high probability) according to {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ * into the same buckets.  Each LSH maps a vector onto one side or the other of a random hyperplane, thereby producing a single
+ * bit as the hash value.  Multiple, independent, hashes can be run on the same input and aggregated together to form a more
+ * broad domain than a single bit.
+ * 
+ * For more information, see Charikar, Moses S.. (2002). "Similarity Estimation Techniques from Rounding Algorithms". Proceedings of the 34th Annual ACM Symposium on Theory of Computing 2002.
+ * 
+ * 
+ */
+public class HyperplaneLSH extends LSH
+{
+   
+ 
+    private RealVector r;
+    
+    /**
+     * Locality sensitive hash that maps vectors onto 0,1 in such a way that colliding
+     * vectors are "near" one another according to cosine similarity with high probability.  
+     * 
+     * <p>
+     * Generally, multiple LSH are combined via repetition to increase the range of the hash function to the full set of longs.
+     * This repetition is accomplished by wrapping instances of the LSH in a LSHFamily, which does the combination.
+     * 
+     * The size of the hash family corresponds to the number of independent hashes you want to apply to the data.
+     * In a k-near neighbors style of searching, this corresponds to the number of neighbors you want to find
+     * (i.e. the number of vectors within a distance according to cosine similarity).
+     */
+    public HyperplaneLSH(int dim, RandomGenerator rg)
+    {
+        super(dim, rg);
+        
+        UnitSphereRandomVectorGenerator generator = new UnitSphereRandomVectorGenerator(dim, rg);
+        //compute our vector representing a hyperplane of dimension dim by taking a random vector
+        //located on the unit sphere
+        double[] normalVector = generator.nextVector();
+        r = new ArrayRealVector(normalVector);
+    }
+  
+
+    /**
+     * Compute which side of the hyperplane that the parameter is on.  
+     * 
+     * @param vector The vector to test.
+     * @return one if the dot product with the hyperplane is positive, 0 if negative.
+     */
+    public long apply(RealVector vector)
+    {
+        return r.dotProduct(vector) >= 0?1:0;
+    }
+    
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
new file mode 100644
index 0000000..2e44920
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/cosine/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} 
+ * for {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}
+ */
+package datafu.pig.hash.lsh.cosine;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
new file mode 100644
index 0000000..69d4043
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSH.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomGenerator;
+
+/**
+ * An abstract class representing a locality sensitive hash. From wikipedia's article on {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}:
+ * <pre>
+ * Locality-sensitive hashing (LSH) is a method of performing probabilistic dimension reduction of high-dimensional data. 
+ * The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability 
+ * (the number of buckets being much smaller than the universe of possible input items).
+ * </pre>
+ * @author cstella
+ *
+ */
+public abstract class LSH 
+{
+  protected RandomGenerator rg;
+  protected int dim;
+  
+  /**
+   * Construct a locality sensitive hash.  Note, one may pass a pre-seeded generator.
+   * 
+   * @param dim The dimension of the vectors which are to be hashed
+   * @param rg The random generator to use internally.
+   */
+  public LSH(int dim, RandomGenerator rg)
+  {
+    this.dim = dim;
+    this.rg = rg;
+  }
+  
+  /**
+   * 
+   * @return The random generator from this LSH
+   */
+  public RandomGenerator getRandomGenerator() { return rg;}
+  /**
+   * 
+   * @return The dimension of the vectors which this LSh supports
+   */
+  public int getDim() { return dim; }
+  
+  /**
+   * Hash a vector.
+   * 
+   * @param vector A vector to be hashed
+   * @return A hash which collides with vectors close according to some metric (implementation dependent).
+   */
+  public abstract long apply(RealVector vector);
+  
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
new file mode 100644
index 0000000..e3dc5b4
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/LSHCreator.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.JDKRandomGenerator;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.LSHFamily;
+import datafu.pig.hash.lsh.RepeatingLSH;
+
+/**
+ * Create a Locality sensitive hash.
+ * 
+ * @author cstella
+ *
+ */
+public abstract class LSHCreator 
+{
+  private int dim;
+  private int numHashes;
+  private int numInternalRepetitions;
+  private long seed;
+  
+  /**
+   * Create a LSHCreator
+   * 
+   * @param dim The dimension of the vectors which the LSH will hash
+   * @param numHashes The number of locality sensitive hashes to create
+   * @param numInternalRepetitions Each locality sensitive hash is a composite of numInternalRepetitions LSHes (this is done to increase range of the LSH)
+   * @param seed The seed to use
+   */
+  public LSHCreator(int dim, int numHashes, int numInternalRepetitions, long seed)
+  {
+    this.dim = dim;
+    this.numHashes = numHashes;
+    this.numInternalRepetitions = numInternalRepetitions;
+    this.seed = seed;
+  }
+  protected abstract LSH constructLSH(RandomGenerator rg) throws MathException;
+  public int getDim() { return dim;}
+  public RandomGenerator createGenerator() 
+  {
+    RandomGenerator rg = new JDKRandomGenerator();
+    rg.setSeed(seed);
+    return rg;
+  }
+  
+  /**
+   * 
+   * @return The number of locality sensitive hashes to create
+   */
+  public int getNumHashes()
+  {
+    return numHashes;
+  }
+  /**
+   * Each locality sensitive hash is a composite of numInternalRepetitions LSHes (this is done to increase range of the LSH)
+   * @return The number of internal repetitions
+   */
+  public int getNumInternalRepetitions()
+  {
+    return numInternalRepetitions;
+  }
+  
+  
+  /**
+   * 
+   * @param rg The random generator to use when constructing the family
+   * @return The family of locality sensitive hashes
+   * @throws MathException
+   */
+  public  LSHFamily constructFamily(RandomGenerator rg) throws MathException
+    { 
+      List<LSH> hashes = new ArrayList<LSH>();
+      for(int i = 0;i < getNumHashes();++i)
+      {
+        LSH lsh = null;
+        if(getNumInternalRepetitions() == 1)
+        {
+          // in the situation of 1, we don't do a composite of 1..we just pass the raw LSH back
+          lsh = constructLSH(rg);
+        }
+        else
+        {
+          List<LSH> lshFamily = new ArrayList<LSH>();
+          for(int j = 0;j < getNumInternalRepetitions();++j)
+          {
+            lshFamily.add(constructLSH(rg));
+          }
+          lsh = new RepeatingLSH(lshFamily);
+        }
+        hashes.add(lsh);
+      }
+      return new LSHFamily(hashes);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
new file mode 100644
index 0000000..0c57a0d
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/Sampler.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.interfaces;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+
+/**
+ * A helper interface to sample from a distribution specified by a RandomDataImpl
+ * @author cstella
+ *
+ */
+public interface Sampler {
+  /**
+   * Generate a sample
+   * 
+   * @param randomData The distribution used
+   * @return A sample
+   * @throws MathException
+   */
+  public double sample(RandomDataImpl randomData) throws MathException ;
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
new file mode 100644
index 0000000..5357473
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/interfaces/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Interfaces used in the implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}.
+ */
+package datafu.pig.hash.lsh.interfaces;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
new file mode 100644
index 0000000..22222a1
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/Cosine.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
+ * 
+ *  It returns one of the tuples of the bag of vectors.  The metric used is 
+ * {@link <a href="http://en.wikipedia.org/wiki/Cosine_similarity" target="_blank">Cosine Similarity</a>}, 
+ * which technically does not form a metric, but I'm stretching the definition here.
+ * 
+ * @see datafu.pig.hash.lsh.CosineDistanceHash CosineDistanceHash for an example
+ * @author cstella
+ *
+ */
+public class Cosine extends MetricUDF {
+  
+  /**
+   * Create a new Cosine Metric UDF with a given dimension.
+   * 
+   * @param sDim
+   */
+  public Cosine(String sDim) {
+    super(sDim); 
+  }
+  
+  /**
+   * Cosine similarity.
+   * @param v1
+   * @param v2
+   * @return The cosine of the angle between the vectors
+   */
+  public static double distance(RealVector v1, RealVector v2) {
+    return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
+  }
+
+  /**
+   * Cosine similarity.
+   * @param v1
+   * @param v2
+   * @return Roughly the cosine of the angle between the vectors
+   */
+  @Override
+  protected double dist(RealVector v1, RealVector v2) {
+    return distance(v1, v2);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
new file mode 100644
index 0000000..311c2ed
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L1.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
+ * 
+ *  It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Taxicab_geometry" target="_blank">L1 distance</a>}, 
+ * distance between two vectors.  This is otherwise known as
+ * the manhattan distance, taxicab distance or city block distance.
+ * 
+ * @see datafu.pig.hash.lsh.L1PStableHash L1PStableHash for an example
+ * @author cstella
+ *
+ */
+public class L1 extends MetricUDF {
+
+  /**
+   * Create a new L1 Metric UDF with a given dimension.
+   * 
+   * @param sDim
+   */
+  public L1(String sDim) {
+    super(sDim);
+   
+  }
+  
+  public static double distance(RealVector v1, RealVector v2) {
+    return v1.getL1Distance(v2);
+  }
+
+  @Override
+  protected double dist(RealVector v1, RealVector v2) {
+    return distance(v1, v2);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
new file mode 100644
index 0000000..d01f9fb
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import org.apache.commons.math.linear.RealVector;
+/**
+ * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
+ * 
+ *  It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>}, 
+ * distance between two vectors.  This is otherwise known as
+ * the Euclidean distance.
+ * 
+ * @see datafu.pig.hash.lsh.L2PStableHash L2PStableHash for an example
+ * @author cstella
+ *
+ */
+public class L2 extends MetricUDF {
+
+  /**
+   * Create a new L2 Metric UDF with a given dimension.
+   * 
+   * @param sDim
+   */
+  public L2(String sDim) {
+    super(sDim);
+   
+  }
+
+  public static double distance(RealVector v1, RealVector v2) {
+    return v1.getDistance(v2);
+  }
+  
+  @Override
+  protected double dist(RealVector v1, RealVector v2) {
+    return distance(v1, v2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
new file mode 100644
index 0000000..da00a60
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.metric;
+
+import java.io.IOException;
+
+import org.apache.commons.math.linear.RealVector;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+
+import datafu.pig.hash.lsh.util.DataTypeUtil;
+
+/**
+ * A base UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
+ * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
+ * 
+ *  It returns one of the tuples of the bag of vectors.  For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+ * 
+ * @see datafu.pig.hash.lsh.CosineDistanceHash
+ * @author cstella
+ *
+ */
+public abstract class MetricUDF extends EvalFunc<Tuple>
+{
+  protected int dim;
+  
+  /**
+   * Create a new Metric UDF with a given dimension.
+   * 
+   * @param sDim
+   */
+  public MetricUDF(String sDim)
+  {
+    dim = Integer.parseInt(sDim);
+  }
+  
+  /**
+   * The distance metric used.  Given v1 and v2, compute the distance between those vectors.
+   * @param v1 vector
+   * @param v2 vector
+   * @return the distance between v1 and v2
+   */
+  protected abstract double dist(RealVector v1, RealVector v2);
+  
+  /**
+   * This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors.
+   * Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value
+   * in the case of sparse vectors.
+   * 
+   * It returns one of the tuples of the bag of vectors.  For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+   * 
+   * @see datafu.pig.hash.lsh.CosineDistanceHash
+   */
+  @Override
+  public Tuple exec(Tuple input) throws IOException {
+    Object firstElement = input.get(0);
+    double distanceRange = ((Number)input.get(1)).doubleValue();
+    DataBag vectorBag = (DataBag)input.get(2);
+    RealVector referenceVector = null;
+    if(firstElement instanceof Tuple)
+    {
+      //in which case the first element is a non-sparse tuple
+      referenceVector = DataTypeUtil.INSTANCE.convert((Tuple)firstElement, dim);
+    }
+    else {
+      //in which case the first element is a bag, representing a sparse tuple
+      referenceVector = DataTypeUtil.INSTANCE.convert(input, dim);
+    }
+    
+    for(Tuple vecTuple : vectorBag )
+    {
+      Object vectorObj = vecTuple.get(0);
+      RealVector v2 = null;
+      if(vectorObj instanceof Tuple)
+      {
+        v2 = DataTypeUtil.INSTANCE.convert((Tuple)vecTuple.get(0), referenceVector.getDimension());
+      }
+      else
+      {
+        v2 = DataTypeUtil.INSTANCE.convert(vecTuple, referenceVector.getDimension());
+      }
+      double dist = dist(referenceVector, v2);
+      if(dist < distanceRange)
+      {
+        return vecTuple;
+      }
+    }
+    return null;
+  }
+  
+  /**
+   * Create the output schema, based on the input schema.
+   * 
+   * @return the output schema, which is a tuple matching the schema of the third input field.
+   */
+   public Schema outputSchema(Schema input) {
+          try{
+            validateInputSchema(input);
+            FieldSchema fieldSchema = input.getField(2);
+            return fieldSchema.schema;
+             
+          }catch (Exception e){
+                 throw new RuntimeException("Unable to create output schema", e);
+          }
+   }
+   
+   /**
+    * Validate the input schema to ensure that our input is consistent and that we fail fast.
+    * @param input
+    * @throws FrontendException
+    */
+   private void validateInputSchema(Schema input) throws FrontendException
+   {
+     {
+       FieldSchema vectorSchema = input.getField(0);
+       if(!DataTypeUtil.isValidVector(vectorSchema, dim))
+       {
+         throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
+       }
+     }
+     
+     {
+       FieldSchema distanceSchema = input.getField(1);
+       if(distanceSchema.type != DataType.DOUBLE 
+       && distanceSchema.type != DataType.INTEGER 
+       && distanceSchema.type != DataType.LONG 
+       )
+       {
+         throw new FrontendException("Invalid distance element: Expected a number, but found " + distanceSchema);
+       }
+     }
+     
+     {
+       FieldSchema pointsSchema = input.getField(2);
+       if( pointsSchema.type != DataType.BAG)
+       {
+         throw new FrontendException("Invalid points element: Expected a bag, but found " + pointsSchema);
+       }
+       FieldSchema tupleInBag = pointsSchema.schema.getField(0);
+       FieldSchema vectorInTuple = tupleInBag.schema.getField(0);
+       if(!DataTypeUtil.isValidVector(vectorInTuple, dim))
+       {
+         throw new FrontendException("Invalid points element: Expected a bag of vectors, but found " + vectorInTuple.schema);
+       }
+     }
+   }
+   
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
new file mode 100644
index 0000000..df8dbc7
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * UDFs for different {@link <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a>} (and some similarity functions)
+ * used with Locality Sensitive Hashing.
+ */
+package datafu.pig.hash.lsh.metric;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
new file mode 100644
index 0000000..0f3ba94
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.linear.RealVector;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.LSH;
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * This is the base-class for all p-stable based locality sensitive hashes. p-stable locality sensitive
+ * hashes are defined by a few parameters: a dimension, d , a vector taken from a 
+ * {@link <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>} 
+ * (where k is 1 or 2) and a width of projection, w.
+ * <p>
+ * All p-stable LSH functions are parameterized with a quantization parameter (w or r in
+ * the literature , depending on where you look). Consider the following excerpt
+ * from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004).
+ * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
+ * Proceedings of the Symposium on Computational Geometry.
+ * 
+ * <pre>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points. 
+ * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
+ * and in this way decrease the number of projections we need to make.
+ * </pre>
+ * 
+ * In the literature, the quantization parameter (or width of the projection) is
+ * found empirically given a sample of the data and the likely threshold for
+ * the metric. Tuning this parameter is very important for the performance of
+ * this algorithm. For more information, see Datar, M.; Immorlica, N.; Indyk,
+ * P.; Mirrokni, V.S. (2004).
+ * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
+ * Proceedings of the Symposium on Computational Geometry.
+ * 
+ * @author cstella
+ * 
+ */
+public abstract class AbstractStableDistributionFunction extends LSH
+{
+  
+   private double[] a;
+   private double b;
+   double w;
+
+ 
+   /**
+    * Constructs a new instance.
+    * @param dim The dimension of the vectors to be hashed
+    * @param w A double representing the quantization parameter (also known as the projection width)
+    * @param rand The random generator used 
+    * @throws MathException 
+    */
+   public AbstractStableDistributionFunction(int dim, double w, RandomGenerator rand) throws MathException
+   {
+     super(dim, rand);
+     reset(dim, w); 
+   }
+
+   public void reset(int dim, double w) throws MathException
+   {
+      RandomDataImpl dataSampler = new RandomDataImpl(rg);
+      Sampler sampler = getSampler();      
+      this.a = new double[dim];
+      this.dim = dim;
+      this.w = w;
+      for(int i = 0;i < dim;++i)
+      {
+         a[i] = sampler.sample(dataSampler);
+      }
+      b = dataSampler.nextUniform(0, w);
+   }
+
+
+   /**
+    * The sampler determines the metric which this LSH is associated with.
+    * A 1-stable sample will yield a LSH which corresponds to a L1 metric; likewise for 2-stable and L2.
+    * @return The sampler to use. 
+    */
+   protected abstract Sampler 
+   getSampler();
+   
+   /**
+    * Compute the LSH for a given vector.
+    */
+   public long apply(RealVector vector)
+   {
+     /*
+      * The hash is just floor(<v, a>/w)
+      */
+      double ret = b;
+     
+      for(int i = 0;i < dim;++i)
+      {
+         ret += vector.getEntry(i)*a[i];
+      }
+      return (long)Math.floor(ret/w);
+   } 
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
new file mode 100644
index 0000000..79bf7e5
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * A locality sensitive hash associated with the L1 metric.  This uses a 1-stable distribution
+ * to construct the hash.
+ * 
+ * @author cstella
+ *
+ */
+public class L1LSH extends AbstractStableDistributionFunction implements Sampler
+{
+  /**
+   * Constructs a new instance.
+   * @throws MathException 
+   */
+  public L1LSH(int dim, double d, RandomGenerator rand) throws MathException {
+    super(dim, d, rand);
+  }
+
+  /**
+   * Draw a sample s ~ Cauchy(0,1), which is 1-stable.
+   * 
+   * @return a sample from a cauchy distribution with median 0 and scale 1
+   */
+  public double sample(RandomDataImpl randomData) throws MathException {
+    
+    return randomData.nextCauchy(0, 1);
+    
+  }
+  @Override
+  protected Sampler getSampler() {
+    return this;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
new file mode 100644
index 0000000..d18b189
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.hash.lsh.p_stable;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.random.RandomDataImpl;
+import org.apache.commons.math.random.RandomGenerator;
+
+import datafu.pig.hash.lsh.interfaces.Sampler;
+
+/**
+ * A locality sensitive hash associated with the L2 metric.  This uses a 2-stable distribution
+ * to construct the hash.
+ * 
+ * @author cstella
+ *
+ */
+public class L2LSH extends AbstractStableDistributionFunction implements Sampler {
+
+  /**
+   * Constructs a new instance.
+   * @throws MathException 
+   */
+  public L2LSH(int dim, double w, RandomGenerator rand) throws MathException {
+    super(dim, w, rand);
+  }
+
+  /**
+   * Draw a sample s ~ Gaussian(0,1), which is 2-stable.
+   * 
+   * @return a sample from a Gaussian distribution with mu of 0 and sigma of 1
+   */
+   public double sample(RandomDataImpl randomData)
+     {
+           return randomData.nextGaussian(0,1); 
+     }
+
+  @Override
+  protected Sampler getSampler() {
+    return this;
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
new file mode 100644
index 0000000..ec9c313
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} for 
+ * {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>}.
+ * 
+ * See Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
+ * 
+ */
+package datafu.pig.hash.lsh.p_stable;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/544e5af0/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
new file mode 100644
index 0000000..045ed0d
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * UDFs for {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}
+ */
+package datafu.pig.hash.lsh;