You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/11/27 09:03:48 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-56][DOC] Add
documentation about Similarity/Distance functions
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 5113ee0 [HIVEMALL-56][DOC] Add documentation about Similarity/Distance functions
5113ee0 is described below
commit 5113ee00fd420449c42ca5385085a67d1195575f
Author: Makoto Yui <my...@apache.org>
AuthorDate: Wed Nov 27 18:03:41 2019 +0900
[HIVEMALL-56][DOC] Add documentation about Similarity/Distance functions
## What changes were proposed in this pull request?
Add documentation about Similarity/Distance functions
## What type of PR is it?
Documentation
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-56
## Checklist
Author: Makoto Yui <my...@apache.org>
Closes #222 from myui/HIVEMALL-56.
---
.../hivemall/knn/distance/AngularDistanceUDF.java | 36 ++-
.../hivemall/knn/distance/CosineDistanceUDF.java | 32 ++-
.../hivemall/knn/distance/EuclidDistanceUDF.java | 32 ++-
.../hivemall/knn/distance/HammingDistanceUDF.java | 11 +-
.../hivemall/knn/distance/JaccardDistanceUDF.java | 12 +-
.../hivemall/knn/distance/KLDivergenceUDF.java | 3 +-
.../knn/distance/ManhattanDistanceUDF.java | 36 ++-
.../knn/distance/MinkowskiDistanceUDF.java | 39 ++-
.../java/hivemall/knn/distance/PopcountUDF.java | 10 +-
.../knn/similarity/AngularSimilarityUDF.java | 37 ++-
.../knn/similarity/CosineSimilarityUDF.java | 35 ++-
.../knn/similarity/Distance2SimilarityUDF.java | 2 +-
.../hivemall/knn/similarity/EuclidSimilarity.java | 35 ++-
.../hivemall/knn/similarity/JaccardIndexUDF.java | 32 ++-
docs/gitbook/misc/funcs.md | 302 ++++++++++++++++++++-
15 files changed, 624 insertions(+), 30 deletions(-)
diff --git a/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
index c2f864f..05abae0 100644
--- a/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
@@ -37,8 +37,38 @@ import org.apache.hadoop.io.FloatWritable;
/**
* @see http://en.wikipedia.org/wiki/Cosine_similarity#Angular_similarity
*/
+//@formatter:off
@Description(name = "angular_distance",
- value = "_FUNC_(ftvec1, ftvec2) - Returns an angular distance of the given two vectors")
+ value = "_FUNC_(ftvec1, ftvec2) - Returns an angular distance of the given two vectors",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " angular_distance(l.features, r.features) as distance,\n" +
+ " distance2similarity(angular_distance(l.features, r.features)) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " distance asc;\n" +
+ "\n" +
+ "doc1 doc2 distance similarity\n" +
+ "1 3 0.31678355 0.75942624\n" +
+ "1 2 0.33333337 0.75\n" +
+ "2 3 0.09841931 0.91039914\n" +
+ "2 1 0.33333337 0.75\n" +
+ "3 2 0.09841931 0.91039914\n" +
+ "3 1 0.31678355 0.75942624")
+//@formatter:on
@UDFType(deterministic = true, stateful = false)
public final class AngularDistanceUDF extends GenericUDF {
@@ -49,8 +79,8 @@ public final class AngularDistanceUDF extends GenericUDF {
if (argOIs.length != 2) {
throw new UDFArgumentException("angular_distance takes 2 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
index 8c36660..59f88c6 100644
--- a/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
@@ -37,9 +37,39 @@ import org.apache.hadoop.io.FloatWritable;
/**
* @link http://en.wikipedia.org/wiki/Cosine_similarity
*/
+//@formatter:off
@Description(name = "cosine_distance",
- value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine distance of the given two vectors")
+ value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine distance of the given two vectors",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " cosine_distance(l.features, r.features) as distance,\n" +
+ " distance2similarity(cosine_distance(l.features, r.features)) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " distance asc;\n" +
+ "\n" +
+ "doc1 doc2 distance similarity\n" +
+ "1 3 0.45566893 0.6869694\n" +
+ "1 2 0.5 0.6666667\n" +
+ "2 3 0.04742068 0.95472616\n" +
+ "2 1 0.5 0.6666667\n" +
+ "3 2 0.04742068 0.95472616\n" +
+ "3 1 0.45566893 0.6869694")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class CosineDistanceUDF extends GenericUDF {
private ListObjectInspector arg0ListOI, arg1ListOI;
diff --git a/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
index ca06fcb..b36b893 100644
--- a/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
@@ -36,10 +36,40 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "euclid_distance",
value = "_FUNC_(ftvec1, ftvec2) - Returns the square root of the sum of the squared differences"
- + ": sqrt(sum((x - y)^2))")
+ + ": sqrt(sum((x - y)^2))",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " euclid_distance(l.features, r.features) as distance,\n" +
+ " distance2similarity(euclid_distance(l.features, r.features)) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " distance asc;\n" +
+ "\n" +
+ "doc1 doc2 distance similarity\n" +
+ "1 2 2.4494898 0.28989795\n" +
+ "1 3 2.6457512 0.2742919\n" +
+ "2 3 1.0 0.5\n" +
+ "2 1 2.4494898 0.28989795\n" +
+ "3 2 1.0 0.5\n" +
+ "3 1 2.6457512 0.2742919")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class EuclidDistanceUDF extends GenericUDF {
private ListObjectInspector arg0ListOI, arg1ListOI;
diff --git a/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
index c420d8b..931f6e9 100644
--- a/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
@@ -29,9 +29,18 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
+//@formatter:off
@Description(name = "hamming_distance",
- value = "_FUNC_(A, B [,int k]) - Returns Hamming distance between A and B")
+ value = "_FUNC_(integer A, integer B) - Returns Hamming distance between A and B",
+ extended = "select \n" +
+ " hamming_distance(0,3) as c1, \n" +
+ " hamming_distance(\"0\",\"3\") as c2 -- 0=0b00, 3=0b11\n" +
+ ";\n" +
+ "\n" +
+ "c1 c2\n" +
+ "2 2")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public class HammingDistanceUDF extends UDF {
public IntWritable evaluate(long a, long b) {
diff --git a/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
index 70c281e..48330dc 100644
--- a/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
@@ -28,9 +28,19 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "jaccard_distance",
- value = "_FUNC_(A, B [,int k]) - Returns Jaccard distance between A and B")
+ value = "_FUNC_(integer A, integer B [,int k=128]) - Returns Jaccard distance between A and B",
+ extended = "select \n" +
+ " jaccard_distance(0,3) as c1, \n" +
+ " jaccard_distance(\"0\",\"3\") as c2, -- 0=0b00, 3=0b11\n" +
+ " jaccard_distance(0,4) as c3\n" +
+ ";\n" +
+ "\n" +
+ "c1 c2 c3\n" +
+ "0.03125 0.03125 0.015625")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class JaccardDistanceUDF extends UDF {
private final Set<Object> union = new HashSet<Object>();
diff --git a/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java b/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
index 5b68be3..6c1d6fa 100644
--- a/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
@@ -24,9 +24,10 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
-@Description(name = "kld", value = "_FUNC_(double m1, double sigma1, double mu2, double sigma 2)"
+@Description(name = "kld", value = "_FUNC_(double mu1, double sigma1, double mu2, double sigma2)"
+ " - Returns KL divergence between two distributions")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class KLDivergenceUDF extends UDF {
public DoubleWritable evaluate(double mu1, double sigma1, double mu2, double sigma2) {
diff --git a/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
index 4ab258d..c35ff8d 100644
--- a/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
@@ -36,8 +36,38 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
-@Description(name = "manhattan_distance", value = "_FUNC_(list x, list y) - Returns sum(|x - y|)")
+//@formatter:off
+@Description(name = "manhattan_distance", value = "_FUNC_(list x, list y) - Returns sum(|x - y|)",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " manhattan_distance(l.features, r.features) as distance,\n" +
+ " distance2similarity(manhattan_distance(l.features, r.features)) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " distance asc;\n" +
+ "\n" +
+ "doc1 doc2 distance similarity\n" +
+ "1 2 4.0 0.2\n" +
+ "1 3 5.0 0.16666667\n" +
+ "2 3 1.0 0.5\n" +
+ "2 1 4.0 0.2\n" +
+ "3 2 1.0 0.5\n" +
+ "3 1 5.0 0.16666667")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class ManhattanDistanceUDF extends GenericUDF {
private ListObjectInspector arg0ListOI, arg1ListOI;
@@ -47,8 +77,8 @@ public final class ManhattanDistanceUDF extends GenericUDF {
if (argOIs.length != 2) {
throw new UDFArgumentException("manhattan_distance takes 2 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
index b0282b8..32fe637 100644
--- a/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
@@ -36,9 +36,42 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "minkowski_distance",
- value = "_FUNC_(list x, list y, double p) - Returns sum(|x - y|^p)^(1/p)")
+ value = "_FUNC_(list x, list y, double p) - Returns sum(|x - y|^p)^(1/p)",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " minkowski_distance(l.features, r.features, 1) as distance1, -- p=1 (manhattan_distance)\n" +
+ " minkowski_distance(l.features, r.features, 2) as distance2, -- p=2 (euclid_distance)\n" +
+ " minkowski_distance(l.features, r.features, 3) as distance3, -- p=3\n" +
+ " manhattan_distance(l.features, r.features) as manhattan_distance,\n" +
+ " euclid_distance(l.features, r.features) as euclid_distance\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " distance1 asc;\n" +
+ "\n" +
+ "doc1 doc2 distance1 distance2 distance3 manhattan_distance euclid_distance\n" +
+ "1 2 4.0 2.4494898 2.1544347 4.0 2.4494898\n" +
+ "1 3 5.0 2.6457512 2.2239802 5.0 2.6457512\n" +
+ "2 3 1.0 1.0 1.0 1.0 1.0\n" +
+ "2 1 4.0 2.4494898 2.1544347 4.0 2.4494898\n" +
+ "3 2 1.0 1.0 1.0 1.0 1.0\n" +
+ "3 1 5.0 2.6457512 2.2239802 5.0 2.6457512")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class MinkowskiDistanceUDF extends GenericUDF {
private ListObjectInspector arg0ListOI, arg1ListOI;
@@ -49,8 +82,8 @@ public final class MinkowskiDistanceUDF extends GenericUDF {
if (argOIs.length != 3) {
throw new UDFArgumentException("minkowski_distance takes 3 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
this.order_p = HiveUtils.getAsConstDouble(argOIs[2]);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
diff --git a/core/src/main/java/hivemall/knn/distance/PopcountUDF.java b/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
index 0d99d4d..57106b9 100644
--- a/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
@@ -28,8 +28,16 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.IntWritable;
-@Description(name = "popcnt", value = "_FUNC_(a [, b]) - Returns a popcount value")
+//@formatter:off
+@Description(name = "popcnt", value = "_FUNC_(a [, b]) - Returns a popcount value",
+ extended = "select \n" +
+ " popcnt(3),\n" +
+ " popcnt(\"3\"), -- 3=0b11\n" +
+ " popcnt(array(1,3));\n" +
+ "\n" +
+ "2 2 3")
@UDFType(deterministic = true, stateful = false)
+//@formatter:on
public final class PopcountUDF extends UDF {
public IntWritable evaluate(long a) {
diff --git a/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
index 1e881fd..e381d36 100644
--- a/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
@@ -36,8 +36,37 @@ import org.apache.hadoop.io.FloatWritable;
/**
* @see http://en.wikipedia.org/wiki/Cosine_similarity#Angular_similarity
*/
+//@formatter:off
@Description(name = "angular_similarity",
- value = "_FUNC_(ftvec1, ftvec2) - Returns an angular similarity of the given two vectors")
+ value = "_FUNC_(ftvec1, ftvec2) - Returns an angular similarity of the given two vectors",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " angular_similarity(l.features, r.features) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " similarity desc;\n" +
+ "\n" +
+ "doc1 doc2 similarity\n" +
+ "1 3 0.68321645\n" +
+ "1 2 0.6666666\n" +
+ "2 3 0.9015807\n" +
+ "2 1 0.6666666\n" +
+ "3 2 0.9015807\n" +
+ "3 1 0.68321645")
+//@formatter:on
@UDFType(deterministic = true, stateful = false)
public final class AngularSimilarityUDF extends GenericUDF {
@@ -45,11 +74,11 @@ public final class AngularSimilarityUDF extends GenericUDF {
@Override
public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
- if (argOIs.length != 2) {
+ if (argOIs.length != 2) {
throw new UDFArgumentException("angular_similarity takes 2 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
index 163ec70..469850c 100644
--- a/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
@@ -36,8 +36,37 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "cosine_similarity",
- value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine similarity of the given two vectors")
+ value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine similarity of the given two vectors",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " cosine_similarity(l.features, r.features) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " similarity desc;\n" +
+ "\n" +
+ "doc1 doc2 similarity\n" +
+ "1 3 0.5443311\n" +
+ "1 2 0.5\n" +
+ "2 3 0.9525793\n" +
+ "2 1 0.5\n" +
+ "3 2 0.9525793\n" +
+ "3 1 0.5443311")
+//@formatter:on
@UDFType(deterministic = true, stateful = false)
public final class CosineSimilarityUDF extends GenericUDF {
@@ -48,8 +77,8 @@ public final class CosineSimilarityUDF extends GenericUDF {
if (argOIs.length != 2) {
throw new UDFArgumentException("cosine_similarity takes 2 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
index 9186b80..64fef74 100644
--- a/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
@@ -44,7 +44,7 @@ public final class Distance2SimilarityUDF extends GenericUDF {
if (argOIs.length != 1) {
throw new UDFArgumentException("distance2similarity takes 1 argument");
}
- this.distanceOI = HiveUtils.asDoubleCompatibleOI(argOIs[0]);
+ this.distanceOI = HiveUtils.asDoubleCompatibleOI(argOIs, 0);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java b/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
index fad06a5..9494e21 100644
--- a/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
+++ b/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
@@ -34,9 +34,38 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "euclid_similarity",
value = "_FUNC_(ftvec1, ftvec2) - Returns a euclid distance based similarity"
- + ", which is `1.0 / (1.0 + distance)`, of the given two vectors")
+ + ", which is `1.0 / (1.0 + distance)`, of the given two vectors",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " euclid_similarity(l.features, r.features) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " similarity desc;\n" +
+ "\n" +
+ "doc1 doc2 similarity\n" +
+ "1 2 0.28989795\n" +
+ "1 3 0.2742919\n" +
+ "2 3 0.5\n" +
+ "2 1 0.28989795\n" +
+ "3 2 0.5\n" +
+ "3 1 0.2742919")
+//@formatter:on
@UDFType(deterministic = true, stateful = false)
public final class EuclidSimilarity extends GenericUDF {
@@ -47,8 +76,8 @@ public final class EuclidSimilarity extends GenericUDF {
if (argOIs.length != 2) {
throw new UDFArgumentException("euclid_similarity takes 2 arguments");
}
- this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
- this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+ this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+ this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
}
diff --git a/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java b/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
index f453f82..dfce04d 100644
--- a/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
@@ -19,6 +19,7 @@
package hivemall.knn.similarity;
import static hivemall.utils.hadoop.WritableUtils.val;
+
import hivemall.knn.distance.HammingDistanceUDF;
import java.math.BigInteger;
@@ -31,8 +32,37 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.FloatWritable;
+//@formatter:off
@Description(name = "jaccard_similarity",
- value = "_FUNC_(A, B [,int k]) - Returns Jaccard similarity coefficient of A and B")
+ value = "_FUNC_(A, B [,int k]) - Returns Jaccard similarity coefficient of A and B",
+ extended = "WITH docs as (\n" +
+ " select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" +
+ " union all\n" +
+ " select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ " union all\n" +
+ " select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" +
+ ") \n" +
+ "select\n" +
+ " l.docid as doc1,\n" +
+ " r.docid as doc2,\n" +
+ " jaccard_similarity(l.features, r.features) as similarity\n" +
+ "from \n" +
+ " docs l\n" +
+ " CROSS JOIN docs r\n" +
+ "where\n" +
+ " l.docid != r.docid\n" +
+ "order by \n" +
+ " doc1 asc,\n" +
+ " similarity desc;\n" +
+ "\n" +
+ "doc1 doc2 similarity\n" +
+ "1 2 0.14285715\n" +
+ "1 3 0.0\n" +
+ "2 3 0.6\n" +
+ "2 1 0.14285715\n" +
+ "3 2 0.6\n" +
+ "3 1 0.0")
+//@formatter:on
@UDFType(deterministic = true, stateful = false)
public final class JaccardIndexUDF extends UDF {
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index 3e5f92b..e04713b 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -424,22 +424,202 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
# Distance measures
- `angular_distance(ftvec1, ftvec2)` - Returns an angular distance of the given two vectors
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ angular_distance(l.features, r.features) as distance,
+ distance2similarity(angular_distance(l.features, r.features)) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ distance asc;
+
+ doc1 doc2 distance similarity
+ 1 3 0.31678355 0.75942624
+ 1 2 0.33333337 0.75
+ 2 3 0.09841931 0.91039914
+ 2 1 0.33333337 0.75
+ 3 2 0.09841931 0.91039914
+ 3 1 0.31678355 0.75942624
+ ```
- `cosine_distance(ftvec1, ftvec2)` - Returns a cosine distance of the given two vectors
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ cosine_distance(l.features, r.features) as distance,
+ distance2similarity(cosine_distance(l.features, r.features)) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ distance asc;
+
+ doc1 doc2 distance similarity
+ 1 3 0.45566893 0.6869694
+ 1 2 0.5 0.6666667
+ 2 3 0.04742068 0.95472616
+ 2 1 0.5 0.6666667
+ 3 2 0.04742068 0.95472616
+ 3 1 0.45566893 0.6869694
+ ```
- `euclid_distance(ftvec1, ftvec2)` - Returns the square root of the sum of the squared differences: sqrt(sum((x - y)^2))
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ euclid_distance(l.features, r.features) as distance,
+ distance2similarity(euclid_distance(l.features, r.features)) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ distance asc;
+
+ doc1 doc2 distance similarity
+ 1 2 2.4494898 0.28989795
+ 1 3 2.6457512 0.2742919
+ 2 3 1.0 0.5
+ 2 1 2.4494898 0.28989795
+ 3 2 1.0 0.5
+ 3 1 2.6457512 0.2742919
+ ```
+
+- `hamming_distance(integer A, integer B)` - Returns Hamming distance between A and B
+ ```sql
+ select
+ hamming_distance(0,3) as c1,
+ hamming_distance("0","3") as c2 -- 0=0b00, 3=0b11
+ ;
-- `hamming_distance(A, B [,int k])` - Returns Hamming distance between A and B
+ c1 c2
+ 2 2
+ ```
+
+- `jaccard_distance(integer A, integer B [,int k=128])` - Returns Jaccard distance between A and B
+ ```sql
+ select
+ jaccard_distance(0,3) as c1,
+ jaccard_distance("0","3") as c2, -- 0=0b00, 3=0b11
+ jaccard_distance(0,4) as c3
+ ;
-- `jaccard_distance(A, B [,int k])` - Returns Jaccard distance between A and B
+ c1 c2 c3
+ 0.03125 0.03125 0.015625
+ ```
-- `kld(double m1, double sigma1, double mu2, double sigma 2)` - Returns KL divergence between two distributions
+- `kld(double mu1, double sigma1, double mu2, double sigma2)` - Returns KL divergence between two distributions
- `manhattan_distance(list x, list y)` - Returns sum(|x - y|)
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ manhattan_distance(l.features, r.features) as distance,
+ distance2similarity(manhattan_distance(l.features, r.features)) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ distance asc;
+
+ doc1 doc2 distance similarity
+ 1 2 4.0 0.2
+ 1 3 5.0 0.16666667
+ 2 3 1.0 0.5
+ 2 1 4.0 0.2
+ 3 2 1.0 0.5
+ 3 1 5.0 0.16666667
+ ```
- `minkowski_distance(list x, list y, double p)` - Returns sum(|x - y|^p)^(1/p)
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ minkowski_distance(l.features, r.features, 1) as distance1, -- p=1 (manhattan_distance)
+ minkowski_distance(l.features, r.features, 2) as distance2, -- p=2 (euclid_distance)
+ minkowski_distance(l.features, r.features, 3) as distance3, -- p=3
+ manhattan_distance(l.features, r.features) as manhattan_distance,
+ euclid_distance(l.features, r.features) as euclid_distance
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ distance1 asc;
+
+ doc1 doc2 distance1 distance2 distance3 manhattan_distance euclid_distance
+ 1 2 4.0 2.4494898 2.1544347 4.0 2.4494898
+ 1 3 5.0 2.6457512 2.2239802 5.0 2.6457512
+ 2 3 1.0 1.0 1.0 1.0 1.0
+ 2 1 4.0 2.4494898 2.1544347 4.0 2.4494898
+ 3 2 1.0 1.0 1.0 1.0 1.0
+ 3 1 5.0 2.6457512 2.2239802 5.0 2.6457512
+ ```
- `popcnt(a [, b])` - Returns a popcount value
+ ```sql
+ select
+ popcnt(3),
+ popcnt("3"), -- 3=0b11
+ popcnt(array(1,3));
+
+ 2 2 3
+ ```
# Locality-sensitive hashing
@@ -452,16 +632,132 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
# Similarity measures
- `angular_similarity(ftvec1, ftvec2)` - Returns an angular similarity of the given two vectors
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ angular_similarity(l.features, r.features) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ similarity desc;
+
+ doc1 doc2 similarity
+ 1 3 0.68321645
+ 1 2 0.6666666
+ 2 3 0.9015807
+ 2 1 0.6666666
+ 3 2 0.9015807
+ 3 1 0.68321645
+ ```
- `cosine_similarity(ftvec1, ftvec2)` - Returns a cosine similarity of the given two vectors
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ cosine_similarity(l.features, r.features) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ similarity desc;
+
+ doc1 doc2 similarity
+ 1 3 0.5443311
+ 1 2 0.5
+ 2 3 0.9525793
+ 2 1 0.5
+ 3 2 0.9525793
+ 3 1 0.5443311
+ ```
- `dimsum_mapper(array<string> row, map<int col_id, double norm> colNorms [, const string options])` - Returns column-wise partial similarities
- `distance2similarity(float d)` - Returns 1.0 / (1.0 + d)
- `euclid_similarity(ftvec1, ftvec2)` - Returns a euclid distance based similarity, which is `1.0 / (1.0 + distance)`, of the given two vectors
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ euclid_similarity(l.features, r.features) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ similarity desc;
+
+ doc1 doc2 similarity
+ 1 2 0.28989795
+ 1 3 0.2742919
+ 2 3 0.5
+ 2 1 0.28989795
+ 3 2 0.5
+ 3 1 0.2742919
+ ```
- `jaccard_similarity(A, B [,int k])` - Returns Jaccard similarity coefficient of A and B
+ ```sql
+ WITH docs as (
+ select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+ union all
+ select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ union all
+ select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+ )
+ select
+ l.docid as doc1,
+ r.docid as doc2,
+ jaccard_similarity(l.features, r.features) as similarity
+ from
+ docs l
+ CROSS JOIN docs r
+ where
+ l.docid != r.docid
+ order by
+ doc1 asc,
+ similarity desc;
+
+ doc1 doc2 similarity
+ 1 2 0.14285715
+ 1 3 0.0
+ 2 3 0.6
+ 2 1 0.14285715
+ 3 2 0.6
+ 3 1 0.0
+ ```
# Evaluation