You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/11/27 09:03:48 UTC

[incubator-hivemall] branch master updated: [HIVEMALL-56][DOC] Add documentation about Similarity/Distance functions

This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new 5113ee0  [HIVEMALL-56][DOC] Add documentation about Similarity/Distance functions
5113ee0 is described below

commit 5113ee00fd420449c42ca5385085a67d1195575f
Author: Makoto Yui <my...@apache.org>
AuthorDate: Wed Nov 27 18:03:41 2019 +0900

    [HIVEMALL-56][DOC] Add documentation about Similarity/Distance functions
    
    ## What changes were proposed in this pull request?
    
    Add documentation about Similarity/Distance functions
    
    ## What type of PR is it?
    
    Documentation
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-56
    
    ## Checklist
    
    Author: Makoto Yui <my...@apache.org>
    
    Closes #222 from myui/HIVEMALL-56.
---
 .../hivemall/knn/distance/AngularDistanceUDF.java  |  36 ++-
 .../hivemall/knn/distance/CosineDistanceUDF.java   |  32 ++-
 .../hivemall/knn/distance/EuclidDistanceUDF.java   |  32 ++-
 .../hivemall/knn/distance/HammingDistanceUDF.java  |  11 +-
 .../hivemall/knn/distance/JaccardDistanceUDF.java  |  12 +-
 .../hivemall/knn/distance/KLDivergenceUDF.java     |   3 +-
 .../knn/distance/ManhattanDistanceUDF.java         |  36 ++-
 .../knn/distance/MinkowskiDistanceUDF.java         |  39 ++-
 .../java/hivemall/knn/distance/PopcountUDF.java    |  10 +-
 .../knn/similarity/AngularSimilarityUDF.java       |  37 ++-
 .../knn/similarity/CosineSimilarityUDF.java        |  35 ++-
 .../knn/similarity/Distance2SimilarityUDF.java     |   2 +-
 .../hivemall/knn/similarity/EuclidSimilarity.java  |  35 ++-
 .../hivemall/knn/similarity/JaccardIndexUDF.java   |  32 ++-
 docs/gitbook/misc/funcs.md                         | 302 ++++++++++++++++++++-
 15 files changed, 624 insertions(+), 30 deletions(-)

diff --git a/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
index c2f864f..05abae0 100644
--- a/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/AngularDistanceUDF.java
@@ -37,8 +37,38 @@ import org.apache.hadoop.io.FloatWritable;
 /**
  * @see http://en.wikipedia.org/wiki/Cosine_similarity#Angular_similarity
  */
+//@formatter:off
 @Description(name = "angular_distance",
-        value = "_FUNC_(ftvec1, ftvec2) - Returns an angular distance of the given two vectors")
+        value = "_FUNC_(ftvec1, ftvec2) - Returns an angular distance of the given two vectors",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  angular_distance(l.features, r.features) as distance,\n" + 
+                "  distance2similarity(angular_distance(l.features, r.features)) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  distance asc;\n" + 
+                "\n" + 
+                "doc1    doc2    distance        similarity\n" + 
+                "1       3       0.31678355      0.75942624\n" + 
+                "1       2       0.33333337      0.75\n" + 
+                "2       3       0.09841931      0.91039914\n" + 
+                "2       1       0.33333337      0.75\n" + 
+                "3       2       0.09841931      0.91039914\n" + 
+                "3       1       0.31678355      0.75942624")
+//@formatter:on
 @UDFType(deterministic = true, stateful = false)
 public final class AngularDistanceUDF extends GenericUDF {
 
@@ -49,8 +79,8 @@ public final class AngularDistanceUDF extends GenericUDF {
         if (argOIs.length != 2) {
             throw new UDFArgumentException("angular_distance takes 2 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
diff --git a/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
index 8c36660..59f88c6 100644
--- a/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/CosineDistanceUDF.java
@@ -37,9 +37,39 @@ import org.apache.hadoop.io.FloatWritable;
 /**
  * @link http://en.wikipedia.org/wiki/Cosine_similarity
  */
+//@formatter:off
 @Description(name = "cosine_distance",
-        value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine distance of the given two vectors")
+        value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine distance of the given two vectors",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  cosine_distance(l.features, r.features) as distance,\n" + 
+                "  distance2similarity(cosine_distance(l.features, r.features)) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  distance asc;\n" + 
+                "\n" + 
+                "doc1    doc2    distance        similarity\n" + 
+                "1       3       0.45566893      0.6869694\n" + 
+                "1       2       0.5     0.6666667\n" + 
+                "2       3       0.04742068      0.95472616\n" + 
+                "2       1       0.5     0.6666667\n" + 
+                "3       2       0.04742068      0.95472616\n" + 
+                "3       1       0.45566893      0.6869694")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class CosineDistanceUDF extends GenericUDF {
 
     private ListObjectInspector arg0ListOI, arg1ListOI;
diff --git a/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
index ca06fcb..b36b893 100644
--- a/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/EuclidDistanceUDF.java
@@ -36,10 +36,40 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "euclid_distance",
         value = "_FUNC_(ftvec1, ftvec2) - Returns the square root of the sum of the squared differences"
-                + ": sqrt(sum((x - y)^2))")
+                + ": sqrt(sum((x - y)^2))",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  euclid_distance(l.features, r.features) as distance,\n" + 
+                "  distance2similarity(euclid_distance(l.features, r.features)) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  distance asc;\n" + 
+                "\n" + 
+                "doc1    doc2    distance        similarity\n" + 
+                "1       2       2.4494898       0.28989795\n" + 
+                "1       3       2.6457512       0.2742919\n" + 
+                "2       3       1.0     0.5\n" + 
+                "2       1       2.4494898       0.28989795\n" + 
+                "3       2       1.0     0.5\n" + 
+                "3       1       2.6457512       0.2742919")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class EuclidDistanceUDF extends GenericUDF {
 
     private ListObjectInspector arg0ListOI, arg1ListOI;
diff --git a/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
index c420d8b..931f6e9 100644
--- a/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/HammingDistanceUDF.java
@@ -29,9 +29,18 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 
+//@formatter:off
 @Description(name = "hamming_distance",
-        value = "_FUNC_(A, B [,int k]) - Returns Hamming distance between A and B")
+        value = "_FUNC_(integer A, integer B) - Returns Hamming distance between A and B",
+        extended = "select \n" + 
+                "  hamming_distance(0,3) as c1, \n" + 
+                "  hamming_distance(\"0\",\"3\") as c2 -- 0=0b00, 3=0b11\n" + 
+                ";\n" + 
+                "\n" + 
+                "c1      c2\n" + 
+                "2       2")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public class HammingDistanceUDF extends UDF {
 
     public IntWritable evaluate(long a, long b) {
diff --git a/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
index 70c281e..48330dc 100644
--- a/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/JaccardDistanceUDF.java
@@ -28,9 +28,19 @@ import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "jaccard_distance",
-        value = "_FUNC_(A, B [,int k]) - Returns Jaccard distance between A and B")
+        value = "_FUNC_(integer A, integer B [,int k=128]) - Returns Jaccard distance between A and B",
+        extended = "select \n" + 
+                "  jaccard_distance(0,3) as c1, \n" + 
+                "  jaccard_distance(\"0\",\"3\") as c2, -- 0=0b00, 3=0b11\n" + 
+                "  jaccard_distance(0,4) as c3\n" + 
+                ";\n" + 
+                "\n" + 
+                "c1      c2      c3\n" + 
+                "0.03125 0.03125 0.015625")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class JaccardDistanceUDF extends UDF {
 
     private final Set<Object> union = new HashSet<Object>();
diff --git a/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java b/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
index 5b68be3..6c1d6fa 100644
--- a/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/KLDivergenceUDF.java
@@ -24,9 +24,10 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 import org.apache.hadoop.io.FloatWritable;
 
-@Description(name = "kld", value = "_FUNC_(double m1, double sigma1, double mu2, double sigma 2)"
+@Description(name = "kld", value = "_FUNC_(double mu1, double sigma1, double mu2, double sigma2)"
         + " - Returns KL divergence between two distributions")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class KLDivergenceUDF extends UDF {
 
     public DoubleWritable evaluate(double mu1, double sigma1, double mu2, double sigma2) {
diff --git a/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
index 4ab258d..c35ff8d 100644
--- a/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/ManhattanDistanceUDF.java
@@ -36,8 +36,38 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.FloatWritable;
 
-@Description(name = "manhattan_distance", value = "_FUNC_(list x, list y) - Returns sum(|x - y|)")
+//@formatter:off
+@Description(name = "manhattan_distance", value = "_FUNC_(list x, list y) - Returns sum(|x - y|)",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  manhattan_distance(l.features, r.features) as distance,\n" + 
+                "  distance2similarity(manhattan_distance(l.features, r.features)) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  distance asc;\n" + 
+                "\n" + 
+                "doc1    doc2    distance        similarity\n" + 
+                "1       2       4.0     0.2\n" + 
+                "1       3       5.0     0.16666667\n" + 
+                "2       3       1.0     0.5\n" + 
+                "2       1       4.0     0.2\n" + 
+                "3       2       1.0     0.5\n" + 
+                "3       1       5.0     0.16666667")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class ManhattanDistanceUDF extends GenericUDF {
 
     private ListObjectInspector arg0ListOI, arg1ListOI;
@@ -47,8 +77,8 @@ public final class ManhattanDistanceUDF extends GenericUDF {
         if (argOIs.length != 2) {
             throw new UDFArgumentException("manhattan_distance takes 2 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
diff --git a/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java b/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
index b0282b8..32fe637 100644
--- a/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/MinkowskiDistanceUDF.java
@@ -36,9 +36,42 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "minkowski_distance",
-        value = "_FUNC_(list x, list y, double p) - Returns sum(|x - y|^p)^(1/p)")
+        value = "_FUNC_(list x, list y, double p) - Returns sum(|x - y|^p)^(1/p)", 
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  minkowski_distance(l.features, r.features, 1) as distance1, -- p=1 (manhattan_distance)\n" + 
+                "  minkowski_distance(l.features, r.features, 2) as distance2, -- p=2 (euclid_distance)\n" + 
+                "  minkowski_distance(l.features, r.features, 3) as distance3, -- p=3\n" + 
+                "  manhattan_distance(l.features, r.features) as manhattan_distance,\n" + 
+                "  euclid_distance(l.features, r.features) as euclid_distance\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  distance1 asc;\n" + 
+                "\n" + 
+                "doc1    doc2    distance1       distance2       distance3       manhattan_distance      euclid_distance\n" + 
+                "1       2       4.0     2.4494898       2.1544347       4.0     2.4494898\n" + 
+                "1       3       5.0     2.6457512       2.2239802       5.0     2.6457512\n" + 
+                "2       3       1.0     1.0     1.0     1.0     1.0\n" + 
+                "2       1       4.0     2.4494898       2.1544347       4.0     2.4494898\n" + 
+                "3       2       1.0     1.0     1.0     1.0     1.0\n" + 
+                "3       1       5.0     2.6457512       2.2239802       5.0     2.6457512")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class MinkowskiDistanceUDF extends GenericUDF {
 
     private ListObjectInspector arg0ListOI, arg1ListOI;
@@ -49,8 +82,8 @@ public final class MinkowskiDistanceUDF extends GenericUDF {
         if (argOIs.length != 3) {
             throw new UDFArgumentException("minkowski_distance takes 3 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
         this.order_p = HiveUtils.getAsConstDouble(argOIs[2]);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
diff --git a/core/src/main/java/hivemall/knn/distance/PopcountUDF.java b/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
index 0d99d4d..57106b9 100644
--- a/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
+++ b/core/src/main/java/hivemall/knn/distance/PopcountUDF.java
@@ -28,8 +28,16 @@ import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.IntWritable;
 
-@Description(name = "popcnt", value = "_FUNC_(a [, b]) - Returns a popcount value")
+//@formatter:off
+@Description(name = "popcnt", value = "_FUNC_(a [, b]) - Returns a popcount value", 
+             extended = "select \n" + 
+                     "  popcnt(3),\n" + 
+                     "  popcnt(\"3\"),  -- 3=0b11\n" + 
+                     "  popcnt(array(1,3));\n" + 
+                     "\n" + 
+                     "2       2       3")
 @UDFType(deterministic = true, stateful = false)
+//@formatter:on
 public final class PopcountUDF extends UDF {
 
     public IntWritable evaluate(long a) {
diff --git a/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
index 1e881fd..e381d36 100644
--- a/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/AngularSimilarityUDF.java
@@ -36,8 +36,37 @@ import org.apache.hadoop.io.FloatWritable;
 /**
  * @see http://en.wikipedia.org/wiki/Cosine_similarity#Angular_similarity
  */
+//@formatter:off
 @Description(name = "angular_similarity",
-        value = "_FUNC_(ftvec1, ftvec2) - Returns an angular similarity of the given two vectors")
+        value = "_FUNC_(ftvec1, ftvec2) - Returns an angular similarity of the given two vectors",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  angular_similarity(l.features, r.features) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  similarity desc;\n" + 
+                "\n" + 
+                "doc1    doc2    similarity\n" + 
+                "1       3       0.68321645\n" + 
+                "1       2       0.6666666\n" + 
+                "2       3       0.9015807\n" + 
+                "2       1       0.6666666\n" + 
+                "3       2       0.9015807\n" + 
+                "3       1       0.68321645")
+//@formatter:on
 @UDFType(deterministic = true, stateful = false)
 public final class AngularSimilarityUDF extends GenericUDF {
 
@@ -45,11 +74,11 @@ public final class AngularSimilarityUDF extends GenericUDF {
 
     @Override
     public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
-        if (argOIs.length != 2) {
+        if (argOIs.length != 2) {            
             throw new UDFArgumentException("angular_similarity takes 2 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
diff --git a/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
index 163ec70..469850c 100644
--- a/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/CosineSimilarityUDF.java
@@ -36,8 +36,37 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "cosine_similarity",
-        value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine similarity of the given two vectors")
+        value = "_FUNC_(ftvec1, ftvec2) - Returns a cosine similarity of the given two vectors",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  cosine_similarity(l.features, r.features) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  similarity desc;\n" + 
+                "\n" + 
+                "doc1    doc2    similarity\n" + 
+                "1       3       0.5443311\n" + 
+                "1       2       0.5\n" + 
+                "2       3       0.9525793\n" + 
+                "2       1       0.5\n" + 
+                "3       2       0.9525793\n" + 
+                "3       1       0.5443311")
+//@formatter:on
 @UDFType(deterministic = true, stateful = false)
 public final class CosineSimilarityUDF extends GenericUDF {
 
@@ -48,8 +77,8 @@ public final class CosineSimilarityUDF extends GenericUDF {
         if (argOIs.length != 2) {
             throw new UDFArgumentException("cosine_similarity takes 2 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
diff --git a/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java b/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
index 9186b80..64fef74 100644
--- a/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/Distance2SimilarityUDF.java
@@ -44,7 +44,7 @@ public final class Distance2SimilarityUDF extends GenericUDF {
         if (argOIs.length != 1) {
             throw new UDFArgumentException("distance2similarity takes 1 argument");
         }
-        this.distanceOI = HiveUtils.asDoubleCompatibleOI(argOIs[0]);
+        this.distanceOI = HiveUtils.asDoubleCompatibleOI(argOIs, 0);
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
 
diff --git a/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java b/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
index fad06a5..9494e21 100644
--- a/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
+++ b/core/src/main/java/hivemall/knn/similarity/EuclidSimilarity.java
@@ -34,9 +34,38 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "euclid_similarity",
         value = "_FUNC_(ftvec1, ftvec2) - Returns a euclid distance based similarity"
-                + ", which is `1.0 / (1.0 + distance)`, of the given two vectors")
+                + ", which is `1.0 / (1.0 + distance)`, of the given two vectors",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  euclid_similarity(l.features, r.features) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  similarity desc;\n" + 
+                "\n" + 
+                "doc1    doc2    similarity\n" + 
+                "1       2       0.28989795\n" + 
+                "1       3       0.2742919\n" + 
+                "2       3       0.5\n" + 
+                "2       1       0.28989795\n" + 
+                "3       2       0.5\n" + 
+                "3       1       0.2742919")
+//@formatter:on
 @UDFType(deterministic = true, stateful = false)
 public final class EuclidSimilarity extends GenericUDF {
 
@@ -47,8 +76,8 @@ public final class EuclidSimilarity extends GenericUDF {
         if (argOIs.length != 2) {
             throw new UDFArgumentException("euclid_similarity takes 2 arguments");
         }
-        this.arg0ListOI = HiveUtils.asListOI(argOIs[0]);
-        this.arg1ListOI = HiveUtils.asListOI(argOIs[1]);
+        this.arg0ListOI = HiveUtils.asListOI(argOIs, 0);
+        this.arg1ListOI = HiveUtils.asListOI(argOIs, 1);
 
         return PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
     }
diff --git a/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java b/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
index f453f82..dfce04d 100644
--- a/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
+++ b/core/src/main/java/hivemall/knn/similarity/JaccardIndexUDF.java
@@ -19,6 +19,7 @@
 package hivemall.knn.similarity;
 
 import static hivemall.utils.hadoop.WritableUtils.val;
+
 import hivemall.knn.distance.HammingDistanceUDF;
 
 import java.math.BigInteger;
@@ -31,8 +32,37 @@ import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.FloatWritable;
 
+//@formatter:off
 @Description(name = "jaccard_similarity",
-        value = "_FUNC_(A, B [,int k]) - Returns Jaccard similarity coefficient of A and B")
+        value = "_FUNC_(A, B [,int k]) - Returns Jaccard similarity coefficient of A and B",
+        extended = "WITH docs as (\n" + 
+                "  select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features\n" + 
+                "  union all\n" + 
+                "  select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                "  union all\n" + 
+                "  select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features\n" + 
+                ") \n" + 
+                "select\n" + 
+                "  l.docid as doc1,\n" + 
+                "  r.docid as doc2,\n" + 
+                "  jaccard_similarity(l.features, r.features) as similarity\n" + 
+                "from \n" + 
+                "  docs l\n" + 
+                "  CROSS JOIN docs r\n" + 
+                "where\n" + 
+                "  l.docid != r.docid\n" + 
+                "order by \n" + 
+                "  doc1 asc,\n" + 
+                "  similarity desc;\n" + 
+                "\n" + 
+                "doc1    doc2    similarity\n" + 
+                "1       2       0.14285715\n" + 
+                "1       3       0.0\n" + 
+                "2       3       0.6\n" + 
+                "2       1       0.14285715\n" + 
+                "3       2       0.6\n" + 
+                "3       1       0.0")
+//@formatter:on
 @UDFType(deterministic = true, stateful = false)
 public final class JaccardIndexUDF extends UDF {
 
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index 3e5f92b..e04713b 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -424,22 +424,202 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
 # Distance measures
 
 - `angular_distance(ftvec1, ftvec2)` - Returns an angular distance of the given two vectors
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    angular_distance(l.features, r.features) as distance,
+    distance2similarity(angular_distance(l.features, r.features)) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    distance asc;
+
+  doc1    doc2    distance        similarity
+  1       3       0.31678355      0.75942624
+  1       2       0.33333337      0.75
+  2       3       0.09841931      0.91039914
+  2       1       0.33333337      0.75
+  3       2       0.09841931      0.91039914
+  3       1       0.31678355      0.75942624
+  ```
 
 - `cosine_distance(ftvec1, ftvec2)` - Returns a cosine distance of the given two vectors
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    cosine_distance(l.features, r.features) as distance,
+    distance2similarity(cosine_distance(l.features, r.features)) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    distance asc;
+
+  doc1    doc2    distance        similarity
+  1       3       0.45566893      0.6869694
+  1       2       0.5     0.6666667
+  2       3       0.04742068      0.95472616
+  2       1       0.5     0.6666667
+  3       2       0.04742068      0.95472616
+  3       1       0.45566893      0.6869694
+  ```
 
 - `euclid_distance(ftvec1, ftvec2)` - Returns the square root of the sum of the squared differences: sqrt(sum((x - y)^2))
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    euclid_distance(l.features, r.features) as distance,
+    distance2similarity(euclid_distance(l.features, r.features)) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    distance asc;
+
+  doc1    doc2    distance        similarity
+  1       2       2.4494898       0.28989795
+  1       3       2.6457512       0.2742919
+  2       3       1.0     0.5
+  2       1       2.4494898       0.28989795
+  3       2       1.0     0.5
+  3       1       2.6457512       0.2742919
+  ```
+
+- `hamming_distance(integer A, integer B)` - Returns Hamming distance between A and B
+  ```sql
+  select 
+    hamming_distance(0,3) as c1, 
+    hamming_distance("0","3") as c2 -- 0=0b00, 3=0b11
+  ;
 
-- `hamming_distance(A, B [,int k])` - Returns Hamming distance between A and B
+  c1      c2
+  2       2
+  ```
+
+- `jaccard_distance(integer A, integer B [,int k=128])` - Returns Jaccard distance between A and B
+  ```sql
+  select 
+    jaccard_distance(0,3) as c1, 
+    jaccard_distance("0","3") as c2, -- 0=0b00, 3=0b11
+    jaccard_distance(0,4) as c3
+  ;
 
-- `jaccard_distance(A, B [,int k])` - Returns Jaccard distance between A and B
+  c1      c2      c3
+  0.03125 0.03125 0.015625
+  ```
 
-- `kld(double m1, double sigma1, double mu2, double sigma 2)` - Returns KL divergence between two distributions
+- `kld(double mu1, double sigma1, double mu2, double sigma2)` - Returns KL divergence between two distributions
 
 - `manhattan_distance(list x, list y)` - Returns sum(|x - y|)
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    manhattan_distance(l.features, r.features) as distance,
+    distance2similarity(manhattan_distance(l.features, r.features)) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    distance asc;
+
+  doc1    doc2    distance        similarity
+  1       2       4.0     0.2
+  1       3       5.0     0.16666667
+  2       3       1.0     0.5
+  2       1       4.0     0.2
+  3       2       1.0     0.5
+  3       1       5.0     0.16666667
+  ```
 
 - `minkowski_distance(list x, list y, double p)` - Returns sum(|x - y|^p)^(1/p)
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    minkowski_distance(l.features, r.features, 1) as distance1, -- p=1 (manhattan_distance)
+    minkowski_distance(l.features, r.features, 2) as distance2, -- p=2 (euclid_distance)
+    minkowski_distance(l.features, r.features, 3) as distance3, -- p=3
+    manhattan_distance(l.features, r.features) as manhattan_distance,
+    euclid_distance(l.features, r.features) as euclid_distance
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    distance1 asc;
+
+  doc1    doc2    distance1       distance2       distance3       manhattan_distance      euclid_distance
+  1       2       4.0     2.4494898       2.1544347       4.0     2.4494898
+  1       3       5.0     2.6457512       2.2239802       5.0     2.6457512
+  2       3       1.0     1.0     1.0     1.0     1.0
+  2       1       4.0     2.4494898       2.1544347       4.0     2.4494898
+  3       2       1.0     1.0     1.0     1.0     1.0
+  3       1       5.0     2.6457512       2.2239802       5.0     2.6457512
+  ```
 
 - `popcnt(a [, b])` - Returns a popcount value
+  ```sql
+  select 
+    popcnt(3),
+    popcnt("3"),  -- 3=0b11
+    popcnt(array(1,3));
+
+  2       2       3
+  ```
 
 # Locality-sensitive hashing
 
@@ -452,16 +632,132 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
 # Similarity measures
 
 - `angular_similarity(ftvec1, ftvec2)` - Returns an angular similarity of the given two vectors
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    angular_similarity(l.features, r.features) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    similarity desc;
+
+  doc1    doc2    similarity
+  1       3       0.68321645
+  1       2       0.6666666
+  2       3       0.9015807
+  2       1       0.6666666
+  3       2       0.9015807
+  3       1       0.68321645
+  ```
 
 - `cosine_similarity(ftvec1, ftvec2)` - Returns a cosine similarity of the given two vectors
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    cosine_similarity(l.features, r.features) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    similarity desc;
+
+  doc1    doc2    similarity
+  1       3       0.5443311
+  1       2       0.5
+  2       3       0.9525793
+  2       1       0.5
+  3       2       0.9525793
+  3       1       0.5443311
+  ```
 
 - `dimsum_mapper(array<string> row, map<int col_id, double norm> colNorms [, const string options])` - Returns column-wise partial similarities
 
 - `distance2similarity(float d)` - Returns 1.0 / (1.0 + d)
 
 - `euclid_similarity(ftvec1, ftvec2)` - Returns a euclid distance based similarity, which is `1.0 / (1.0 + distance)`, of the given two vectors
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    euclid_similarity(l.features, r.features) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    similarity desc;
+
+  doc1    doc2    similarity
+  1       2       0.28989795
+  1       3       0.2742919
+  2       3       0.5
+  2       1       0.28989795
+  3       2       0.5
+  3       1       0.2742919
+  ```
 
 - `jaccard_similarity(A, B [,int k])` - Returns Jaccard similarity coefficient of A and B
+  ```sql
+  WITH docs as (
+    select 1 as docid, array('apple:1.0', 'orange:2.0', 'banana:1.0', 'kuwi:0') as features
+    union all
+    select 2 as docid, array('apple:1.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+    union all
+    select 3 as docid, array('apple:2.0', 'orange:0', 'banana:2.0', 'kuwi:1.0') as features
+  ) 
+  select
+    l.docid as doc1,
+    r.docid as doc2,
+    jaccard_similarity(l.features, r.features) as similarity
+  from 
+    docs l
+    CROSS JOIN docs r
+  where
+    l.docid != r.docid
+  order by 
+    doc1 asc,
+    similarity desc;
+
+  doc1    doc2    similarity
+  1       2       0.14285715
+  1       3       0.0
+  2       3       0.6
+  2       1       0.14285715
+  3       2       0.6
+  3       1       0.0
+  ```
 
 # Evaluation