You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2017/06/07 08:02:25 UTC

[2/2] incubator-hivemall git commit: Applied minor fixes to the previous commit

Applied minor fixes to the previous commit


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1db53587
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1db53587
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1db53587

Branch: refs/heads/master
Commit: 1db5358767bb30a8c433e4530c39d8591bc28a36
Parents: 1fbf90a
Author: myui <yu...@gmail.com>
Authored: Wed Jun 7 17:01:47 2017 +0900
Committer: myui <yu...@gmail.com>
Committed: Wed Jun 7 17:01:47 2017 +0900

----------------------------------------------------------------------
 .../knn/similarity/DIMSUMMapperUDTF.java        | 26 +++++++++-----------
 docs/gitbook/recommend/item_based_cf.md         |  6 ++---
 2 files changed, 15 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1db53587/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java b/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java
index 73e218f..740b2da 100644
--- a/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java
+++ b/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java
@@ -29,8 +29,6 @@ import hivemall.utils.lang.Primitives;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -52,25 +50,24 @@ import java.util.Map;
         name = "dimsum_mapper",
         value = "_FUNC_(array<string> row, map<int col_id, double norm> colNorms [, const string options]) "
                 + "- Returns column-wise partial similarities")
-public class DIMSUMMapperUDTF extends UDTFWithOptions {
-    private static final Log logger = LogFactory.getLog(DIMSUMMapperUDTF.class);
+public final class DIMSUMMapperUDTF extends UDTFWithOptions {
 
-    protected ListObjectInspector rowOI;
-    protected MapObjectInspector colNormsOI;
+    private ListObjectInspector rowOI;
+    private MapObjectInspector colNormsOI;
 
     @Nullable
-    protected Feature[] probes;
+    private Feature[] probes;
 
     @Nonnull
-    protected PRNG rnd;
+    private PRNG rnd;
 
-    protected double threshold;
-    protected double sqrtGamma;
-    protected boolean symmetricOutput;
-    protected boolean parseFeatureAsInt;
+    private double threshold;
+    private double sqrtGamma;
+    private boolean symmetricOutput;
+    private boolean parseFeatureAsInt;
 
-    protected Map<Object, Double> colNorms;
-    protected Map<Object, Double> colProbs;
+    private Map<Object, Double> colNorms;
+    private Map<Object, Double> colProbs;
 
     @Override
     protected Options getOptions() {
@@ -158,6 +155,7 @@ public class DIMSUMMapperUDTF extends UDTFWithOptions {
         return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
     }
 
+    @SuppressWarnings("unchecked")
     @Override
     public void process(Object[] args) throws HiveException {
         Feature[] row = parseFeatures(args[0]);

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1db53587/docs/gitbook/recommend/item_based_cf.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/recommend/item_based_cf.md b/docs/gitbook/recommend/item_based_cf.md
index 2b9097e..9515184 100644
--- a/docs/gitbook/recommend/item_based_cf.md
+++ b/docs/gitbook/recommend/item_based_cf.md
@@ -517,7 +517,7 @@ group by
 
 # Efficient similarity computation
 
-Since naive similarity computation takes `O(n^2)` computational complexity, utilizing a certain approximation scheme is practically important to improve efficiency and feasibility. In particular, Hivemall enables you to use one of two sophisticated approximation schemes, [MinHash](https://en.wikipedia.org/wiki/MinHash#Jaccard_similarity_and_minimum_hash_values) and [DIMSUM](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html).
+Since naive similarity computation takes `O(n^2)` computational complexity, utilizing a certain approximation scheme is practically important to improve efficiency and feasibility. In particular, Hivemall enables you to use one of two sophisticated approximation schemes, [MinHash](#minhash-compute-pseudo-jaccard-similarity) and [DIMSUM](#dimsum-approximated-all-pairs-cosine-similarity-computation).
 
 ## MinHash: Compute "pseudo" Jaccard similarity
 
@@ -604,12 +604,12 @@ from
   topk;
 ```
 
-## DIMSUM: Approximated all-pairs similarity computation
+## DIMSUM: Approximated all-pairs "Cosine" similarity computation
 
 > #### Note
 > This feature is supported from Hivemall v0.5-rc.1 or later.
 
-DIMSUM is a technique to efficiently and approximately compute similarities for all-pairs of items. You can refer to [an article in Twitter's Engineering blog](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html) to learn how DIMSUM reduces running time.
+DIMSUM is a technique to efficiently and approximately compute [Cosine similarities](https://en.wikipedia.org/wiki/Cosine_similarity) for all-pairs of items. You can refer to [an article in Twitter's Engineering blog](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html) to learn how DIMSUM reduces running time.
 
 Here, let us begin with the `user_purchased` table. `item_similarity` table can be obtained as follows: