You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/06/10 06:51:15 UTC

[incubator-hivemall] branch HIVEMALL-253-2 created (now 49f3b63)

This is an automated email from the ASF dual-hosted git repository.

myui pushed a change to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git.


      at 49f3b63  Fixed comments

This branch includes the following new commits:

     new 6d708ab  merged from https://github.com/Solodye/incubator-hivemall.git master ignoring pom.xml updates
     new 61e122e  Updated DDLs using bin/update_ddls.sh
     new b15903d  Removed author tag
     new ec2d22e  Applied formatter
     new c4a78cd  Changed instance method to static
     new dde8ec9  Revised UDF comment
     new 44ff4bb  minor refactoring
     new 4d3d3a7  relocated private methods
     new fc756f4  Added curly braces
     new 49f3b63  Fixed comments

The 10 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[incubator-hivemall] 01/10: merged from https://github.com/Solodye/incubator-hivemall.git master ignoring pom.xml updates

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit 6d708abbb46bc740b52bab09ba4eda943dadaf85
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:29:26 2019 +0900

    merged from https://github.com/Solodye/incubator-hivemall.git master ignoring pom.xml updates
---
 .../java/hivemall/tools/map/MapRouletteUDF.java    | 192 +++++++++++++++++++++
 .../hivemall/tools/map/MapRouletteUDFTest.java     | 148 ++++++++++++++++
 docs/gitbook/misc/generic_funcs.md                 |  38 +++-
 resources/ddl/define-all.hive                      |   3 +
 4 files changed, 380 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
new file mode 100644
index 0000000..e69dd53
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.map;
+
+import hivemall.utils.hadoop.HiveUtils;
+import org.apache.hadoop.hive.ql.exec.*;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import java.util.*;
+import static hivemall.HivemallConstants.*;
+
+/**
+ * The map_roulette() can be use to do roulette, according to each map.entry 's weight.
+ * 
+ * @author Wang, Yizheng
+ */
+@Description(name = "map_roulette", value = "_FUNC_(Map<K, number> map)"
+        + " - Returns the key K which determine to its weight , the bigger weight is ,the more probability K will return. "
+        + "Number is a probability value or a positive weight")
+@UDFType(deterministic = false, stateful = false) // it is false because it return value base on probability
+public class MapRouletteUDF extends GenericUDF {
+
+    /**
+     * The map passed in saved all the value and its weight
+     *
+     * @param m A map contains a lot of item as key, with their weight as value
+     * @return The key that computer selected according to key's weight
+     */
+    private Object algorithm(Map<Object, Double> m) {
+        // normalize the weight
+        double sum = 0;
+        for (Map.Entry<Object, Double> entry : m.entrySet()) {
+            sum += entry.getValue();
+        }
+        for (Map.Entry<Object, Double> entry : m.entrySet()) {
+            entry.setValue(entry.getValue() / sum);
+        }
+
+        // sort and generate a number axis
+        List<Map.Entry<Object, Double>> entryList = new ArrayList<>(m.entrySet());
+        Collections.sort(entryList, new MapRouletteUDF.KvComparator());
+        double tmp = 0;
+        for (Map.Entry<Object, Double> entry : entryList) {
+            tmp += entry.getValue();
+            entry.setValue(tmp);
+        }
+
+        // judge last value
+        if (entryList.get(entryList.size() - 1).getValue() > 1.0) {
+            entryList.get(entryList.size() - 1).setValue(1.0);
+        }
+
+        // pick a Object base on its weight
+        double cursor = Math.random();
+        for (Map.Entry<Object, Double> entry : entryList) {
+            if (cursor < entry.getValue()) {
+                return entry.getKey();
+            }
+        }
+        return null;
+    }
+
+    private transient MapObjectInspector mapOI;
+    private transient PrimitiveObjectInspector valueOI;
+
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+        if (arguments.length != 1)
+            throw new UDFArgumentLengthException(
+                "Expected one arguments for map_find_max_prob: " + arguments.length);
+        if (arguments[0].getCategory() != ObjectInspector.Category.MAP) {
+            throw new UDFArgumentTypeException(0,
+                "Only map type arguments are accepted for the key but " + arguments[0].getTypeName()
+                        + " was passed as parameter 1.");
+        }
+        mapOI = HiveUtils.asMapOI(arguments[0]);
+        ObjectInspector keyOI = mapOI.getMapKeyObjectInspector();
+
+        //judge valueOI is a number
+        valueOI = (PrimitiveObjectInspector) mapOI.getMapValueObjectInspector();
+        switch (valueOI.getTypeName()) {
+            case INT_TYPE_NAME:
+            case DOUBLE_TYPE_NAME:
+            case BIGINT_TYPE_NAME:
+            case FLOAT_TYPE_NAME:
+            case SMALLINT_TYPE_NAME:
+            case TINYINT_TYPE_NAME:
+            case DECIMAL_TYPE_NAME:
+            case STRING_TYPE_NAME:
+                // Pass an empty map or a map full of {null, null} will get string type
+                // An number in string format like "3.5" also support
+                break;
+            default:
+                throw new UDFArgumentException(
+                    "Expected a number but get: " + valueOI.getTypeName());
+        }
+        return keyOI;
+    }
+
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        Map<Object, Double> input = processObjectDoubleMap(arguments[0]);
+        if (input == null) {
+            return null;
+        }
+        // handle empty map
+        if (input.isEmpty()) {
+            return null;
+        }
+        return algorithm(input);
+    }
+
+    /**
+     * Process the data passed by user.
+     * 
+     * @param argument data passed by user
+     * @return If all the value is ,
+     * @throws HiveException If get the wrong weight value like {key = "Wang", value = "Zhang"},
+     *         "Zhang" isn't a number ,this Method will throw exception when
+     *         convertPrimitiveToDouble("Zhang", valueOD)
+     */
+    private Map<Object, Double> processObjectDoubleMap(DeferredObject argument)
+            throws HiveException {
+        // get
+        Map<?, ?> m = mapOI.getMap(argument.get());
+        if (m == null) {
+            return null;
+        }
+        if (m.size() == 0) {
+            return null;
+        }
+        // convert
+        Map<Object, Double> input = new HashMap<>();
+        Double avg = 0.0;
+        for (Map.Entry<?, ?> entry : m.entrySet()) {
+            Object key = entry.getKey();
+            Double value = null;
+            if (entry.getValue() != null) {
+                value = PrimitiveObjectInspectorUtils.convertPrimitiveToDouble(entry.getValue(),
+                    valueOI);
+                if (value < 0) {
+                    throw new UDFArgumentException(entry.getValue() + " < 0");
+                }
+                avg += value;
+            }
+            input.put(key, value);
+        }
+        avg /= m.size();
+        for (Map.Entry<?, ?> entry : input.entrySet()) {
+            if (entry.getValue() == null) {
+                Object key = entry.getKey();
+                input.put(key, avg);
+            }
+        }
+        return input;
+    }
+
+    @Override
+    public String getDisplayString(String[] children) {
+        return "map_roulette(" + Arrays.toString(children) + ")";
+    }
+
+    private static class KvComparator implements Comparator<Map.Entry<Object, Double>> {
+
+        @Override
+        public int compare(Map.Entry<Object, Double> o1, Map.Entry<Object, Double> o2) {
+            return o1.getValue().compareTo(o2.getValue());
+        }
+    }
+
+}
diff --git a/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
new file mode 100644
index 0000000..a7497d8
--- /dev/null
+++ b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.map;
+
+import hivemall.TestUtils;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Test;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Unit test for {@link hivemall.tools.map.MapRouletteUDF}
+ * 
+ * @author Wang, Yizheng
+ */
+public class MapRouletteUDFTest {
+
+    /**
+     * Tom, Jerry, Amy, Wong, Zhao joined a roulette. Jerry has 0.2 weight to win. Zhao's weight is
+     * highest, he has more chance to win. During data processing ,Tom 's weight was Lost. Algorithm
+     * treat Tom 's weight as average. After 1000000 times of roulette, Zhao wins the most. Jerry
+     * wins less than Zhao but more than the other.
+     *
+     * @throws HiveException fmp.initialize may throws UDFArgumentException when checking parameter,
+     *         org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector#getMap(java.lang.Object)
+     *         may throw Hive Exception
+     */
+    @Test
+    public void testRoulette() throws HiveException {
+        MapRouletteUDF fmp = new MapRouletteUDF();
+        fmp.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+            PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)});
+        Map<Object, Integer> solve = new HashMap<>();
+        solve.put("Tom", 0);
+        solve.put("Jerry", 0);
+        solve.put("Amy", 0);
+        solve.put("Wong", 0);
+        solve.put("Zhao", 0);
+        int T = 1000000;
+        while (T-- > 0) {
+            Map<Object, Double> m = new HashMap<>();
+            m.put("Tom", null);
+            m.put("Jerry", 0.2);
+            m.put("Amy", 0.1);
+            m.put("Wong", 0.1);
+            m.put("Zhao", 0.5);
+            GenericUDF.DeferredObject[] arguments =
+                    new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(m)};
+            Object key = fmp.evaluate(arguments);
+            solve.put(key, solve.get(key) + 1);
+        }
+        List<Map.Entry<Object, Integer>> solveList = new ArrayList<>(solve.entrySet());
+        Collections.sort(solveList, new KvComparator());
+        Object highestSolve = solveList.get(solveList.size() - 1).getKey();
+        Assert.assertEquals(highestSolve.toString(), "Zhao");
+        Object secondarySolve = solveList.get(solveList.size() - 2).getKey();
+        Assert.assertEquals(secondarySolve.toString(), "Jerry");
+    }
+
+    private static class KvComparator implements Comparator<Map.Entry<Object, Integer>> {
+
+        @Override
+        public int compare(Map.Entry<Object, Integer> o1, Map.Entry<Object, Integer> o2) {
+            return o1.getValue().compareTo(o2.getValue());
+        }
+    }
+
+    @Test
+    public void testSerialization() throws HiveException, IOException {
+        Map<Object, Double> m = new HashMap<>();
+        m.put("Tom", 0.1);
+        m.put("Jerry", 0.2);
+        m.put("Amy", 0.1);
+        m.put("Wong", 0.1);
+        m.put("Zhao", null);
+
+        TestUtils.testGenericUDFSerialization(MapRouletteUDF.class,
+            new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector(
+                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+                PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)},
+            new Object[] {m});
+        byte[] serialized = TestUtils.serializeObjectByKryo(new MapRouletteUDFTest());
+        TestUtils.deserializeObjectByKryo(serialized, MapRouletteUDFTest.class);
+    }
+
+    @Test
+    public void testEmptyMapAndAllNullMap() throws HiveException {
+        MapRouletteUDF udf = new MapRouletteUDF();
+        Map<Object, Double> m = new HashMap<>();
+        udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+            PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)});
+        GenericUDF.DeferredObject[] arguments =
+                new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(m)};
+        Assert.assertNull(udf.evaluate(arguments));
+        m.put(null, null);
+        arguments = new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(m)};
+        Assert.assertNull(udf.evaluate(arguments));
+    }
+
+    @Test
+    public void testOnlyOne() throws HiveException {
+        MapRouletteUDF udf = new MapRouletteUDF();
+        Map<Object, Double> m = new HashMap<>();
+        udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+            PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)});
+        m.put("One", 324.6);
+        GenericUDF.DeferredObject[] arguments =
+                new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(m)};
+        Assert.assertEquals(udf.evaluate(arguments), "One");
+    }
+
+    @Test
+    public void testString() throws HiveException {
+        MapRouletteUDF udf = new MapRouletteUDF();
+        Map<Object, String> m = new HashMap<>();
+        udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector)});
+        m.put("One", "0.7");
+        GenericUDF.DeferredObject[] arguments =
+                new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(m)};
+        Assert.assertEquals(udf.evaluate(arguments), "One");
+    }
+}
diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md
index 4f53f4d..328969b 100644
--- a/docs/gitbook/misc/generic_funcs.md
+++ b/docs/gitbook/misc/generic_funcs.md
@@ -539,7 +539,43 @@ This page describes a list of useful Hivemall generic functions. See also a [lis
       to_ordered_map(key, value, -100)    -- {3:"banana",4:"candy",10:"apple"} (tail-100)
   from t
   ```
-
+  
+- `map_roulette(Map<key, number> map)` - Returns a `key` chosen at random with probability proportional to its `number` weight: the larger the weight, the more likely the key is returned. `number` is a probability value or a non-negative weight.
+  
+  We can apply `map_roulette()` to a `Map<key, number>` built from data.
+  ```sql
+  select map_roulette(to_map(a, b)) -- 25% Tom, 21% Zhang, 54% Wang
+  from(
+      select 'Wang' as a, 54 as b
+      union
+      select 'Zhang' as a, 21 as b
+      union
+      select 'Tom' as a, 25 as b
+  )tmp;
+  ```
+  Passing an empty map, or a map whose values are all `null`, returns `null`.
+  ```sql
+  select map_roulette(map(null, null, null, null)); -- NULL
+  select map_roulette(map()); -- NULL
+  ```
+  A `null` weight is replaced by the average weight, i.e., the sum of the non-null weights divided by the total number of entries (including those with `null` weights).
+  ```sql
+  select map_roulette(map(1, 0.5, 'Wang', null)); -- 1/3 Wang, 2/3 1
+  select map_roulette(map(1, 0.5, 'Wang', null, 'Zhang', null)); -- 20% Wang, 60% 1, 20% Zhang
+  ```
+  If all weights are zero, `null` is returned.
+  ```sql
+  select map_roulette(map(1, 0)); -- NULL
+  select map_roulette(map(1, 0, '5', 0)); -- NULL
+  ```
+  This UDF does not support non-numeric or negative weights.
+  ```sql
+  select map_roulette(map('Wong', 'A string', 'Zhao', 2)); 
+  --Failed with exception java.io.IOException:org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating map_roulette([map('Wong':'A string','Zhao':2)])
+  select map_roulette(map('Wong', 3, 'Zhao', -2));
+  -- Failed with exception java.io.IOException:org.apache.hadoop.hive.ql.exec.UDFArgumentException: -2 < 0
+  ```
+   
 # MapReduce
 
 - `distcache_gets(filepath, key, default_value [, parseKey])` - Returns map&lt;key_type, value_type&gt;|value_type
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index e6f7c0b..4faaeed 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -507,6 +507,9 @@ create temporary function map_get as 'hivemall.tools.map.MapGetUDF';
 drop temporary function if exists map_key_values;
 create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF';
 
+drop temporary function if exists map_roulette;
+create temporary function map_roulette as 'hivemall.tools.map.MapRouletteUDF';
+
 ---------------------
 -- list functions --
 ---------------------


[incubator-hivemall] 08/10: relocated private methods

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit 4d3d3a7aa3cdb0f2640bcb53b5176e7cd552933f
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:48:08 2019 +0900

    relocated private methods
---
 .../java/hivemall/tools/map/MapRouletteUDF.java    | 96 +++++++++++-----------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index 40d97c7..7aa0132 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -60,51 +60,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 @UDFType(deterministic = false, stateful = false) // it is false because it return value base on probability
 public final class MapRouletteUDF extends GenericUDF {
 
-    /**
-     * The map passed in saved all the value and its weight
-     *
-     * @param m A map contains a lot of item as key, with their weight as value
-     * @return The key that computer selected according to key's weight
-     */
-    @Nullable
-    private static Object algorithm(@Nonnull final Map<Object, Double> m) {
-        if (m.isEmpty()) {
-            return null;
-        }
-
-        // normalize the weight
-        double sum = 0;
-        for (Map.Entry<Object, Double> entry : m.entrySet()) {
-            sum += entry.getValue();
-        }
-        for (Map.Entry<Object, Double> entry : m.entrySet()) {
-            entry.setValue(entry.getValue() / sum);
-        }
-
-        // sort and generate a number axis
-        List<Map.Entry<Object, Double>> entryList = new ArrayList<>(m.entrySet());
-        Collections.sort(entryList, new KvComparator());
-        double tmp = 0;
-        for (Map.Entry<Object, Double> entry : entryList) {
-            tmp += entry.getValue();
-            entry.setValue(tmp);
-        }
-
-        // judge last value
-        if (entryList.get(entryList.size() - 1).getValue() > 1.0) {
-            entryList.get(entryList.size() - 1).setValue(1.0);
-        }
-
-        // pick a Object base on its weight
-        double cursor = Math.random();
-        for (Map.Entry<Object, Double> entry : entryList) {
-            if (cursor < entry.getValue()) {
-                return entry.getKey();
-            }
-        }
-        return null;
-    }
-
     private transient MapObjectInspector mapOI;
     private transient PrimitiveObjectInspector valueOI;
 
@@ -151,6 +106,11 @@ public final class MapRouletteUDF extends GenericUDF {
         return algorithm(input);
     }
 
+    @Override
+    public String getDisplayString(String[] children) {
+        return "map_roulette(" + StringUtils.join(children, ',') + ")";
+    }
+
     /**
      * Process the data passed by user.
      * 
@@ -196,9 +156,49 @@ public final class MapRouletteUDF extends GenericUDF {
         return input;
     }
 
-    @Override
-    public String getDisplayString(String[] children) {
-        return "map_roulette(" + StringUtils.join(children, ',') + ")";
+    /**
+     * The map passed in saved all the value and its weight
+     *
+     * @param m A map contains a lot of item as key, with their weight as value
+     * @return The key that computer selected according to key's weight
+     */
+    @Nullable
+    private static Object algorithm(@Nonnull final Map<Object, Double> m) {
+        if (m.isEmpty()) {
+            return null;
+        }
+
+        // normalize the weight
+        double sum = 0;
+        for (Map.Entry<Object, Double> entry : m.entrySet()) {
+            sum += entry.getValue();
+        }
+        for (Map.Entry<Object, Double> entry : m.entrySet()) {
+            entry.setValue(entry.getValue() / sum);
+        }
+
+        // sort and generate a number axis
+        List<Map.Entry<Object, Double>> entryList = new ArrayList<>(m.entrySet());
+        Collections.sort(entryList, new KvComparator());
+        double tmp = 0;
+        for (Map.Entry<Object, Double> entry : entryList) {
+            tmp += entry.getValue();
+            entry.setValue(tmp);
+        }
+
+        // judge last value
+        if (entryList.get(entryList.size() - 1).getValue() > 1.0) {
+            entryList.get(entryList.size() - 1).setValue(1.0);
+        }
+
+        // pick a Object base on its weight
+        double cursor = Math.random();
+        for (Map.Entry<Object, Double> entry : entryList) {
+            if (cursor < entry.getValue()) {
+                return entry.getKey();
+            }
+        }
+        return null;
     }
 
     private static class KvComparator implements Comparator<Map.Entry<Object, Double>> {


[incubator-hivemall] 03/10: Removed author tag

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit b15903df3df75d0014ee4b7ae914467cca1bf8da
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:36:37 2019 +0900

    Removed author tag
---
 core/src/main/java/hivemall/tools/map/MapRouletteUDF.java     | 2 --
 core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java | 2 --
 2 files changed, 4 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index e69dd53..b412457 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -32,8 +32,6 @@ import static hivemall.HivemallConstants.*;
 
 /**
  * The map_roulette() can be use to do roulette, according to each map.entry 's weight.
- * 
- * @author Wang, Yizheng
  */
 @Description(name = "map_roulette", value = "_FUNC_(Map<K, number> map)"
         + " - Returns the key K which determine to its weight , the bigger weight is ,the more probability K will return. "
diff --git a/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
index a7497d8..bdef00d 100644
--- a/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
+++ b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
@@ -31,8 +31,6 @@ import java.util.*;
 
 /**
  * Unit test for {@link hivemall.tools.map.MapRouletteUDF}
- * 
- * @author Wang, Yizheng
  */
 public class MapRouletteUDFTest {
 


[incubator-hivemall] 05/10: Changed instance method to static

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit c4a78cd46551660e6b2790b317f437400463b064
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:38:01 2019 +0900

    Changed instance method to static
---
 core/src/main/java/hivemall/tools/map/MapRouletteUDF.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index efc0740..6964d60 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -64,7 +64,7 @@ public class MapRouletteUDF extends GenericUDF {
      * @param m A map contains a lot of item as key, with their weight as value
      * @return The key that computer selected according to key's weight
      */
-    private Object algorithm(Map<Object, Double> m) {
+    private static Object algorithm(Map<Object, Double> m) {
         // normalize the weight
         double sum = 0;
         for (Map.Entry<Object, Double> entry : m.entrySet()) {


[incubator-hivemall] 02/10: Updated DDLs using bin/update_ddls.sh

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit 61e122e8df7bb921bda0f255789346d7eb37d890
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:32:01 2019 +0900

    Updated DDLs using bin/update_ddls.sh
---
 resources/ddl/define-all-as-permanent.hive | 4 ++++
 resources/ddl/define-all.spark             | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 0c836f2..8ae1afe 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -515,6 +515,9 @@ CREATE FUNCTION map_get as 'hivemall.tools.map.MapGetUDF' USING JAR '${hivemall_
 DROP FUNCTION IF EXISTS map_key_values;
 CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS map_roulette;
+CREATE FUNCTION map_roulette as 'hivemall.tools.map.MapRouletteUDF' USING JAR '${hivemall_jar}';
+
 ---------------------
 -- list functions --
 ---------------------
@@ -877,3 +880,4 @@ CREATE FUNCTION xgboost_predict AS 'hivemall.xgboost.tools.XGBoostPredictUDTF' U
 
 DROP FUNCTION xgboost_multiclass_predict;
 CREATE FUNCTION xgboost_multiclass_predict AS 'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR '${hivemall_jar}';
+
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index e3ff216..c142af0 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -506,6 +506,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION map_get AS 'hivemall.tools.map.MapGetU
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values")
 sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_roulette")
+sqlContext.sql("CREATE TEMPORARY FUNCTION map_roulette AS 'hivemall.tools.map.MapRouletteUDF'")
+
 /**
  * List functions
  */


[incubator-hivemall] 04/10: Applied formatter

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit ec2d22e1fe769a34461dbb89eb984ba0a26bb20b
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:37:21 2019 +0900

    Applied formatter
---
 .../java/hivemall/tools/map/MapRouletteUDF.java    | 25 +++++++++++++++++++---
 .../hivemall/tools/map/MapRouletteUDFTest.java     | 11 ++++++++--
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index b412457..efc0740 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -18,8 +18,29 @@
  */
 package hivemall.tools.map;
 
+import static hivemall.HivemallConstants.BIGINT_TYPE_NAME;
+import static hivemall.HivemallConstants.DECIMAL_TYPE_NAME;
+import static hivemall.HivemallConstants.DOUBLE_TYPE_NAME;
+import static hivemall.HivemallConstants.FLOAT_TYPE_NAME;
+import static hivemall.HivemallConstants.INT_TYPE_NAME;
+import static hivemall.HivemallConstants.SMALLINT_TYPE_NAME;
+import static hivemall.HivemallConstants.STRING_TYPE_NAME;
+import static hivemall.HivemallConstants.TINYINT_TYPE_NAME;
+
 import hivemall.utils.hadoop.HiveUtils;
-import org.apache.hadoop.hive.ql.exec.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
@@ -27,8 +48,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
-import java.util.*;
-import static hivemall.HivemallConstants.*;
 
 /**
  * The map_roulette() can be use to do roulette, according to each map.entry 's weight.
diff --git a/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
index bdef00d..ce9f2b5 100644
--- a/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
+++ b/core/src/test/java/hivemall/tools/map/MapRouletteUDFTest.java
@@ -19,6 +19,15 @@
 package hivemall.tools.map;
 
 import hivemall.TestUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -26,8 +35,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.junit.Assert;
 import org.junit.Test;
-import java.io.IOException;
-import java.util.*;
 
 /**
  * Unit test for {@link hivemall.tools.map.MapRouletteUDF}


[incubator-hivemall] 06/10: Revised UDF comment

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit dde8ec9a85a914e1515b605a606061c8db48bac8
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:40:19 2019 +0900

    Revised UDF comment
---
 core/src/main/java/hivemall/tools/map/MapRouletteUDF.java | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index 6964d60..084653b 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -50,11 +50,10 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
 
 /**
- * The map_roulette() can be use to do roulette, according to each map.entry 's weight.
+ * The map_roulette returns a map key based on weighted random sampling of map values.
  */
 @Description(name = "map_roulette", value = "_FUNC_(Map<K, number> map)"
-        + " - Returns the key K which determine to its weight , the bigger weight is ,the more probability K will return. "
-        + "Number is a probability value or a positive weight")
+        + " - Returns a map key based on weighted random sampling of map values")
 @UDFType(deterministic = false, stateful = false) // it is false because it return value base on probability
 public class MapRouletteUDF extends GenericUDF {
 


[incubator-hivemall] 07/10: minor refactoring

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit 44ff4bbb46ae9398d3f078f54fd81196599b0eba
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:46:36 2019 +0900

    minor refactoring
---
 .../java/hivemall/tools/map/MapRouletteUDF.java    | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index 084653b..40d97c7 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -28,15 +28,18 @@ import static hivemall.HivemallConstants.STRING_TYPE_NAME;
 import static hivemall.HivemallConstants.TINYINT_TYPE_NAME;
 
 import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.StringUtils;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
@@ -55,7 +58,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 @Description(name = "map_roulette", value = "_FUNC_(Map<K, number> map)"
         + " - Returns a map key based on weighted random sampling of map values")
 @UDFType(deterministic = false, stateful = false) // it is false because it return value base on probability
-public class MapRouletteUDF extends GenericUDF {
+public final class MapRouletteUDF extends GenericUDF {
 
     /**
      * The map passed in saved all the value and its weight
@@ -63,7 +66,12 @@ public class MapRouletteUDF extends GenericUDF {
      * @param m A map contains a lot of item as key, with their weight as value
      * @return The key that computer selected according to key's weight
      */
-    private static Object algorithm(Map<Object, Double> m) {
+    @Nullable
+    private static Object algorithm(@Nonnull final Map<Object, Double> m) {
+        if (m.isEmpty()) {
+            return null;
+        }
+
         // normalize the weight
         double sum = 0;
         for (Map.Entry<Object, Double> entry : m.entrySet()) {
@@ -75,7 +83,7 @@ public class MapRouletteUDF extends GenericUDF {
 
         // sort and generate a number axis
         List<Map.Entry<Object, Double>> entryList = new ArrayList<>(m.entrySet());
-        Collections.sort(entryList, new MapRouletteUDF.KvComparator());
+        Collections.sort(entryList, new KvComparator());
         double tmp = 0;
         for (Map.Entry<Object, Double> entry : entryList) {
             tmp += entry.getValue();
@@ -140,10 +148,6 @@ public class MapRouletteUDF extends GenericUDF {
         if (input == null) {
             return null;
         }
-        // handle empty map
-        if (input.isEmpty()) {
-            return null;
-        }
         return algorithm(input);
     }
 
@@ -194,7 +198,7 @@ public class MapRouletteUDF extends GenericUDF {
 
     @Override
     public String getDisplayString(String[] children) {
-        return "map_roulette(" + Arrays.toString(children) + ")";
+        return "map_roulette(" + StringUtils.join(children, ',') + ")";
     }
 
     private static class KvComparator implements Comparator<Map.Entry<Object, Double>> {


[incubator-hivemall] 10/10: Fixed comments

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit 49f3b632a1712b275c74774ce3b8534b10705a30
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:51:03 2019 +0900

    Fixed comments
---
 core/src/main/java/hivemall/tools/map/MapRouletteUDF.java | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index 2c78937..3a3f0f8 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -67,12 +67,11 @@ public final class MapRouletteUDF extends GenericUDF {
     public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
         if (arguments.length != 1) {
             throw new UDFArgumentLengthException(
-                "Expected one arguments for map_find_max_prob: " + arguments.length);
+                "Expected exactly one argument for map_roulette: " + arguments.length);
         }
         if (arguments[0].getCategory() != ObjectInspector.Category.MAP) {
             throw new UDFArgumentTypeException(0,
-                "Only map type arguments are accepted for the key but " + arguments[0].getTypeName()
-                        + " was passed as parameter 1.");
+                "Only map type argument is accepted but got " + arguments[0].getTypeName());
         }
         mapOI = HiveUtils.asMapOI(arguments[0]);
         ObjectInspector keyOI = mapOI.getMapKeyObjectInspector();


[incubator-hivemall] 09/10: Added curly braces

Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch HIVEMALL-253-2
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git

commit fc756f46e604a3bc99efaeef544883803a84e9cc
Author: Makoto Yui <my...@apache.org>
AuthorDate: Mon Jun 10 15:49:17 2019 +0900

    Added curly braces
---
 core/src/main/java/hivemall/tools/map/MapRouletteUDF.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
index 7aa0132..2c78937 100644
--- a/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
+++ b/core/src/main/java/hivemall/tools/map/MapRouletteUDF.java
@@ -65,9 +65,10 @@ public final class MapRouletteUDF extends GenericUDF {
 
     @Override
     public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
-        if (arguments.length != 1)
+        if (arguments.length != 1) {
             throw new UDFArgumentLengthException(
                 "Expected one arguments for map_find_max_prob: " + arguments.length);
+        }
         if (arguments[0].getCategory() != ObjectInspector.Category.MAP) {
             throw new UDFArgumentTypeException(0,
                 "Only map type arguments are accepted for the key but " + arguments[0].getTypeName()