You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/06/27 18:02:45 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-259][DOC]
Refactor feature_binning UDF
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 028306e [HIVEMALL-259][DOC] Refactor feature_binning UDF
028306e is described below
commit 028306e77cba45e587429412405b71339cab411b
Author: Makoto Yui <my...@apache.org>
AuthorDate: Fri Jun 28 03:02:38 2019 +0900
[HIVEMALL-259][DOC] Refactor feature_binning UDF
## What changes were proposed in this pull request?
Refactor feature_binning UDF and update the function usage
## What type of PR is it?
Documentation, Refactoring
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-259
## How was this patch tested?
unit tests, manual tests on EMR
## How to use this feature?
```
WITH extracted as (
select
extract_feature(feature) as index,
extract_weight(feature) as value
from
input l
LATERAL VIEW explode(features) r as feature
),
mapping as (
select
index,
build_bins(value, 5, true) as quantiles -- 5 bins with auto bin shrinking
from
extracted
group by
index
),
bins as (
select
to_map(index, quantiles) as quantiles
from
mapping
)
select
l.features as original,
feature_binning(l.features, r.quantiles) as features
from
input l
cross join bins r
```
see https://gist.github.com/myui/f943fa3ce1a7e1ac3f2dd9a7f9fa703b
## Checklist
(Please remove this section if not needed; check `x` for YES, blank for NO)
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <my...@apache.org>
Closes #195 from myui/HIVEMALL-259.
---
.../hivemall/ftvec/binning/FeatureBinningUDF.java | 187 ++++++++++++---------
.../ftvec/binning/FeatureBinningUDFTest.java | 81 +++++++++
docs/gitbook/ft_engineering/binning.md | 178 +++++++++++++++-----
docs/gitbook/misc/funcs.md | 36 +++-
4 files changed, 360 insertions(+), 122 deletions(-)
diff --git a/core/src/main/java/hivemall/ftvec/binning/FeatureBinningUDF.java b/core/src/main/java/hivemall/ftvec/binning/FeatureBinningUDF.java
index f713937..2498154 100644
--- a/core/src/main/java/hivemall/ftvec/binning/FeatureBinningUDF.java
+++ b/core/src/main/java/hivemall/ftvec/binning/FeatureBinningUDF.java
@@ -18,7 +18,18 @@
*/
package hivemall.ftvec.binning;
+import hivemall.annotations.VisibleForTesting;
import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.annotation.Nonnull;
+
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
@@ -37,12 +48,43 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspe
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
-import java.util.*;
-
+// @formatter:off
@Description(name = "feature_binning",
- value = "_FUNC_(array<features::string> features, const map<string, array<number>> quantiles_map)"
- + " / _FUNC_(number weight, const array<number> quantiles)"
- + " - Returns binned features as an array<features::string> / bin ID as int")
+ value = "_FUNC_(array<features::string> features, map<string, array<number>> quantiles_map)"
+ + " - returns a binned feature vector as an array<features::string>\n"
+ + "_FUNC_(number weight, array<number> quantiles) - returns bin ID as int",
+ extended = "WITH extracted as (\n" +
+ " select \n" +
+ " extract_feature(feature) as index,\n" +
+ " extract_weight(feature) as value\n" +
+ " from\n" +
+ " input l\n" +
+ " LATERAL VIEW explode(features) r as feature\n" +
+ "),\n" +
+ "mapping as (\n" +
+ " select\n" +
+ " index, \n" +
+ " build_bins(value, 5, true) as quantiles -- 5 bins with auto bin shrinking\n" +
+ " from\n" +
+ " extracted\n" +
+ " group by\n" +
+ " index\n" +
+ "),\n" +
+ "bins as (\n" +
+ " select \n" +
+ " to_map(index, quantiles) as quantiles \n" +
+ " from\n" +
+ " mapping\n" +
+ ")\n" +
+ "select\n" +
+ " l.features as original,\n" +
+ " feature_binning(l.features, r.quantiles) as features\n" +
+ "from\n" +
+ " input l\n" +
+ " cross join bins r\n\n" +
+ "> [\"name#Jacob\",\"gender#Male\",\"age:20.0\"] [\"name#Jacob\",\"gender#Male\",\"age:2\"]\n" +
+ "> [\"name#Isabella\",\"gender#Female\",\"age:20.0\"] [\"name#Isabella\",\"gender#Female\",\"age:2\"]")
+// @formatter:on
@UDFType(deterministic = true, stateful = false)
public final class FeatureBinningUDF extends GenericUDF {
private boolean multiple = true;
@@ -53,38 +95,34 @@ public final class FeatureBinningUDF extends GenericUDF {
private StringObjectInspector keyOI;
private ListObjectInspector quantilesOI;
private PrimitiveObjectInspector quantileOI;
-
private PrimitiveObjectInspector weightOI;
- private Map<Text, double[]> quantilesMap = null;
- private double[] quantiles = null;
-
@Override
- public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
- if (OIs.length != 2) {
- throw new UDFArgumentLengthException("Specify two arguments");
+ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
+ if (argOIs.length != 2) {
+ throw new UDFArgumentLengthException("Specify two arguments :" + argOIs.length);
}
- if (HiveUtils.isListOI(OIs[0]) && HiveUtils.isMapOI(OIs[1])) {
- // for (array<features::string> features, const map<string, array<number>> quantiles_map)
+ if (HiveUtils.isListOI(argOIs[0]) && HiveUtils.isMapOI(argOIs[1])) {
+ // feature_binning(array<features::string> features, map<string, array<number>> quantiles_map)
if (!HiveUtils.isStringOI(
- ((ListObjectInspector) OIs[0]).getListElementObjectInspector())) {
+ ((ListObjectInspector) argOIs[0]).getListElementObjectInspector())) {
throw new UDFArgumentTypeException(0,
- "Only array<string> type argument is acceptable but " + OIs[0].getTypeName()
- + " was passed as `features`");
+ "Only array<string> type argument can be accepted but "
+ + argOIs[0].getTypeName() + " was passed as `features`");
}
- featuresOI = HiveUtils.asListOI(OIs[0]);
+ featuresOI = HiveUtils.asListOI(argOIs[0]);
featureOI = HiveUtils.asStringOI(featuresOI.getListElementObjectInspector());
- quantilesMapOI = HiveUtils.asMapOI(OIs[1]);
+ quantilesMapOI = HiveUtils.asMapOI(argOIs[1]);
if (!HiveUtils.isStringOI(quantilesMapOI.getMapKeyObjectInspector())
|| !HiveUtils.isListOI(quantilesMapOI.getMapValueObjectInspector())
|| !HiveUtils.isNumberOI(
((ListObjectInspector) quantilesMapOI.getMapValueObjectInspector()).getListElementObjectInspector())) {
throw new UDFArgumentTypeException(1,
- "Only map<string, array<number>> type argument is acceptable but "
- + OIs[1].getTypeName() + " was passed as `quantiles_map`");
+ "Only map<string, array<number>> type argument can be accepted but "
+ + argOIs[1].getTypeName() + " was passed as `quantiles_map`");
}
keyOI = HiveUtils.asStringOI(quantilesMapOI.getMapKeyObjectInspector());
quantilesOI = HiveUtils.asListOI(quantilesMapOI.getMapValueObjectInspector());
@@ -95,16 +133,16 @@ public final class FeatureBinningUDF extends GenericUDF {
return ObjectInspectorFactory.getStandardListObjectInspector(
PrimitiveObjectInspectorFactory.writableStringObjectInspector);
- } else if (HiveUtils.isPrimitiveOI(OIs[0]) && HiveUtils.isListOI(OIs[1])) {
- // for (number weight, const array<number> quantiles)
+ } else if (HiveUtils.isPrimitiveOI(argOIs[0]) && HiveUtils.isListOI(argOIs[1])) {
+ // feature_binning(number weight, array<number> quantiles)
- weightOI = HiveUtils.asDoubleCompatibleOI(OIs[0]);
+ weightOI = HiveUtils.asDoubleCompatibleOI(argOIs[0]);
- quantilesOI = HiveUtils.asListOI(OIs[1]);
+ quantilesOI = HiveUtils.asListOI(argOIs[1]);
if (!HiveUtils.isNumberOI(quantilesOI.getListElementObjectInspector())) {
throw new UDFArgumentTypeException(1,
- "Only array<number> type argument is acceptable but " + OIs[1].getTypeName()
- + " was passed as `quantiles`");
+ "Only array<number> type argument can be accepted but "
+ + argOIs[1].getTypeName() + " was passed as `quantiles`");
}
quantileOI =
HiveUtils.asDoubleCompatibleOI(quantilesOI.getListElementObjectInspector());
@@ -115,86 +153,81 @@ public final class FeatureBinningUDF extends GenericUDF {
} else {
throw new UDFArgumentTypeException(0,
"Only <array<features::string>, map<string, array<number>>> "
- + "or <number, array<number>> type arguments are accepted but <"
- + OIs[0].getTypeName() + ", " + OIs[1].getTypeName() + "> was passed.");
+ + "or <number, array<number>> type arguments can be accepted but <"
+ + argOIs[0].getTypeName() + ", " + argOIs[1].getTypeName()
+ + "> was passed.");
}
}
+ private transient Map<String, double[]> quantilesMap;
+ private transient double[] quantilesArray;
+
@Override
- public Object evaluate(DeferredObject[] dObj) throws HiveException {
+ public Object evaluate(DeferredObject[] args) throws HiveException {
+ final Object arg0 = args[0].get();
+ if (arg0 == null) {
+ return null;
+ }
+ final Object arg1 = args[1].get();
+ if (arg1 == null) {
+ throw new UDFArgumentException(
+ "The second argument (i.e., quantiles) MUST be non-null value");
+ }
+
if (multiple) {
- // init quantilesMap
if (quantilesMap == null) {
- quantilesMap = new HashMap<Text, double[]>();
- final Map<?, ?> _quantilesMap = quantilesMapOI.getMap(dObj[1].get());
-
- for (Object _key : _quantilesMap.keySet()) {
- final Text key = new Text(keyOI.getPrimitiveJavaObject(_key));
- final double[] val = HiveUtils.asDoubleArray(_quantilesMap.get(key),
- quantilesOI, quantileOI);
- quantilesMap.put(key, val);
+ final Map<?, ?> map = quantilesMapOI.getMap(arg1);
+ quantilesMap = new HashMap<String, double[]>(map.size() * 2);
+ for (Map.Entry<?, ?> e : map.entrySet()) {
+ String k = keyOI.getPrimitiveJavaObject(e.getKey());
+ double[] v = HiveUtils.asDoubleArray(e.getValue(), quantilesOI, quantileOI);
+ quantilesMap.put(k, v);
}
}
- final List<?> fs = featuresOI.getList(dObj[0].get());
+ final List<?> features = featuresOI.getList(arg0);
final List<Text> result = new ArrayList<Text>();
- for (Object f : fs) {
+ for (Object f : features) {
final String entry = featureOI.getPrimitiveJavaObject(f);
- final int pos = entry.indexOf(":");
- if (pos < 0) {
- // categorical
+ final int pos = entry.indexOf(':');
+ if (pos < 0) { // categorical
result.add(new Text(entry));
- } else {
- // quantitative
- final Text key = new Text(entry.substring(0, pos));
- String val = entry.substring(pos + 1);
-
- // binning
- if (quantilesMap.containsKey(key)) {
- val = String.valueOf(
- findBin(quantilesMap.get(key), Double.parseDouble(val)));
+ } else { // quantitative
+ final String k = entry.substring(0, pos);
+ String v = entry.substring(pos + 1);
+ final double[] bins = quantilesMap.get(k);
+ if (bins != null) { // binning
+ v = String.valueOf(findBin(bins, Double.parseDouble(v)));
}
- result.add(new Text(key + ":" + val));
+ result.add(new Text(k + ':' + v));
}
}
-
return result;
} else {
- // init quantiles
- if (quantiles == null) {
- quantiles = HiveUtils.asDoubleArray(dObj[1].get(), quantilesOI, quantileOI);
+ if (quantilesArray == null) {
+ quantilesArray = HiveUtils.asDoubleArray(arg1, quantilesOI, quantileOI);
}
- return new IntWritable(findBin(quantiles,
- PrimitiveObjectInspectorUtils.getDouble(dObj[0].get(), weightOI)));
+ return new IntWritable(
+ findBin(quantilesArray, PrimitiveObjectInspectorUtils.getDouble(arg0, weightOI)));
}
}
- private int findBin(double[] _quantiles, double d) throws HiveException {
- if (_quantiles.length < 3) {
+ @VisibleForTesting
+ static int findBin(@Nonnull final double[] quantiles, final double value) throws HiveException {
+ if (quantiles.length < 3) {
throw new HiveException(
"Length of `quantiles` should be greater than or equal to three but "
- + _quantiles.length + ".");
+ + quantiles.length + ".");
}
- int res = Arrays.binarySearch(_quantiles, d);
- return (res < 0) ? ~res - 1 : (res == 0) ? 0 : res - 1;
+ final int pos = Arrays.binarySearch(quantiles, value);
+ return (pos < 0) ? ~pos - 1 : (pos == 0) ? 0 : pos - 1;
}
@Override
public String getDisplayString(String[] children) {
- final StringBuilder sb = new StringBuilder();
- sb.append("feature_binning");
- sb.append("(");
- if (children.length > 0) {
- sb.append(children[0]);
- for (int i = 1; i < children.length; i++) {
- sb.append(", ");
- sb.append(children[i]);
- }
- }
- sb.append(")");
- return sb.toString();
+ return "feature_binning(" + StringUtils.join(children, ',') + ')';
}
}
diff --git a/core/src/test/java/hivemall/ftvec/binning/FeatureBinningUDFTest.java b/core/src/test/java/hivemall/ftvec/binning/FeatureBinningUDFTest.java
new file mode 100644
index 0000000..651f90c
--- /dev/null
+++ b/core/src/test/java/hivemall/ftvec/binning/FeatureBinningUDFTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.ftvec.binning;
+
+import static java.lang.Double.NEGATIVE_INFINITY;
+import static java.lang.Double.NaN;
+import static java.lang.Double.POSITIVE_INFINITY;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.junit.Test;
+
+public class FeatureBinningUDFTest {
+
+ @Test
+ public void testNaN() throws HiveException {
+ // If num_bins = 3, the bins become something like [-Inf, 1], (1, 10], (10, Inf].
+ final double[] bin = new double[] {NEGATIVE_INFINITY, 1.d, 10.d, POSITIVE_INFINITY};
+ assertEquals(2, FeatureBinningUDF.findBin(bin, POSITIVE_INFINITY));
+ assertEquals(3, FeatureBinningUDF.findBin(bin, NaN));
+ }
+
+ @Test
+ public void test3Bins() throws HiveException {
+ // If num_bins = 3, the bins become something like [-Inf, 1], (1, 10], (10, Inf].
+ final double[] bin = new double[] {NEGATIVE_INFINITY, 1.d, 10.d, POSITIVE_INFINITY};
+ assertEquals(0, FeatureBinningUDF.findBin(bin, NEGATIVE_INFINITY));
+ assertEquals(0, FeatureBinningUDF.findBin(bin, 1.d));
+
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 1.1d));
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 10.d));
+
+ assertEquals(2, FeatureBinningUDF.findBin(bin, 10.1d));
+ assertEquals(2, FeatureBinningUDF.findBin(bin, POSITIVE_INFINITY));
+ }
+
+ @Test
+ public void test4Bins() throws HiveException {
+ // If num_bins = 4, the bins become something like [-Inf, 0.111], (0.111, 0.222], (0.222, 0.333], (0.333, Inf].
+ final double[] bin =
+ new double[] {NEGATIVE_INFINITY, 0.111d, 0.222d, 0.333d, POSITIVE_INFINITY};
+ assertEquals(0, FeatureBinningUDF.findBin(bin, NEGATIVE_INFINITY));
+ assertEquals(0, FeatureBinningUDF.findBin(bin, -1.d));
+ assertEquals(0, FeatureBinningUDF.findBin(bin, 0.110d));
+ assertEquals(0, FeatureBinningUDF.findBin(bin, 0.111d));
+
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 0.112d));
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 0.2d));
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 0.222d));
+ assertEquals(1, FeatureBinningUDF.findBin(bin, 0.2220d));
+
+ assertEquals(2, FeatureBinningUDF.findBin(bin, 0.223d));
+ assertEquals(2, FeatureBinningUDF.findBin(bin, 0.3d));
+ assertEquals(2, FeatureBinningUDF.findBin(bin, 0.332d));
+ assertEquals(2, FeatureBinningUDF.findBin(bin, 0.333d));
+
+ assertEquals(3, FeatureBinningUDF.findBin(bin, 0.334d));
+ assertEquals(3, FeatureBinningUDF.findBin(bin, 0.4d));
+ assertEquals(3, FeatureBinningUDF.findBin(bin, 10000d));
+ assertEquals(3, FeatureBinningUDF.findBin(bin, POSITIVE_INFINITY));
+
+ assertEquals(4, FeatureBinningUDF.findBin(bin, NaN));
+ }
+
+}
diff --git a/docs/gitbook/ft_engineering/binning.md b/docs/gitbook/ft_engineering/binning.md
index cd1ecbb..4634f92 100644
--- a/docs/gitbook/ft_engineering/binning.md
+++ b/docs/gitbook/ft_engineering/binning.md
@@ -17,8 +17,9 @@
under the License.
-->
-Feature binning is a method of dividing quantitative variables into categorical values.
-It groups quantitative values into a pre-defined number of bins.
+Feature binning is a method of dividing quantitative variables into categorical values. It groups quantitative values into a pre-defined number of bins.
+
+If the number of bins is set to 3, the bin ranges become something like `[-Inf, 1], (1, 10], (10, Inf]`.
*Note: This feature is supported from Hivemall v0.5-rc.1 or later.*
@@ -30,38 +31,91 @@ Prepare sample data (*users* table) first as follows:
``` sql
CREATE TABLE users (
- name string, age int, gender string
+ rowid int, name string, age int, gender string
);
-
INSERT INTO users VALUES
- ('Jacob', 20, 'Male'),
- ('Mason', 22, 'Male'),
- ('Sophia', 35, 'Female'),
- ('Ethan', 55, 'Male'),
- ('Emma', 15, 'Female'),
- ('Noah', 46, 'Male'),
- ('Isabella', 20, 'Female');
+ (1, 'Jacob', 20, 'Male'),
+ (2, 'Mason', 22, 'Male'),
+ (3, 'Sophia', 35, 'Female'),
+ (4, 'Ethan', 55, 'Male'),
+ (5, 'Emma', 15, 'Female'),
+ (6, 'Noah', 46, 'Male'),
+ (7, 'Isabella', 20, 'Female')
+;
+
+CREATE TABLE input as
+SELECT
+ rowid,
+ array_concat(
+ categorical_features(
+ array('name', 'gender'),
+ name, gender
+ ),
+ quantitative_features(
+ array('age'),
+ age
+ )
+ ) AS features
+FROM
+ users;
+
+select * from input limit 2;
```
-## A. Feature Vector trasformation by applying Feature Binning
+| input.rowid | input.features |
+|:--|:--|
+|1 | ["name#Jacob","gender#Male","age:20.0"] |
+|2 | ["name#Mason","gender#Male","age:22.0"] |
-``` sql
-WITH t AS (
+## Feature Vector transformation by applying Feature Binning
+
+Now, convert the `age` values into 3 bins.
+
+```sql
+SELECT
+ map('age', build_bins(age, 3)) AS quantiles_map
+FROM
+ users
+```
+
+> {"age":[-Infinity,18.333333333333332,30.666666666666657,Infinity]}
+
+In the above query result, you can find 4 values for age in `quantiles_map`. These are the boundary thresholds of the 3 bins.
+
+```sql
+WITH bins as (
SELECT
- array_concat(
- categorical_features(
- array('name', 'gender'),
- name, gender
- ),
- quantitative_features(
- array('age'),
- age
- )
- ) AS features
+ map('age', build_bins(age, 3)) AS quantiles_map
FROM
users
-),
-bins AS (
+)
+select
+ feature_binning(
+ array('age:-Infinity', 'age:-1', 'age:0', 'age:1', 'age:18.333333333333331', 'age:18.333333333333332'), quantiles_map
+ ),
+ feature_binning(
+ array('age:18.3333333333333333', 'age:18.33333333333334', 'age:19', 'age:30', 'age:30.666666666666656', 'age:30.666666666666657'), quantiles_map
+ ),
+ feature_binning(
+ array('age:666666666666658', 'age:30.66666666666666', 'age:31', 'age:99', 'age:Infinity'), quantiles_map
+ ),
+ feature_binning(
+ array('age:NaN'), quantiles_map
+ ),
+ feature_binning( -- not in map
+ array('weight:60.3'), quantiles_map
+ )
+from
+ bins
+```
+
+> ["age:0","age:0","age:0","age:0","age:0","age:0"] ["age:0","age:1","age:1","age:1","age:1","age:1"] ["age:2","age:2","age:2","age:2","age:2"] ["age:3"] ["weight:60.3"]
+
+The following query shows more practical usage:
+
+``` sql
+WITH bins AS (
SELECT
map('age', build_bins(age, 3)) AS quantiles_map
FROM
@@ -70,23 +124,63 @@ bins AS (
SELECT
feature_binning(features, quantiles_map) AS features
FROM
- t CROSS JOIN bins;
+ input
+ CROSS JOIN bins;
```
-*Result*
-
| features: `array<features::string>` |
-| :-: |
+| :-- |
| ["name#Jacob","gender#Male","age:1"] |
| ["name#Mason","gender#Male","age:1"] |
| ["name#Sophia","gender#Female","age:2"] |
| ["name#Ethan","gender#Male","age:2"] |
-| ["name#Emma","gender#Female","age:0"] |
-| ["name#Noah","gender#Male","age:2"] |
-| ["name#Isabella","gender#Female","age:1"] |
+| ... |
+## More practical Example
-## B. Get a mapping table by Feature Binning
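+Hivemall's `build_bins` and `feature_binning` functions can be combined to bin every feature of a feature vector at once, as in the following query: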
+
+```sql
+WITH extracted as (
+ select
+ extract_feature(feature) as index,
+ extract_weight(feature) as value
+ from
+ input l
+ LATERAL VIEW explode(features) r as feature
+),
+mapping as (
+ select
+ index,
+ build_bins(value, 5, true) as quantiles -- 5 bins with auto bin shrinking
+ from
+ extracted
+ group by
+ index
+),
+bins as (
+ select
+ to_map(index, quantiles) as quantiles
+ from
+ mapping
+)
+select
+ l.features as original,
+ feature_binning(l.features, r.quantiles) as features
+from
+ input l
+ cross join bins r
+-- limit 10;
+```
+
+| original | features |
+|:--|:--|
+| ["name#Jacob","gender#Male","age:20.0"] | ["name#Jacob","gender#Male","age:2"] |
+| ["name#Isabella","gender#Female","age:20.0"] | ["name#Isabella","gender#Female","age:2"] |
+| ... | ... |
+
+
+## Get a mapping table by Feature Binning
```sql
WITH bins AS (
@@ -99,8 +193,6 @@ FROM
users CROSS JOIN bins;
```
-*Result*
-
| age:` int` | bin: `int` |
|:-:|:-:|
| 20 | 1 |
@@ -111,17 +203,17 @@ FROM
| 46 | 2 |
| 20 | 1 |
-# Function Signature
+# Function Signatures
-## [UDAF] `build_bins(weight, num_of_bins[, auto_shrink])`
+### UDAF `build_bins(weight, num_of_bins[, auto_shrink])`
-### Input
+#### Input
| weight: int|bigint|float|double | num\_of\_bins: `int` | [auto\_shrink: `boolean` = false] |
| :-: | :-: | :-: |
| weight | 2 <= | behavior when separations are repeated: T=\>skip, F=\>exception |
-### Output
+#### Output
| quantiles: `array<double>` |
| :-: |
@@ -131,9 +223,7 @@ FROM
> There is the possibility quantiles are repeated because of too many `num_of_bins` or too few data.
> If `auto_shrink` is true, skip duplicated quantiles. If not, throw an exception.
-## [UDF] `feature_binning(features, quantiles_map)/(weight, quantiles)`
-
-### Variation: A
+### UDF `feature_binning(features, quantiles_map)`
#### Input
@@ -147,7 +237,7 @@ FROM
| :-: |
| serialized and binned features |
-### Variation: B
+### UDF `feature_binning(weight, quantiles)`
#### Input
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index 1b1b280..d860dba 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -263,7 +263,41 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
- `build_bins(number weight, const int num_of_bins[, const boolean auto_shrink = false])` - Return quantiles representing bins: array<double>
-- `feature_binning(array<features::string> features, const map<string, array<number>> quantiles_map)` / _FUNC_(number weight, const array<number> quantiles) - Returns binned features as an array<features::string> / bin ID as int
+- `feature_binning(array<features::string> features, map<string, array<number>> quantiles_map)` - returns a binned feature vector as an array<features::string>; `feature_binning(number weight, array<number> quantiles)` - returns bin ID as int
+ ```sql
+ WITH extracted as (
+ select
+ extract_feature(feature) as index,
+ extract_weight(feature) as value
+ from
+ input l
+ LATERAL VIEW explode(features) r as feature
+ ),
+ mapping as (
+ select
+ index,
+ build_bins(value, 5, true) as quantiles -- 5 bins with auto bin shrinking
+ from
+ extracted
+ group by
+ index
+ ),
+ bins as (
+ select
+ to_map(index, quantiles) as quantiles
+ from
+ mapping
+ )
+ select
+ l.features as original,
+ feature_binning(l.features, r.quantiles) as features
+ from
+ input l
+ cross join bins r
+
+ > ["name#Jacob","gender#Male","age:20.0"] ["name#Jacob","gender#Male","age:2"]
+ > ["name#Isabella","gender#Female","age:20.0"] ["name#Isabella","gender#Female","age:2"]
+ ```
## Feature format conversion