You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by ta...@apache.org on 2017/10/04 03:06:38 UTC
incubator-hivemall git commit: [HIVEMALL-146] Yet another UDF to
generate n-grams
Repository: incubator-hivemall
Updated Branches:
refs/heads/master 1e4238757 -> 7bb5d047d
[HIVEMALL-146] Yet another UDF to generate n-grams
## What changes were proposed in this pull request?
Add a new UDF `word_ngrams(array<string> words, int minSize, int maxSize)` which returns a list of n-grams with `minSize <= n <= maxSize` for the given words. This UDF can be used as an alternative to the original Hive `ngrams` function.
## What type of PR is it?
Feature
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-146
## How was this patch tested?
Unit tests, plus manual tests on both EMR and local Hive
## How to use this feature?
as documented
## Checklist
(Please remove this section if not needed; check `x` for YES, blank for NO)
- [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Takuya Kitazawa <k....@gmail.com>
Closes #118 from takuti/HIVEMALL-146-ngrams.
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7bb5d047
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7bb5d047
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7bb5d047
Branch: refs/heads/master
Commit: 7bb5d047dcce7e97336d4b73bb3bd078f2a6fc8a
Parents: 1e42387
Author: Takuya Kitazawa <k....@gmail.com>
Authored: Wed Oct 4 12:06:26 2017 +0900
Committer: Takuya Kitazawa <ta...@apache.org>
Committed: Wed Oct 4 12:06:26 2017 +0900
----------------------------------------------------------------------
.../java/hivemall/tools/text/WordNgramsUDF.java | 90 ++++++++++++++++++++
.../hivemall/tools/text/WordNgramsUDFTest.java | 87 +++++++++++++++++++
docs/gitbook/misc/generic_funcs.md | 8 ++
resources/ddl/define-all-as-permanent.hive | 3 +
resources/ddl/define-all.hive | 3 +
resources/ddl/define-all.spark | 3 +
resources/ddl/define-udfs.td.hql | 1 +
7 files changed, 195 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
new file mode 100644
index 0000000..e4e5504
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import hivemall.utils.lang.StringUtils;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.io.Text;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import java.util.ArrayList;
+import java.util.List;
+
+@Description(name = "word_ngrams", value = "_FUNC_(array<string> words, int minSize, int maxSize)"
+ + " - Returns list of n-grams for given words, where `minSize <= n <= maxSize`")
+@UDFType(deterministic = true, stateful = false)
+public final class WordNgramsUDF extends UDF {
+ /** Validates the sizes and returns all n-grams of {@code words}; a NULL array yields NULL. */
+ @Nullable
+ public List<Text> evaluate(@Nullable final List<Text> words, final int minSize,
+ final int maxSize) throws HiveException {
+ if (words == null) {
+ return null;
+ }
+ if (minSize <= 0) {
+ throw new UDFArgumentException("`minSize` must be greater than zero: " + minSize);
+ }
+ if (minSize > maxSize) {
+ throw new UDFArgumentException("`maxSize` must be greater than or equal to `minSize`: "
+ + maxSize);
+ }
+ return getNgrams(words, minSize, maxSize);
+ }
+ /** Joins each run of n consecutive words (minSize <= n <= maxSize) with one space, ordered by start index. */
+ @Nonnull
+ private static List<Text> getNgrams(@Nonnull final List<Text> words,
+ @Nonnegative final int minSize, @Nonnegative final int maxSize) throws HiveException {
+ final List<Text> ngrams = new ArrayList<Text>();
+ final StringBuilder ngram = new StringBuilder();
+ // i is the start index of the n-gram; for each start, sizes are tried in ascending order
+ for (int i = 0, numWords = words.size(); i < numWords; i++) {
+ for (int ngramSize = minSize; ngramSize <= maxSize; ngramSize++) {
+ if (ngramSize > numWords - i) { // window would pass the last word; larger sizes cannot fit either
+ break; // stopping here also keeps `i + ngramSize` and `ngramSize++` from overflowing for huge sizes
+ }
+ final int end = i + ngramSize; // exclusive end index, guaranteed <= numWords by the check above
+ // reuse one buffer for every n-gram (StringUtils.clear is expected to empty it)
+ StringUtils.clear(ngram);
+ for (int j = i; j < end; j++) {
+ final Text word = words.get(j);
+ if (word == null) {
+ throw new UDFArgumentException(
+ "`array<string> words` must not contain NULL element");
+ }
+ if (j > i) { // insert single whitespace between elements
+ ngram.append(" ");
+ }
+ ngram.append(word.toString());
+ }
+ ngrams.add(new Text(ngram.toString()));
+ }
+ }
+
+ return ngrams;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
new file mode 100644
index 0000000..9b15e68
--- /dev/null
+++ b/core/src/test/java/hivemall/tools/text/WordNgramsUDFTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.Text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class WordNgramsUDFTest {
+ // Unit tests for WordNgramsUDF: n-gram output (content and order) plus argument validation.
+ private WordNgramsUDF udf;
+
+ @Before
+ public void setUp() {
+ this.udf = new WordNgramsUDF();
+ }
+
+ @Test
+ public void testBigram() throws HiveException {
+ final List<Text> words = new ArrayList<Text>();
+ words.add(new Text("machine"));
+ words.add(new Text("learning"));
+
+ final List<Text> ngrams = udf.evaluate(words, 2, 2);
+ // assertEquals reports the actual value on failure, unlike assertTrue(a == b)
+ Assert.assertEquals(1, ngrams.size());
+ Assert.assertEquals(new Text("machine learning"), ngrams.get(0));
+ }
+
+ @Test
+ public void testUniBigram() throws HiveException {
+ final List<Text> words = new ArrayList<Text>();
+ words.add(new Text("machine"));
+ words.add(new Text("learning"));
+
+ final List<Text> ngrams = udf.evaluate(words, 1, 2);
+ // n-grams are emitted per start position, shorter sizes first
+ Assert.assertEquals(3, ngrams.size());
+ Assert.assertEquals(new Text("machine"), ngrams.get(0));
+ Assert.assertEquals(new Text("machine learning"), ngrams.get(1));
+ Assert.assertEquals(new Text("learning"), ngrams.get(2));
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testWordsWithNull() throws HiveException {
+ final List<Text> words = new ArrayList<Text>();
+ words.add(new Text("machine"));
+ words.add(null);
+ words.add(new Text("learning"));
+ // a NULL element inside the array must be rejected
+ udf.evaluate(words, 1, 2);
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testInvalidMinSize() throws HiveException {
+ udf.evaluate(new ArrayList<Text>(), 0, 2); // minSize must be greater than zero
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testInvalidMaxSize() throws HiveException {
+ udf.evaluate(new ArrayList<Text>(), 2, 1); // maxSize must not be smaller than minSize
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/docs/gitbook/misc/generic_funcs.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md
index 9775439..b6c7c62 100644
--- a/docs/gitbook/misc/generic_funcs.md
+++ b/docs/gitbook/misc/generic_funcs.md
@@ -257,6 +257,14 @@ The compression level must be in range [-1,9]
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal"," モード"]
```
+- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns a list of n-grams of the given words, where `minSize <= n <= maxSize`
+
+ ```sql
+ select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);
+
+ > ["machine","machine learning","learning","learning is","is","is fun","fun"]
+ ```
+
# Other functions
- `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index d2f0b9f..7906375 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -553,6 +553,9 @@ CREATE FUNCTION base91 as 'hivemall.tools.text.Base91UDF' USING JAR '${hivemall_
DROP FUNCTION IF EXISTS unbase91;
CREATE FUNCTION unbase91 as 'hivemall.tools.text.Unbase91UDF' USING JAR '${hivemall_jar}';
+DROP FUNCTION IF EXISTS word_ngrams;
+CREATE FUNCTION word_ngrams as 'hivemall.tools.text.WordNgramsUDF' USING JAR '${hivemall_jar}';
+
---------------------------------
-- Dataset generator functions --
---------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 0ef36c3..1b1a035 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -545,6 +545,9 @@ create temporary function base91 as 'hivemall.tools.text.Base91UDF';
drop temporary function if exists unbase91;
create temporary function unbase91 as 'hivemall.tools.text.Unbase91UDF';
+drop temporary function if exists word_ngrams;
+create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
+
---------------------------------
-- Dataset generator functions --
---------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 97307c2..7e6cacd 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -529,6 +529,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION base91 AS 'hivemall.tools.text.Base91U
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS unbase91")
sqlContext.sql("CREATE TEMPORARY FUNCTION unbase91 AS 'hivemall.tools.text.Unbase91UDF'")
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS word_ngrams")
+sqlContext.sql("CREATE TEMPORARY FUNCTION word_ngrams AS 'hivemall.tools.text.WordNgramsUDF'")
+
/**
* Dataset generator functions
*/
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7bb5d047/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index a281b72..4b67fea 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -182,6 +182,7 @@ create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedL
create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
create temporary function train_slim as 'hivemall.recommend.SlimUDTF';
create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF';
+create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';