You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/04/13 20:09:57 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-248] UDF for
Kuromoji stoptags
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 465cd9d [HIVEMALL-248] UDF for Kuromoji stoptags
465cd9d is described below
commit 465cd9dc04d2ce1675de7b119cb31e1ec96260ef
Author: Makoto Yui <my...@apache.org>
AuthorDate: Sun Apr 14 05:09:38 2019 +0900
[HIVEMALL-248] UDF for Kuromoji stoptags
## What changes were proposed in this pull request?
In tokenize_ja, users need to provide stoptags; tokens whose part-of-speech tag matches a stoptag are removed from the token stream. So, stoptags are an "exclusive" rule.
## What type of PR is it?
Feature
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-248
## How was this patch tested?
unit tests, functional test on EMR
## How to use this feature?
```sql
select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("kuromoji"), stoptags_exclude(array("名詞")));
```
> ["分かち書き","テスト"]
`stoptags_exclude(array<string> tags [, const string lang='ja'])` is a useful UDF for getting [stoptags](https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt) excluding given part-of-speech tags as seen below:
```sql
select stoptags_exclude(array("名詞-固有名詞"));
```
> ["その他","その他-間投","フィラー","副詞","副詞-一般","副詞-助詞類接続","助動詞","助詞","助詞-並立助詞"
,"助詞-係助詞","助詞-副助詞","助詞-副助詞/並立助詞/終助詞","助詞-副詞化","助詞-接続助詞","助詞-格助詞
","助詞-格助詞-一般","助詞-格助詞-引用","助詞-格助詞-連語","助詞-特殊","助詞-終助詞","助詞-連体化","助
詞-間投助詞","動詞","動詞-接尾","動詞-自立","動詞-非自立","名詞","名詞-サ変接続","名詞-ナイ形容詞語幹",
"名詞-一般","名詞-代名詞","名詞-代名詞-一般","名詞-代名詞-縮約","名詞-副詞可能","名詞-動詞非自立的","名
詞-引用文字列","名詞-形容動詞語幹","名詞-接尾","名詞-接尾-サ変接続","名詞-接尾-一般","名詞-接尾-人名","
名詞-接尾-副詞可能","名詞-接尾-助動詞語幹","名詞-接尾-助数詞","名詞-接尾-地域","名詞-接尾-形容動詞語幹"
,"名詞-接尾-特殊","名詞-接続詞的","名詞-数","名詞-特殊","名詞-特殊-助動詞語幹","名詞-非自立","名詞-非自
立-一般","名詞-非自立-副詞可能","名詞-非自立-助動詞語幹","名詞-非自立-形容動詞語幹","形容詞","形容詞-接
尾","形容詞-自立","形容詞-非自立","感動詞","接続詞","接頭詞","接頭詞-動詞接続","接頭詞-名詞接続","接頭
詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <my...@apache.org>
Closes #189 from myui/HIVEMALL-248.
---
.../main/java/hivemall/utils/lang/ArrayUtils.java | 6 +-
.../test/java/hivemall/utils/ArrayUtilsTest.java | 15 +-
docs/gitbook/misc/tokenizer.md | 24 ++
.../hivemall/nlp/tokenizer/StoptagsExcludeUDF.java | 171 +++++++++++++
.../nlp/tokenizer/StoptagsExcludeUDFTest.java | 277 +++++++++++++++++++++
resources/ddl/define-additional.hive | 3 +
6 files changed, 490 insertions(+), 6 deletions(-)
diff --git a/core/src/main/java/hivemall/utils/lang/ArrayUtils.java b/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
index 1bce603..5df63d9 100644
--- a/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
@@ -756,11 +756,7 @@ public final class ArrayUtils {
*/
@Nonnull
public static List<String> asKryoSerializableList(@Nonnull final String[] array) {
- final List<String> list = new ArrayList<>(array.length);
- for (String e : array) {
- list.add(e);
- }
- return list;
+ return new ArrayList<>(Arrays.asList(array));
}
}
diff --git a/core/src/test/java/hivemall/utils/ArrayUtilsTest.java b/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
index 8987fd8..3f81c39 100644
--- a/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
+++ b/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
@@ -20,10 +20,13 @@ package hivemall.utils;
import hivemall.utils.lang.ArrayUtils;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
import java.util.Random;
-import org.junit.Test;
import org.junit.Assert;
+import org.junit.Test;
public class ArrayUtilsTest {
@@ -39,4 +42,14 @@ public class ArrayUtilsTest {
}
}
+ @Test
+ public void asKryoSerializableListTest() {
+ String[] array = new String[] {"1, 2, 3", "4, 5, 6", "7, 8, 9", "10, 11, 12"};
+ List<String> actual = ArrayUtils.asKryoSerializableList(array);
+
+ Assert.assertEquals(Arrays.asList(array), actual); // same elements in the same order
+
+ Assert.assertEquals(ArrayList.class, actual.getClass()); // concrete type must be java.util.ArrayList, not the Arrays.asList view
+ }
+
}
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 016830c..e578198 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -63,6 +63,30 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。", "
> ["を","使う","た","の","テスト","です"]
+```sql
+select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("kuromoji"), stoptags_exclude(array("名詞")));
+```
+> ["分かち書き","テスト"]
+
+`stoptags_exclude(array<string> tags [, const string lang='ja'])` is a useful UDF for getting [stoptags](https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt) excluding given part-of-speech tags as seen below:
+
+
+```sql
+select stoptags_exclude(array("名詞-固有名詞"));
+```
+> ["その他","その他-間投","フィラー","副詞","副詞-一般","副詞-助詞類接続","助動詞","助詞","助詞-並立助詞"
+,"助詞-係助詞","助詞-副助詞","助詞-副助詞/並立助詞/終助詞","助詞-副詞化","助詞-接続助詞","助詞-格助詞
+","助詞-格助詞-一般","助詞-格助詞-引用","助詞-格助詞-連語","助詞-特殊","助詞-終助詞","助詞-連体化","助
+詞-間投助詞","動詞","動詞-接尾","動詞-自立","動詞-非自立","名詞","名詞-サ変接続","名詞-ナイ形容詞語幹",
+"名詞-一般","名詞-代名詞","名詞-代名詞-一般","名詞-代名詞-縮約","名詞-副詞可能","名詞-動詞非自立的","名
+詞-引用文字列","名詞-形容動詞語幹","名詞-接尾","名詞-接尾-サ変接続","名詞-接尾-一般","名詞-接尾-人名","
+名詞-接尾-副詞可能","名詞-接尾-助動詞語幹","名詞-接尾-助数詞","名詞-接尾-地域","名詞-接尾-形容動詞語幹"
+,"名詞-接尾-特殊","名詞-接続詞的","名詞-数","名詞-特殊","名詞-特殊-助動詞語幹","名詞-非自立","名詞-非自
+立-一般","名詞-非自立-副詞可能","名詞-非自立-助動詞語幹","名詞-非自立-形容動詞語幹","形容詞","形容詞-接
+尾","形容詞-自立","形容詞-非自立","感動詞","接続詞","接頭詞","接頭詞-動詞接続","接頭詞-名詞接続","接頭
+詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
+","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
+
Moreover, the fifth argument `userDict` enables you to register a user-defined custom dictionary in [Kuromoji official format](https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt):
```sql
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java
new file mode 100644
index 0000000..1be92b0
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.annotations.VisibleForTesting;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.ArrayUtils;
+import hivemall.utils.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+@Description(name = "stoptags_exclude",
+ value = "_FUNC_(array<string> excludeTags, [, const string lang='ja']) - Returns stoptags excluding given tags",
+ extended = "SELECT stoptags_exclude(array('名詞-固有名詞', '形容詞'))")
+@UDFType(deterministic = true, stateful = false)
+public final class StoptagsExcludeUDF extends GenericUDF {
+
+ static final String[] STOPTAGS_JA;
+ static {
+ STOPTAGS_JA = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞",
+ "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用", "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞",
+ "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞", "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊",
+ "助動詞", "感動詞", "記号", "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉",
+ "記号-アルファベット", "その他", "その他-間投", "フィラー", "非言語音", "語断片", "未知語"};
+ Arrays.sort(STOPTAGS_JA);
+ }
+
+ private ListObjectInspector tagsOI;
+ private String[] stopTags;
+
+ @Nullable
+ private List<String> result;
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { // validates the arguments and, for a constant tag list, precomputes the result
+ if (argOIs.length != 1 && argOIs.length != 2) {
+ throw new UDFArgumentException(
+ "stoptags_exclude(array<string> tags, [, const string lang='ja']) takes one or two arguments: "
+ + argOIs.length);
+ }
+
+ if (!HiveUtils.isStringListOI(argOIs[0])) {
+ throw new UDFArgumentException(
+ "stoptags_exclude(array<string> tags, [, const string lang='ja']) expects array<string> for the first argument : "
+ + argOIs[0].getTypeName());
+ }
+ this.tagsOI = HiveUtils.asListOI(argOIs[0]);
+
+ if (argOIs.length == 2) {
+ if (!HiveUtils.isConstString(argOIs[1])) {
+ throw new UDFArgumentException(
+ "stoptags_exclude(array<string> tags, [, const string lang='ja']) expects const string for the second argument: "
+ + argOIs[1].getTypeName());
+ }
+ String lang = HiveUtils.getConstString(argOIs[1]);
+ if (!"ja".equalsIgnoreCase(lang)) { // only "ja" (compared case-insensitively) is accepted
+ throw new UDFArgumentException("Unsupported lang: " + lang);
+ }
+ }
+ this.stopTags = STOPTAGS_JA; // the Japanese list is used whether or not lang was given
+
+ if (ObjectInspectorUtils.isConstantObjectInspector(tagsOI)) { // constant exclude list: compute once here so evaluate() can return the cached list
+ String[] excludeTags = HiveUtils.getConstStringArray(tagsOI);
+ this.result = getStoptags(stopTags, excludeTags);
+ }
+
+ return ObjectInspectorFactory.getStandardListObjectInspector( // output is a list of Java strings
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+ }
+
+ @Override
+ public List<String> evaluate(DeferredObject[] arguments) throws HiveException { // returns the stoptags that remain after removing the given tags
+ if (result != null) { // precomputed in initialize() when the exclude list is constant; the same list instance is returned on every call
+ return result;
+ }
+ Objects.requireNonNull(stopTags);
+
+ final String[] excludeTags = HiveUtils.asStringArray(arguments[0], tagsOI);
+ if (excludeTags == null) { // no exclude tags available (presumably a NULL argument - confirm against HiveUtils.asStringArray): return the full list
+ return ArrayUtils.asKryoSerializableList(stopTags);
+ } else {
+ return getStoptags(stopTags, excludeTags);
+ }
+ }
+
+ @Nonnull
+ @VisibleForTesting
+ static List<String> getStoptags(@Nonnull final String[] stopTags, // stopTags must be sorted (binary search below); it is not modified
+ @Nonnull final String[] excludeTags) { // each tag that exactly matches a stoptag is removed together with its "tag-..." sub-tags
+ final String[] mutableStopTags = stopTags.clone(); // work on a copy; removed entries are marked as null
+ for (String tag : excludeTags) {
+ final int index = Arrays.binarySearch(stopTags, tag); // exact-match lookup in the untouched sorted input
+ if (index < 0) { // tag is not itself a stoptag: nothing is removed, even if it is a textual prefix of one
+ continue;
+ }
+ // exact match at index; sweep forward over it and the sub-tags ("tag-...") that follow in sorted order
+ for (int i = index; i < mutableStopTags.length; i++) {
+ final String stopTag = mutableStopTags[i];
+ if (stopTag == null) { // already removed by an earlier exclude tag
+ continue;
+ }
+ if (stopTag.startsWith(tag)) {
+ final int tagLen = tag.length();
+ if (stopTag.length() > tagLen) {
+ final char c = stopTag.charAt(tagLen);
+ if (c != '-') { // shares the prefix but is not a "tag-..." sub-tag: keep it
+ continue;
+ }
+ }
+ mutableStopTags[i] = null;
+ } else { // first entry that no longer starts with tag: stop sweeping
+ break;
+ }
+ }
+ }
+ final List<String> result = new ArrayList<>(mutableStopTags.length); // collect the surviving (non-null) tags, preserving sorted order
+ for (String tag : mutableStopTags) {
+ if (tag != null) {
+ result.add(tag);
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public String getDisplayString(String[] children) { // renders the call as stoptags_exclude(arg1,arg2,...)
+ return "stoptags_exclude(" + StringUtils.join(children, ',') + ')';
+ }
+
+}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java
new file mode 100644
index 0000000..aacca2f
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.TestUtils;
+import hivemall.utils.hadoop.HiveUtils;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class StoptagsExcludeUDFTest {
+
+ @Test
+ public void testGetStoptagsJA() {
+ List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+ new String[] {"形容詞"});
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ "フィラー", "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+ }
+
+ @Test
+ public void testGetStoptagsJA2() {
+ List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+ new String[] {"形容詞", "フィラー"});
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+ }
+
+ @Test
+ public void testGetStoptagsJa3() {
+ List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+ new String[] {"形容詞", "フィラー", "名詞-固有名詞", "名詞-数"});
+ String[] expected = new String[] {"名詞", "名詞-一般",
+ // "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ // "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ // "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国",
+ "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能", "名詞-サ変接続", "名詞-形容動詞語幹",
+ // "名詞-数",
+ "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能", "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊",
+ "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般", "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続",
+ "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能", "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的",
+ "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞", "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続",
+ "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+ }
+
+ @Test
+ public void testGetStoptagsJaContainsUnmatchedEntry() {
+ List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+ new String[] {"形容詞", "フィラー", "名詞-非"});
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+ }
+
+ @Test
+ public void testOneArgument() throws IOException, HiveException {
+ StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+ udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector)});
+
+ List<String> actual = udf.evaluate(new DeferredObject[] {new GenericUDF.DeferredJavaObject(
+ Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+
+ actual = udf.evaluate(new DeferredObject[] {
+ new GenericUDF.DeferredJavaObject(Arrays.asList(new Text("形容詞")))});
+ expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ "フィラー", "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+
+ udf.close();
+ }
+
+ @Test
+ public void testOneConstArgument() throws IOException, HiveException {
+ StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+ udf.initialize(
+ new ObjectInspector[] {ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+ Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+
+ List<String> actual1 = udf.evaluate(new DeferredObject[] {});
+
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual1);
+
+ List<String> actual2 = udf.evaluate(new DeferredObject[] {});
+ Assert.assertSame(actual2, actual1);
+
+ udf.close();
+ }
+
+ @Test
+ public void testTwoArguments() throws IOException, HiveException {
+ StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+ udf.initialize(new ObjectInspector[] {
+ ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector),
+ HiveUtils.getConstStringObjectInspector("ja")});
+
+ List<String> actual = udf.evaluate(new DeferredObject[] {new GenericUDF.DeferredJavaObject(
+ Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+ String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+ "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+ "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+ "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+ "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+ "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+ "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+ "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+ // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾",
+ "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+ "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+ "助詞-副助詞/並立助詞/終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+ "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+ //"フィラー",
+ "非言語音", "語断片", "未知語"};
+ Arrays.sort(expected);
+ Assert.assertEquals(Arrays.asList(expected), actual);
+
+ udf.close();
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testTwoArgumentsUnsupportedLang() throws IOException, HiveException {
+ StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+ udf.initialize(new ObjectInspector[] {
+ ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector),
+ HiveUtils.getConstStringObjectInspector("kr")});
+
+ udf.close();
+ }
+
+ @Test
+ public void testSerialization() throws IOException, HiveException {
+ StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+ udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector)});
+
+ // Kryo round trip right after initialize(); evaluate() is not called in this test
+ byte[] serialized = TestUtils.serializeObjectByKryo(udf);
+ TestUtils.deserializeObjectByKryo(serialized, StoptagsExcludeUDF.class);
+
+ udf.close();
+ }
+
+}
diff --git a/resources/ddl/define-additional.hive b/resources/ddl/define-additional.hive
index af5cf82..5815fb7 100644
--- a/resources/ddl/define-additional.hive
+++ b/resources/ddl/define-additional.hive
@@ -12,6 +12,9 @@ create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
drop temporary function if exists tokenize_cn;
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
+drop temporary function if exists stoptags_exclude;
+create temporary function stoptags_exclude as 'hivemall.nlp.tokenizer.StoptagsExcludeUDF';
+
------------------------------
-- XGBoost related features --
------------------------------