You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/04/13 20:09:57 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-248] UDF for Kuromoji stoptags

This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new 465cd9d  [HIVEMALL-248] UDF for Kuromoji stoptags
465cd9d is described below

commit 465cd9dc04d2ce1675de7b119cb31e1ec96260ef
Author: Makoto Yui <my...@apache.org>
AuthorDate: Sun Apr 14 05:09:38 2019 +0900

    [HIVEMALL-248] UDF for Kuromoji stoptags
    
    ## What changes were proposed in this pull request?
    
    In tokenize_ja, users need to provide stoptags; tokens whose part-of-speech tag matches a stoptag are removed from the token stream. So, a stoptag is an "exclusive" rule.
    
    ## What type of PR is it?
    
    Feature
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-248
    
    ## How was this patch tested?
    
    unit tests, functional test on EMR
    
    ## How to use this feature?
    
    ```sql
    select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("kuromoji"), stoptags_exclude(array("名詞")));
    ```
    > ["分かち書き","テスト"]
    
    `stoptags_exclude(array<string> tags [, const string lang='ja'])` is a useful UDF for getting [stoptags](https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt) excluding given part-of-speech tags as seen below:
    
    ```sql
    select stoptags_exclude(array("名詞-固有名詞"));
    ```
    > ["その他","その他-間投","フィラー","副詞","副詞-一般","副詞-助詞類接続","助動詞","助詞","助詞-並立助詞"
    ,"助詞-係助詞","助詞-副助詞","助詞-副助詞／並立助詞／終助詞","助詞-副詞化","助詞-接続助詞","助詞-格助詞
    ","助詞-格助詞-一般","助詞-格助詞-引用","助詞-格助詞-連語","助詞-特殊","助詞-終助詞","助詞-連体化","助
    詞-間投助詞","動詞","動詞-接尾","動詞-自立","動詞-非自立","名詞","名詞-サ変接続","名詞-ナイ形容詞語幹",
    "名詞-一般","名詞-代名詞","名詞-代名詞-一般","名詞-代名詞-縮約","名詞-副詞可能","名詞-動詞非自立的","名
    詞-引用文字列","名詞-形容動詞語幹","名詞-接尾","名詞-接尾-サ変接続","名詞-接尾-一般","名詞-接尾-人名","
    名詞-接尾-副詞可能","名詞-接尾-助動詞語幹","名詞-接尾-助数詞","名詞-接尾-地域","名詞-接尾-形容動詞語幹"
    ,"名詞-接尾-特殊","名詞-接続詞的","名詞-数","名詞-特殊","名詞-特殊-助動詞語幹","名詞-非自立","名詞-非自
    立-一般","名詞-非自立-副詞可能","名詞-非自立-助動詞語幹","名詞-非自立-形容動詞語幹","形容詞","形容詞-接
    尾","形容詞-自立","形容詞-非自立","感動詞","接続詞","接頭詞","接頭詞-動詞接続","接頭詞-名詞接続","接頭
    詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
    ","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
    
    ## Checklist
    
    - [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
    - [x] Did you run system tests on Hive (or Spark)?
    
    Author: Makoto Yui <my...@apache.org>
    
    Closes #189 from myui/HIVEMALL-248.
---
 .../main/java/hivemall/utils/lang/ArrayUtils.java  |   6 +-
 .../test/java/hivemall/utils/ArrayUtilsTest.java   |  15 +-
 docs/gitbook/misc/tokenizer.md                     |  24 ++
 .../hivemall/nlp/tokenizer/StoptagsExcludeUDF.java | 171 +++++++++++++
 .../nlp/tokenizer/StoptagsExcludeUDFTest.java      | 277 +++++++++++++++++++++
 resources/ddl/define-additional.hive               |   3 +
 6 files changed, 490 insertions(+), 6 deletions(-)

diff --git a/core/src/main/java/hivemall/utils/lang/ArrayUtils.java b/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
index 1bce603..5df63d9 100644
--- a/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/ArrayUtils.java
@@ -756,11 +756,7 @@ public final class ArrayUtils {
      */
     @Nonnull
     public static List<String> asKryoSerializableList(@Nonnull final String[] array) {
-        final List<String> list = new ArrayList<>(array.length);
-        for (String e : array) {
-            list.add(e);
-        }
-        return list;
+        return new ArrayList<>(Arrays.asList(array));
     }
 
 }
diff --git a/core/src/test/java/hivemall/utils/ArrayUtilsTest.java b/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
index 8987fd8..3f81c39 100644
--- a/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
+++ b/core/src/test/java/hivemall/utils/ArrayUtilsTest.java
@@ -20,10 +20,13 @@ package hivemall.utils;
 
 import hivemall.utils.lang.ArrayUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Random;
 
-import org.junit.Test;
 import org.junit.Assert;
+import org.junit.Test;
 
 public class ArrayUtilsTest {
 
@@ -39,4 +42,14 @@ public class ArrayUtilsTest {
         }
     }
 
+    @Test
+    public void asKryoSerializableListTest() { // checks both the contents and the concrete class of the returned list
+        String[] array = new String[] {"1, 2, 3", "4, 5, 6", "7, 8, 9", "10, 11, 12"};
+        List<String> actual = ArrayUtils.asKryoSerializableList(array);
+
+        Assert.assertEquals(Arrays.asList(array), actual); // same elements in the same order as the input array
+
+        Assert.assertEquals(ArrayList.class, actual.getClass()); // exactly java.util.ArrayList, not the list class produced by Arrays.asList
+    }
+
 }
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 016830c..e578198 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -63,6 +63,30 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。", "
 
 > ["を","使う","た","の","テスト","です"]
 
+```sql
+select tokenize_ja("kuromojiを使った分かち書きのテストです。", "normal", array("kuromoji"), stoptags_exclude(array("名詞")));
+```
+> ["分かち書き","テスト"]
+
+`stoptags_exclude(array<string> tags [, const string lang='ja'])` is a useful UDF for getting [stoptags](https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt) excluding given part-of-speech tags as seen below:
+
+
+```sql
+select stoptags_exclude(array("名詞-固有名詞"));
+```
+> ["その他","その他-間投","フィラー","副詞","副詞-一般","副詞-助詞類接続","助動詞","助詞","助詞-並立助詞"
+,"助詞-係助詞","助詞-副助詞","助詞-副助詞／並立助詞／終助詞","助詞-副詞化","助詞-接続助詞","助詞-格助詞
+","助詞-格助詞-一般","助詞-格助詞-引用","助詞-格助詞-連語","助詞-特殊","助詞-終助詞","助詞-連体化","助
+詞-間投助詞","動詞","動詞-接尾","動詞-自立","動詞-非自立","名詞","名詞-サ変接続","名詞-ナイ形容詞語幹",
+"名詞-一般","名詞-代名詞","名詞-代名詞-一般","名詞-代名詞-縮約","名詞-副詞可能","名詞-動詞非自立的","名
+詞-引用文字列","名詞-形容動詞語幹","名詞-接尾","名詞-接尾-サ変接続","名詞-接尾-一般","名詞-接尾-人名","
+名詞-接尾-副詞可能","名詞-接尾-助動詞語幹","名詞-接尾-助数詞","名詞-接尾-地域","名詞-接尾-形容動詞語幹"
+,"名詞-接尾-特殊","名詞-接続詞的","名詞-数","名詞-特殊","名詞-特殊-助動詞語幹","名詞-非自立","名詞-非自
+立-一般","名詞-非自立-副詞可能","名詞-非自立-助動詞語幹","名詞-非自立-形容動詞語幹","形容詞","形容詞-接
+尾","形容詞-自立","形容詞-非自立","感動詞","接続詞","接頭詞","接頭詞-動詞接続","接頭詞-名詞接続","接頭
+詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
+","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
+
 Moreover, the fifth argument `userDict` enables you to register a user-defined custom dictionary in [Kuromoji official format](https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt):
 
 ```sql
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java
new file mode 100644
index 0000000..1be92b0
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/StoptagsExcludeUDF.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.annotations.VisibleForTesting;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.ArrayUtils;
+import hivemall.utils.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+@Description(name = "stoptags_exclude",
+        value = "_FUNC_(array<string> excludeTags, [, const string lang='ja']) - Returns stoptags excluding given tags",
+        extended = "SELECT stoptags_exclude(array('名詞-固有名詞', '形容詞'))")
+@UDFType(deterministic = true, stateful = false)
+public final class StoptagsExcludeUDF extends GenericUDF { // UDF returning the Japanese stoptags minus the given tags and their hyphen sub-tags
+
+    static final String[] STOPTAGS_JA; // Japanese stoptags; sorted by the static initializer below
+    static {
+        STOPTAGS_JA = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞",
+                "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用", "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞",
+                "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞", "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊",
+                "助動詞", "感動詞", "記号", "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉",
+                "記号-アルファベット", "その他", "その他-間投", "フィラー", "非言語音", "語断片", "未知語"};
+        Arrays.sort(STOPTAGS_JA); // getStoptags runs Arrays.binarySearch over this array, which needs sorted input
+    }
+
+    private ListObjectInspector tagsOI; // inspector of the first argument; assigned in initialize
+    private String[] stopTags; // stoptag list chosen in initialize; currently always STOPTAGS_JA
+
+    @Nullable
+    private List<String> result; // precomputed in initialize when the tags argument is a constant; otherwise left null
+
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { // validates the arguments and returns a list-of-java-String inspector
+        if (argOIs.length != 1 && argOIs.length != 2) {
+            throw new UDFArgumentException(
+                "stoptags_exclude(array<string> tags, [, const string lang='ja']) takes one or two arguments: "
+                        + argOIs.length);
+        }
+
+        if (!HiveUtils.isStringListOI(argOIs[0])) {
+            throw new UDFArgumentException(
+                "stoptags_exclude(array<string> tags, [, const string lang='ja']) expects array<string> for the first argument : "
+                        + argOIs[0].getTypeName());
+        }
+        this.tagsOI = HiveUtils.asListOI(argOIs[0]);
+
+        if (argOIs.length == 2) {
+            if (!HiveUtils.isConstString(argOIs[1])) {
+                throw new UDFArgumentException(
+                    "stoptags_exclude(array<string> tags, [, const string lang='ja']) expects const string for the second argument: "
+                            + argOIs[1].getTypeName());
+            }
+            String lang = HiveUtils.getConstString(argOIs[1]);
+            if (!"ja".equalsIgnoreCase(lang)) { // ja (any letter case) is the only language accepted here
+                throw new UDFArgumentException("Unsupported lang: " + lang);
+            }
+        }
+        this.stopTags = STOPTAGS_JA; // unconditional because no other language passes the check above
+
+        if (ObjectInspectorUtils.isConstantObjectInspector(tagsOI)) { // constant tags: compute the answer once and let evaluate reuse it
+            String[] excludeTags = HiveUtils.getConstStringArray(tagsOI);
+            this.result = getStoptags(stopTags, excludeTags);
+        }
+
+        return ObjectInspectorFactory.getStandardListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+    }
+
+    @Override
+    public List<String> evaluate(DeferredObject[] arguments) throws HiveException { // returns the stoptags that remain after exclusion
+        if (result != null) { // cached by initialize for a constant tags argument; arguments is not read in this case
+            return result;
+        }
+        Objects.requireNonNull(stopTags);
+
+        final String[] excludeTags = HiveUtils.asStringArray(arguments[0], tagsOI);
+        if (excludeTags == null) { // nothing to exclude: hand back every stoptag as a list
+            return ArrayUtils.asKryoSerializableList(stopTags);
+        } else {
+            return getStoptags(stopTags, excludeTags);
+        }
+    }
+
+    @Nonnull
+    @VisibleForTesting
+    static List<String> getStoptags(@Nonnull final String[] stopTags, // must be sorted, since it is binary-searched below
+            @Nonnull final String[] excludeTags) {
+        final String[] mutableStopTags = stopTags.clone(); // removals are marked on a copy so the input array is never modified
+        for (String tag : excludeTags) {
+            final int index = Arrays.binarySearch(stopTags, tag); // NOTE(review): a null element in excludeTags would throw here - confirm callers never pass one
+            if (index < 0) { // tag is not an exact entry of stopTags, so it is ignored
+                continue;
+            }
+            // exact match at index: null out that entry and the hyphen-separated sub-tags that follow it
+            for (int i = index; i < mutableStopTags.length; i++) {
+                final String stopTag = mutableStopTags[i];
+                if (stopTag == null) { // already removed by an earlier exclude tag
+                    continue;
+                }
+                if (stopTag.startsWith(tag)) {
+                    final int tagLen = tag.length();
+                    if (stopTag.length() > tagLen) {
+                        final char c = stopTag.charAt(tagLen);
+                        if (c != '-') { // shares the prefix but is not a hyphen sub-tag, so it is kept
+                            continue;
+                        }
+                    }
+                    mutableStopTags[i] = null;
+                } else { // first entry that does not start with tag ends the scan
+                    break;
+                }
+            }
+        }
+        final List<String> result = new ArrayList<>(mutableStopTags.length);
+        for (String tag : mutableStopTags) { // surviving entries keep the order of stopTags
+            if (tag != null) {
+                result.add(tag);
+            }
+        }
+        return result;
+    }
+
+    @Override
+    public String getDisplayString(String[] children) {
+        return "stoptags_exclude(" + StringUtils.join(children, ',') + ')';
+    }
+
+}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java
new file mode 100644
index 0000000..aacca2f
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/StoptagsExcludeUDFTest.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.TestUtils;
+import hivemall.utils.hadoop.HiveUtils;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class StoptagsExcludeUDFTest { // tests for StoptagsExcludeUDF.getStoptags and for the initialize/evaluate paths of the UDF
+
+    @Test
+    public void testGetStoptagsJA() { // excluding one top-level tag also removes its hyphen sub-tags
+        List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+            new String[] {"形容詞"});
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                "フィラー", "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected); // getStoptags keeps the sorted order of STOPTAGS_JA, so the expectation is sorted as well
+        Assert.assertEquals(Arrays.asList(expected), actual);
+    }
+
+    @Test
+    public void testGetStoptagsJA2() { // two exclude tags, one of which has no sub-tags
+        List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+            new String[] {"形容詞", "フィラー"});
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+    }
+
+    @Test
+    public void testGetStoptagsJa3() { // excluding second-level tags removes only that subtree and keeps the parent tag
+        List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+            new String[] {"形容詞", "フィラー", "名詞-固有名詞", "名詞-数"});
+        String[] expected = new String[] {"名詞", "名詞-一般",
+                // "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                // "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                // "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", 
+                "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能", "名詞-サ変接続", "名詞-形容動詞語幹",
+                // "名詞-数",                 
+                "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能", "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊",
+                "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般", "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続",
+                "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能", "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的",
+                "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞", "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続",
+                "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+    }
+
+    @Test
+    public void testGetStoptagsJaContainsUnmatchedEntry() { // an exclude tag that is only a partial prefix of real tags removes nothing
+        List<String> actual = StoptagsExcludeUDF.getStoptags(StoptagsExcludeUDF.STOPTAGS_JA,
+            new String[] {"形容詞", "フィラー", "名詞-非"});
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+    }
+
+    @Test
+    public void testOneArgument() throws IOException, HiveException { // non-constant tags: every evaluate call recomputes from its own argument
+        StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+        udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(
+            PrimitiveObjectInspectorFactory.writableStringObjectInspector)});
+
+        List<String> actual = udf.evaluate(new DeferredObject[] {new GenericUDF.DeferredJavaObject(
+            Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+
+        actual = udf.evaluate(new DeferredObject[] { // second call with a different tag list on the same UDF instance
+                new GenericUDF.DeferredJavaObject(Arrays.asList(new Text("形容詞")))});
+        expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                "フィラー", "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+
+        udf.close();
+    }
+
+    @Test
+    public void testOneConstArgument() throws IOException, HiveException { // constant tags: the result is computed in initialize and reused
+        StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+        udf.initialize(
+            new ObjectInspector[] {ObjectInspectorFactory.getStandardConstantListObjectInspector(
+                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+                Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+
+        List<String> actual1 = udf.evaluate(new DeferredObject[] {}); // empty argument array is fine: the cached result is returned before arguments is read
+
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual1);
+
+        List<String> actual2 = udf.evaluate(new DeferredObject[] {});
+        Assert.assertSame(actual2, actual1); // the very same cached list instance comes back on every call
+
+        udf.close();
+    }
+
+    @Test
+    public void testTwoArguments() throws IOException, HiveException { // explicit lang argument ja behaves like the one-argument form
+        StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+        udf.initialize(new ObjectInspector[] {
+                ObjectInspectorFactory.getStandardListObjectInspector(
+                    PrimitiveObjectInspectorFactory.writableStringObjectInspector),
+                HiveUtils.getConstStringObjectInspector("ja")});
+
+        List<String> actual = udf.evaluate(new DeferredObject[] {new GenericUDF.DeferredJavaObject(
+            Arrays.asList(new Text("形容詞"), new Text("フィラー")))});
+        String[] expected = new String[] {"名詞", "名詞-一般", "名詞-固有名詞", "名詞-固有名詞-一般", "名詞-固有名詞-人名",
+                "名詞-固有名詞-人名-一般", "名詞-固有名詞-人名-姓", "名詞-固有名詞-人名-名", "名詞-固有名詞-組織", "名詞-固有名詞-地域",
+                "名詞-固有名詞-地域-一般", "名詞-固有名詞-地域-国", "名詞-代名詞", "名詞-代名詞-一般", "名詞-代名詞-縮約", "名詞-副詞可能",
+                "名詞-サ変接続", "名詞-形容動詞語幹", "名詞-数", "名詞-非自立", "名詞-非自立-一般", "名詞-非自立-副詞可能",
+                "名詞-非自立-助動詞語幹", "名詞-非自立-形容動詞語幹", "名詞-特殊", "名詞-特殊-助動詞語幹", "名詞-接尾", "名詞-接尾-一般",
+                "名詞-接尾-人名", "名詞-接尾-地域", "名詞-接尾-サ変接続", "名詞-接尾-助動詞語幹", "名詞-接尾-形容動詞語幹", "名詞-接尾-副詞可能",
+                "名詞-接尾-助数詞", "名詞-接尾-特殊", "名詞-接続詞的", "名詞-動詞非自立的", "名詞-引用文字列", "名詞-ナイ形容詞語幹", "接頭詞",
+                "接頭詞-名詞接続", "接頭詞-動詞接続", "接頭詞-形容詞接続", "接頭詞-数接", "動詞", "動詞-自立", "動詞-非自立", "動詞-接尾",
+                // "形容詞", "形容詞-自立", "形容詞-非自立", "形容詞-接尾", 
+                "副詞", "副詞-一般", "副詞-助詞類接続", "連体詞", "接続詞", "助詞", "助詞-格助詞", "助詞-格助詞-一般", "助詞-格助詞-引用",
+                "助詞-格助詞-連語", "助詞-接続助詞", "助詞-係助詞", "助詞-副助詞", "助詞-間投助詞", "助詞-並立助詞", "助詞-終助詞",
+                "助詞-副助詞／並立助詞／終助詞", "助詞-連体化", "助詞-副詞化", "助詞-特殊", "助動詞", "感動詞", "記号", "記号-一般",
+                "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット", "その他", "その他-間投",
+                //"フィラー", 
+                "非言語音", "語断片", "未知語"};
+        Arrays.sort(expected);
+        Assert.assertEquals(Arrays.asList(expected), actual);
+
+        udf.close();
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testTwoArgumentsUnsupportedLang() throws IOException, HiveException { // any lang other than ja must be rejected by initialize
+        StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+        udf.initialize(new ObjectInspector[] {
+                ObjectInspectorFactory.getStandardListObjectInspector(
+                    PrimitiveObjectInspectorFactory.writableStringObjectInspector),
+                HiveUtils.getConstStringObjectInspector("kr")});
+
+        udf.close();
+    }
+
+    @Test
+    public void testSerialization() throws IOException, HiveException { // Kryo round trip of an initialized UDF must not throw
+        StoptagsExcludeUDF udf = new StoptagsExcludeUDF();
+
+        udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(
+            PrimitiveObjectInspectorFactory.writableStringObjectInspector)});
+
+        // serialization after initialization (evaluate has not been called at this point)
+        byte[] serialized = TestUtils.serializeObjectByKryo(udf);
+        TestUtils.deserializeObjectByKryo(serialized, StoptagsExcludeUDF.class);
+
+        udf.close();
+    }
+
+}
diff --git a/resources/ddl/define-additional.hive b/resources/ddl/define-additional.hive
index af5cf82..5815fb7 100644
--- a/resources/ddl/define-additional.hive
+++ b/resources/ddl/define-additional.hive
@@ -12,6 +12,9 @@ create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
 drop temporary function if exists tokenize_cn;
 create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
 
+drop temporary function if exists stoptags_exclude;
+create temporary function stoptags_exclude as 'hivemall.nlp.tokenizer.StoptagsExcludeUDF';
+
 ------------------------------
 -- XGBoost related features --
 ------------------------------