Posted to commits@hivemall.apache.org by my...@apache.org on 2021/04/23 10:17:21 UTC

[incubator-hivemall] branch master updated: [HIVEMALL-309] Enhance tokenize_ko to support stopwords and external user dict

This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new 782c6e6  [HIVEMALL-309] Enhance tokenize_ko to support stopwords and external user dict
782c6e6 is described below

commit 782c6e6025d582ba7a5e49f76ab6b8848098319d
Author: Makoto Yui <my...@apache.org>
AuthorDate: Fri Apr 23 19:17:14 2021 +0900

    [HIVEMALL-309] Enhance tokenize_ko to support stopwords and external user dict
    
    ## What changes were proposed in this pull request?
    
    Enhance tokenize_ko to support stopwords and external user dict
    
    ## What type of PR is it?
    
    Improvement
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-309
    
    ## How was this patch tested?
    
    unit tests, manual tests on EMR
    
    ## How to use this feature?
    
    ```sql
    -- default stopwords (null), default stoptags (null), custom dict
    select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null, array('C++'));
    > ["나","c++","언어","프로그래밍","언어","사랑"]
    
    select tokenize_ko('나는 c++ 프로그래밍을 즐긴다.', '-mode discard', null, null, 'https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt');
    
    > ["나","c++","프로그래밍","즐기"]
    ```
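
    Stopwords and stoptags can also be tuned independently. For example (drawn from the updated tokenizer.md in this commit; `array('E')` filters only verbal endings):

    ```sql
    -- default stopwords (null), custom stoptags
    select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E'));
    > ["소설","무궁","화","꽃","이","피"]
    ```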
    
    ## Checklist
    
    - [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
    - [x] Did you run system tests on Hive (or Spark)?
    
    Author: Makoto Yui <my...@apache.org>
    
    Closes #238 from myui/korean-enhancement.
---
 LICENSE                                            |   8 +
 docs/gitbook/misc/funcs.md                         |   6 +-
 docs/gitbook/misc/tokenizer.md                     | 100 ++-
 .../hivemall/nlp/tokenizer/KuromojiNEologdUDF.java |  11 +-
 .../java/hivemall/nlp/tokenizer/KuromojiUDF.java   |  11 +-
 .../java/hivemall/nlp/tokenizer/SmartcnUDF.java    |   7 +-
 .../java/hivemall/nlp/tokenizer/TokenizeKoUDF.java | 274 +++++++--
 .../hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java | 118 ++++
 .../resources/META-INF/LICENSE-stopwords-ko.txt    |  21 +
 .../hivemall/nlp/tokenizer/ext/stopwords-ko.txt    | 680 +++++++++++++++++++++
 .../hivemall/nlp/tokenizer/KuromojiUDFTest.java    |  18 +
 .../hivemall/nlp/tokenizer/SmartcnUDFTest.java     |   6 +-
 .../hivemall/nlp/tokenizer/TokenizeKoUDFTest.java  | 251 +++++---
 .../nlp/tokenizer/ext/KoreanAnalyzerTest.java      |  88 +++
 .../hivemall/nlp/tokenizer/ext/userdict-ko.txt     |  11 +
 15 files changed, 1445 insertions(+), 165 deletions(-)

diff --git a/LICENSE b/LICENSE
index 5d99f40..b0940b8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -296,6 +296,14 @@ For details, see http://fontawesome.io/
 
        src/site/resources/LICENSE-font_awesome-css.txt
 
+This product bundles a collection of stopwords for the Korean language,
+which is licensed under the MIT license, specifically for the tokenize_ko UDF.
+For details, see https://github.com/stopwords-iso/stopwords-ko
+
+   You can find a copy of the License at
+
+        nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
+
 ---------------------------------------------------------------------------
  The SIL Open Font License (https://opensource.org/licenses/OFL-1.1)
 ---------------------------------------------------------------------------
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index b40019a..3470695 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -1059,7 +1059,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
 
 - `tokenize_cn(String line [, const list<string> stopWords])` - returns tokenized strings in array&lt;string&gt;
 
-- `tokenize_ja(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)`]) - returns tokenized strings in array&lt;string&gt;
+- `tokenize_ja(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)`]) - returns tokenized strings in array&lt;string&gt;
   ```sql
   select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
 
@@ -1067,7 +1067,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
 
   ```
 
-- `tokenize_ja_neologd(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)`]) - returns tokenized strings in array&lt;string&gt;
+- `tokenize_ja_neologd(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)`]) - returns tokenized strings in array&lt;string&gt;
   ```sql
   select tokenize_ja_neologd("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
 
@@ -1075,7 +1075,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
 
   ```
 
-- `tokenize_ko(String line [, const array<string> userDict, const string mode = "discard", const array<string> stopTags, boolean outputUnknownUnigrams])` - returns tokenized strings in array&lt;string&gt;
+- `tokenize_ko(String line [, const string mode = "discard" (or const string opts), const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)`]) - returns tokenized strings in array&lt;string&gt;
   ```sql
   select tokenize_ko("소설 무궁화꽃이 피었습니다.");
 
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 5ddc93e..3992e2c 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -105,6 +105,8 @@ select stoptags_exclude(array("名詞-固有名詞"));
 詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
 ","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
 
+### Custom dictionary
+
 Moreover, the fifth argument `userDict` enables you to register a user-defined custom dictionary in [Kuromoji official format](https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt):
 
 ```sql
@@ -136,8 +138,7 @@ select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, nu
 For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
 
 
-
-## Part-of-speech
+### Part-of-speech
 
 From Hivemall v0.6.0, the second argument can also accept the following option format:
 
@@ -196,12 +197,32 @@ Korean tokenizer internally uses [lucene-analyzers-nori](analyzers-nori: Korean M
 The signature of the UDF is as follows:
 
 ```sql
-tokenize_ko(String line [,
-            const array<string> userDict,
-            const string mode = "discard",
-            const array<string> stopTags,
-            boolean outputUnknownUnigrams
-           ]) - returns tokenized strings in array<string>
+tokenize_ko(
+       String line [, const string mode = "discard" (or const string opts),
+       const array<string> stopWords,
+       const array<string> stopTags,
+       const array<string> userDict (or const string userDictURL)]
+) - returns tokenized strings in array<string>
+```
+
+> #### Note
+> Instead of a mode name, the 2nd argument can take an option string starting with `-`.
+
+You can get usage as follows:
+
+```sql
+select tokenize_ko("", "-help");
+
+usage: tokenize_ko(String line [, const string mode = "discard" (or const
+       string opts), const array<string> stopWords, const array<string>
+       stopTags, const array<string> userDict (or const string
+       userDictURL)]) - returns tokenized strings in array<string> [-help]
+       [-mode <arg>] [-outputUnknownUnigrams]
+ -help                    Show function help
+ -mode <arg>              The tokenization mode. One of ['none', 'discard'
+                          (default), 'mixed']
+ -outputUnknownUnigrams   outputs unigrams for unknown words.
 ```
 
 > #### Note
@@ -214,24 +235,69 @@ See the following examples for the usage.
 select tokenize_ko();
 > 8.8.2
 
-select tokenize_ko("소설 무궁화꽃이 피었습니다.");
+select tokenize_ko('소설 무궁화꽃이 피었습니다.');
 > ["소설","무궁","화","꽃","피"]
 
-select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "mixed");
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode discard');
+> ["소설","무궁","화","꽃","피"]
+
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'mixed');
 > ["소설","무궁화","무궁","화","꽃","피"]
 
-select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "discard", array("E", "VV"));
-> ["소설","무궁","화","꽃","이"]
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode mixed');
+> ["소설","무궁화","무궁","화","꽃","피"]
 
-select tokenize_ko("Hello, world.", null, "none", array(), true);
-> ["h","e","l","l","o","w","o","r","l","d"]
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode none');
+> ["소설","무궁화","꽃","피"]
 
-select tokenize_ko("Hello, world.", null, "none", array(), false);
+select tokenize_ko('Hello, world.', '-mode none');
 > ["hello","world"]
 
-select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", null, "discard", array());
+select tokenize_ko('Hello, world.', '-mode none -outputUnknownUnigrams');
+> ["h","e","l","l","o","w","o","r","l","d"]
+
+-- default stopwords (null), custom stoptags
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E'));
+> ["소설","무궁","화","꽃","이","피"]
+
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E', 'VV'));
+> ["소설","무궁","화","꽃","이"]
+
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard');
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+-- no stopwords (empty array), no stoptags (empty array)
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), array());
 > ["나","는","c","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
 
-select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", array("C++"), "discard", array());
+-- default stopwords (null), default stoptags (null)
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard');
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null);
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+-- no stopwords (empty array), default stoptags (null)
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array());
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), null);
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+-- no stopwords (empty array), no stoptags (empty array), custom dict
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), array(), array('C++'));
 > ["나","는","c++","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
+
+-- default stopwords (null), default stoptags (null), custom dict
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null, array('C++'));
+> ["나","c++","언어","프로그래밍","언어","사랑"]
+```
+
+### Custom dictionary
+
+Moreover, the fifth argument `userDictURL` enables you to register a user-defined custom dictionary hosted on an http/https-accessible external site. The dictionary format follows [Lucene's userdict.txt](https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt).
+
+
+```sql
+select tokenize_ko('나는 c++ 프로그래밍을 즐긴다.', '-mode discard', null, null, 'https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt');
+
+> ["나","c++","프로그래밍","즐기"]
 ```
+
+> #### Note
+> The dictionary SHOULD be accessible through the http/https protocol. It SHOULD also be compressed using gzip with a `.gz` suffix because the maximum dictionary size is limited to 32MB and the read timeout is set to 60 sec. In addition, the connection must be established within 10 sec.
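+
+For example, a gzip-compressed dictionary can be given in the same way. Below is a sketch with a hypothetical URL; any http/https-reachable file in the Lucene userdict format works:
+
+```sql
+-- hypothetical URL: host your own gzip-compressed dictionary with a .gz suffix
+select tokenize_ko('나는 c++ 프로그래밍을 즐긴다.', '-mode discard', null, null,
+                   'https://example.com/dict/userdict-ko.txt.gz');
+```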
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index e7e4ace..b41d4dc 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -66,7 +66,7 @@ import org.apache.lucene.analysis.ja.neologd.tokenattributes.PartOfSpeechAttribu
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 @Description(name = "tokenize_ja_neologd",
-        value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
+        value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
                 + " - returns tokenized strings in array<string>",
         extended = "select tokenize_ja_neologd(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
                 + "\n"
@@ -274,13 +274,15 @@ public final class KuromojiNEologdUDF extends UDFWithOptions {
     @Nonnull
     private static CharArraySet stopWords(@Nullable final String[] array)
             throws UDFArgumentException {
+        CharArraySet stopWords = JapaneseAnalyzer.getDefaultStopSet();
         if (array == null) {
-            return JapaneseAnalyzer.getDefaultStopSet();
+            return stopWords;
         }
         if (array.length == 0) {
             return CharArraySet.EMPTY_SET;
         }
-        return new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+        stopWords.addAll(Arrays.asList(array));
+        return stopWords;
     }
 
     @Nonnull
@@ -313,6 +315,9 @@ public final class KuromojiNEologdUDF extends UDFWithOptions {
         if (userDictArray == null) {
             return null;
         }
+        if (userDictArray.length == 0) {
+            return null;
+        }
 
         final StringBuilder builder = new StringBuilder();
         for (String row : userDictArray) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 8f05782..07059b2 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -66,7 +66,7 @@ import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 @Description(name = "tokenize_ja",
-        value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
+        value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
                 + " - returns tokenized strings in array<string>",
         extended = "select tokenize_ja(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
                 + "\n"
@@ -274,13 +274,15 @@ public final class KuromojiUDF extends UDFWithOptions {
     @Nonnull
     private static CharArraySet stopWords(@Nullable final String[] array)
             throws UDFArgumentException {
+        CharArraySet stopWords = JapaneseAnalyzer.getDefaultStopSet();
         if (array == null) {
-            return JapaneseAnalyzer.getDefaultStopSet();
+            return stopWords;
         }
         if (array.length == 0) {
             return CharArraySet.EMPTY_SET;
         }
-        return new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+        stopWords.addAll(Arrays.asList(array));
+        return stopWords;
     }
 
     @Nonnull
@@ -313,6 +315,9 @@ public final class KuromojiUDF extends UDFWithOptions {
         if (userDictArray == null) {
             return null;
         }
+        if (userDictArray.length == 0) {
+            return null;
+        }
 
         final StringBuilder builder = new StringBuilder();
         for (String row : userDictArray) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
index 8bb5db9..aec8c3a 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -119,14 +119,15 @@ public final class SmartcnUDF extends GenericUDF {
     @Nonnull
     private static CharArraySet stopWords(@Nullable final String[] array)
             throws UDFArgumentException {
+        CharArraySet stopWords = SmartChineseAnalyzer.getDefaultStopSet();
         if (array == null) {
-            return SmartChineseAnalyzer.getDefaultStopSet();
+            return stopWords;
         }
         if (array.length == 0) {
             return CharArraySet.EMPTY_SET;
         }
-        CharArraySet results = new CharArraySet(Arrays.asList(array), true /* ignoreCase */);
-        return results;
+        stopWords.addAll(Arrays.asList(array));
+        return stopWords;
     }
 
     private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index 8c2a939..fb61633 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -18,12 +18,22 @@
  */
 package hivemall.nlp.tokenizer;
 
+import hivemall.UDFWithOptions;
+import hivemall.nlp.tokenizer.ext.KoreanAnalyzer;
 import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.HttpUtils;
 import hivemall.utils.io.IOUtils;
+import hivemall.utils.lang.ExceptionUtils;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.net.HttpURLConnection;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -36,18 +46,18 @@ import java.util.Set;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFType;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ko.KoreanAnalyzer;
-import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
 import org.apache.lucene.analysis.ko.KoreanTokenizer;
 import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
 import org.apache.lucene.analysis.ko.POS;
@@ -55,36 +65,83 @@ import org.apache.lucene.analysis.ko.dict.UserDictionary;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 @Description(name = "tokenize_ko",
-        value = "_FUNC_(String line [, const array<string> userDict, const string mode = \"discard\", const array<string> stopTags, boolean outputUnknownUnigrams])"
+        value = "_FUNC_(String line [, const string mode = \"discard\" (or const string opts), const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
                 + " - returns tokenized strings in array<string>",
         extended = "select tokenize_ko(\"소설 무궁화꽃이 피었습니다.\");\n" + "\n"
                 + "> [\"소설\",\"무궁\",\"화\",\"꽃\",\"피\"]\n")
 @UDFType(deterministic = true, stateful = false)
-public final class TokenizeKoUDF extends GenericUDF {
-
-    @Nullable
-    private UserDictionary userDict;
+public final class TokenizeKoUDF extends UDFWithOptions {
+    private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec
+    private static final int READ_TIMEOUT_MS = 60000; // 60 sec
+    private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB
 
     private DecompoundMode mode;
+    @Nullable
+    private String[] stopWordsArray;
     private Set<POS.Tag> stopTags;
-    private boolean outputUnknownUnigrams;
+    private boolean outputUnknownUnigrams = false;
+
+    @Nullable
+    private Object userDictObj; // String[] or String
 
     private transient KoreanAnalyzer analyzer;
 
     @Override
+    protected Options getOptions() {
+        Options opts = new Options();
+        opts.addOption("mode", true,
+            "The tokenization mode. One of ['node', 'discard' (default), 'mixed']");
+        opts.addOption("outputUnknownUnigrams", false, "outputs unigrams for unknown words.");
+        return opts;
+    }
+
+    @Override
+    protected CommandLine processOptions(String optionValue) throws UDFArgumentException {
+        CommandLine cl = parseOptions(optionValue);
+        if (cl.hasOption("mode")) {
+            String modeStr = cl.getOptionValue("mode");
+            this.mode = decompoundMode(modeStr);
+        }
+        this.outputUnknownUnigrams = cl.hasOption("outputUnknownUnigrams");
+        return cl;
+    }
+
+    @Override
     public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
         final int arglen = arguments.length;
-        if (arglen > 5) {
-            throw new UDFArgumentException(
-                "Invalid number of arguments for `tokenize_ko`: " + arglen);
+        if (arglen > 6) {
+            showHelp("Invalid number of arguments for `tokenize_ko`: " + arglen);
         }
 
-        this.userDict = (arglen >= 3) ? parseUserDict(arguments[1]) : null;
-        this.mode = (arglen >= 3) ? parseDecompoundMode(arguments[2])
-                : KoreanTokenizer.DEFAULT_DECOMPOUND;
-        this.stopTags = (arglen >= 4) ? parseStopTags(arguments[3])
-                : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
-        this.outputUnknownUnigrams = (arglen >= 5) && HiveUtils.getConstBoolean(arguments[4]);
+        this.mode = KoreanTokenizer.DEFAULT_DECOMPOUND;
+        if (arglen >= 2) {
+            String arg1 = HiveUtils.getConstString(arguments[1]);
+            if (arg1 != null) {
+                if (arg1.startsWith("-")) {
+                    processOptions(arg1);
+                } else {
+                    this.mode = decompoundMode(arg1);
+                }
+            }
+        }
+
+        if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) {
+            this.stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
+        }
+
+        this.stopTags =
+                (arglen >= 4) ? stopTags(arguments[3]) : KoreanAnalyzer.getDefaultStopTags();
+
+        if (arglen >= 5) {
+            if (HiveUtils.isConstListOI(arguments[4])) {
+                this.userDictObj = HiveUtils.getConstStringArray(arguments[4]);
+            } else if (HiveUtils.isConstString(arguments[4])) {
+                this.userDictObj = HiveUtils.getConstString(arguments[4]);
+            } else {
+                throw new UDFArgumentException(
+                    "User dictionary MUST be given as an array of constant string or constant string (URL)");
+            }
+        }
 
         this.analyzer = null;
 
@@ -106,7 +163,17 @@ public final class TokenizeKoUDF extends GenericUDF {
         }
 
         if (analyzer == null) {
-            this.analyzer = new KoreanAnalyzer(userDict, mode, stopTags, outputUnknownUnigrams);
+            CharArraySet stopWords = stopWords(stopWordsArray);
+
+            UserDictionary userDict = null;
+            if (userDictObj instanceof String[]) {
+                userDict = userDictionary((String[]) userDictObj);
+            } else if (userDictObj instanceof String) {
+                userDict = userDictionary((String) userDictObj);
+            }
+
+            this.analyzer =
+                    new KoreanAnalyzer(userDict, mode, stopWords, stopTags, outputUnknownUnigrams);
         }
 
         Object arg0 = arguments[0].get();
@@ -137,42 +204,9 @@ public final class TokenizeKoUDF extends GenericUDF {
         IOUtils.closeQuietly(analyzer);
     }
 
-    @Nullable
-    private static UserDictionary parseUserDict(@Nonnull final ObjectInspector oi)
-            throws UDFArgumentException {
-        if (HiveUtils.isVoidOI(oi)) {
-            return null;
-        }
-        final String[] array = HiveUtils.getConstStringArray(oi);
-        if (array == null) {
-            return null;
-        }
-        final int length = array.length;
-        if (length == 0) {
-            return null;
-        }
-        final StringBuilder builder = new StringBuilder();
-        for (int i = 0; i < length; i++) {
-            String row = array[i];
-            if (row != null) {
-                builder.append(row).append('\n');
-            }
-        }
-
-        final Reader reader = new StringReader(builder.toString());
-        try {
-            return UserDictionary.open(reader); // return null if empty
-        } catch (Throwable e) {
-            throw new UDFArgumentException(
-                "Failed to create user dictionary based on the given array<string>: "
-                        + builder.toString());
-        }
-    }
-
     @Nonnull
-    private static DecompoundMode parseDecompoundMode(@Nonnull final ObjectInspector oi)
+    private static DecompoundMode decompoundMode(@Nullable final String arg)
             throws UDFArgumentException {
-        String arg = HiveUtils.getConstString(oi);
         if (arg == null) {
             return KoreanTokenizer.DEFAULT_DECOMPOUND;
         }
@@ -191,14 +225,28 @@ public final class TokenizeKoUDF extends GenericUDF {
     }
 
     @Nonnull
-    private static Set<POS.Tag> parseStopTags(@Nonnull final ObjectInspector oi)
+    private static CharArraySet stopWords(@Nullable final String[] array)
+            throws UDFArgumentException {
+        final CharArraySet stopWords = KoreanAnalyzer.getDefaultStopSet();
+        if (array == null) {
+            return stopWords;
+        }
+        if (array.length == 0) {
+            return CharArraySet.EMPTY_SET;
+        }
+        stopWords.addAll(Arrays.asList(array));
+        return stopWords;
+    }
+
+    @Nonnull
+    private static Set<POS.Tag> stopTags(@Nonnull final ObjectInspector oi)
             throws UDFArgumentException {
         if (HiveUtils.isVoidOI(oi)) {
-            return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+            return KoreanAnalyzer.getDefaultStopTags();
         }
         final String[] array = HiveUtils.getConstStringArray(oi);
         if (array == null) {
-            return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+            return KoreanAnalyzer.getDefaultStopTags();
         }
         final int length = array.length;
         if (length == 0) {
@@ -219,6 +267,120 @@ public final class TokenizeKoUDF extends GenericUDF {
         return stopTags;
     }
 
+
+    @Nullable
+    private static UserDictionary userDictionary(@Nullable final String[] userDictArray)
+            throws UDFArgumentException {
+        if (userDictArray == null) {
+            return null;
+        }
+        if (userDictArray.length == 0) {
+            return null;
+        }
+
+        final StringBuilder builder = new StringBuilder();
+        for (String row : userDictArray) {
+            builder.append(row).append('\n');
+        }
+        final Reader reader = new StringReader(builder.toString());
+        try {
+            return UserDictionary.open(reader); // return null if empty
+        } catch (Throwable e) {
+            throw new UDFArgumentException(
+                "Failed to create user dictionary based on the given array<string>: "
+                        + builder.toString() + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+        }
+    }
+
+    @Nullable
+    private static UserDictionary userDictionary(@Nonnull final ObjectInspector oi)
+            throws UDFArgumentException {
+        if (HiveUtils.isVoidOI(oi)) {
+            return null;
+        }
+        final String[] array = HiveUtils.getConstStringArray(oi);
+        if (array == null) {
+            return null;
+        }
+        final int length = array.length;
+        if (length == 0) {
+            return null;
+        }
+        final StringBuilder builder = new StringBuilder();
+        for (int i = 0; i < length; i++) {
+            String row = array[i];
+            if (row != null) {
+                builder.append(row).append('\n');
+            }
+        }
+
+        final Reader reader = new StringReader(builder.toString());
+        try {
+            return UserDictionary.open(reader); // return null if empty
+        } catch (Throwable e) {
+            throw new UDFArgumentException(
+                "Failed to create user dictionary based on the given array<string>: "
+                        + builder.toString());
+        }
+    }
+
+
+    @Nullable
+    private static UserDictionary userDictionary(@Nullable final String userDictURL)
+            throws UDFArgumentException {
+        if (userDictURL == null) {
+            return null;
+        }
+
+        final HttpURLConnection conn;
+        try {
+            conn = HttpUtils.getHttpURLConnection(userDictURL);
+        } catch (IllegalArgumentException | IOException e) {
+            throw new UDFArgumentException("Failed to create HTTP connection to the URL: "
+                    + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+        }
+
+        // allow reading the dictionary as a gzip-compressed file for efficiency
+        conn.setRequestProperty("Accept-Encoding", "gzip");
+
+        conn.setConnectTimeout(CONNECT_TIMEOUT_MS); // throw exception from connect()
+        conn.setReadTimeout(READ_TIMEOUT_MS); // throw exception from getXXX() methods
+
+        final int responseCode;
+        try {
+            responseCode = conn.getResponseCode();
+        } catch (IOException e) {
+            throw new UDFArgumentException("Failed to get response code: " + userDictURL + '\n'
+                    + ExceptionUtils.prettyPrintStackTrace(e));
+        }
+        if (responseCode != 200) {
+            throw new UDFArgumentException("Got invalid response code: " + responseCode);
+        }
+
+        final InputStream is;
+        try {
+            is = IOUtils.decodeInputStream(
+                HttpUtils.getLimitedInputStream(conn, MAX_INPUT_STREAM_SIZE));
+        } catch (NullPointerException | IOException e) {
+            throw new UDFArgumentException("Failed to get input stream from the connection: "
+                    + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+        }
+
+        CharsetDecoder decoder =
+                StandardCharsets.UTF_8.newDecoder()
+                                      .onMalformedInput(CodingErrorAction.REPORT)
+                                      .onUnmappableCharacter(CodingErrorAction.REPORT);
+        final Reader reader = new InputStreamReader(is, decoder);
+        try {
+            return UserDictionary.open(reader); // return null if empty
+        } catch (Throwable e) {
+            throw new UDFArgumentException(
+                "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
+                        + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+        }
+    }
+
+
     private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
             throws IOException {
         // instantiate an attribute placeholder once
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java b/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java
new file mode 100644
index 0000000..48a6569
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer.ext;
+
+import static org.apache.lucene.analysis.TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
+
+import java.io.IOException;
+import java.util.Set;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.KoreanReadingFormFilter;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
+import org.apache.lucene.analysis.ko.POS;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+
+/**
+ * Korean analyzer supporting stopwords.
+ */
+public final class KoreanAnalyzer extends StopwordAnalyzerBase {
+
+    private final UserDictionary userDict;
+    private final KoreanTokenizer.DecompoundMode mode;
+    private final Set<POS.Tag> stopTags;
+    private final boolean outputUnknownUnigrams;
+
+    /**
+     * Creates a new KoreanAnalyzer.
+     */
+    public KoreanAnalyzer() {
+        this(null, KoreanTokenizer.DEFAULT_DECOMPOUND, DefaultSetHolder.DEFAULT_STOP_SET, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
+    }
+
+    /**
+     * Creates a new KoreanAnalyzer.
+     *
+     * @param userDict Optional: if non-null, user dictionary.
+     * @param mode Decompound mode.
+     * @param stopwords Optional: a set of stopwords to filter out; if null, an empty set is used.
+     * @param stopTags The set of part-of-speech tags that should be filtered.
+     * @param outputUnknownUnigrams If true, outputs unigrams for unknown words.
+     */
+    public KoreanAnalyzer(@Nullable UserDictionary userDict, @Nonnull DecompoundMode mode,
+            @Nullable CharArraySet stopwords, @Nonnull Set<POS.Tag> stopTags,
+            boolean outputUnknownUnigrams) {
+        super(stopwords);
+        this.userDict = userDict;
+        this.mode = mode;
+        this.stopTags = stopTags;
+        this.outputUnknownUnigrams = outputUnknownUnigrams;
+    }
+
+    @Nonnull
+    public static CharArraySet getDefaultStopSet() {
+        return DefaultSetHolder.DEFAULT_STOP_SET;
+    }
+
+    @Nonnull
+    public static Set<POS.Tag> getDefaultStopTags() {
+        return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+    }
+
+    private static class DefaultSetHolder {
+        static final CharArraySet DEFAULT_STOP_SET;
+
+        static {
+            try {
+                DEFAULT_STOP_SET =
+                        loadStopwordSet(true, KoreanAnalyzer.class, "stopwords-ko.txt", "#");
+            } catch (IOException ex) {
+                throw new RuntimeException("Unable to load default stopword set", ex);
+            }
+        }
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode,
+            outputUnknownUnigrams);
+        TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
+        stream = new KoreanReadingFormFilter(stream);
+        stream = new LowerCaseFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+    }
+
+    @Override
+    protected TokenStream normalize(String fieldName, TokenStream in) {
+        return new LowerCaseFilter(in);
+    }
+
+    @Nonnull
+    public static TokenStream normalize(@Nonnull TokenStream in) {
+        return new LowerCaseFilter(in);
+    }
+}
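
Note: the new `KoreanAnalyzer` above can also be exercised directly outside Hive. A minimal sketch, assuming Lucene's analysis-nori module and this class are on the classpath (expected tokens taken from the doc example above):

```java
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import hivemall.nlp.tokenizer.ext.KoreanAnalyzer;

public class KoreanAnalyzerSketch {
    public static void main(String[] args) throws IOException {
        // default stopwords/stoptags, default decompound mode, no user dictionary
        try (KoreanAnalyzer analyzer = new KoreanAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("dummy", "소설 무궁화꽃이 피었습니다.");
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // expected: 소설, 무궁, 화, 꽃, 피
            }
            stream.end();
            stream.close();
        }
    }
}
```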
diff --git a/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt b/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
new file mode 100644
index 0000000..866a3b4
--- /dev/null
+++ b/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Gene Diaz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt b/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
new file mode 100644
index 0000000..0a72b07
--- /dev/null
+++ b/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
@@ -0,0 +1,680 @@
+# derived from https://github.com/stopwords-iso/stopwords-ko
+!
+"
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+;
+<
+=
+>
+?
+@
+\
+^
+_
+`
+|
+~
+·
+—
+——
+‘
+’
+“
+”
+…
+、
+。
+〈
+〉
+《
+》
+가
+가까스로
+가령
+각
+각각
+각자
+각종
+갖고말하자면
+같다
+같이
+개의치않고
+거니와
+거바
+거의
+것
+것과 같이
+것들
+게다가
+게우다
+겨우
+견지에서
+결과에 이르다
+결국
+결론을 낼 수 있다
+겸사겸사
+고려하면
+고로
+곧
+공동으로
+과
+과연
+관계가 있다
+관계없이
+관련이 있다
+관하여
+관한
+관해서는
+구
+구체적으로
+구토하다
+그
+그들
+그때
+그래
+그래도
+그래서
+그러나
+그러니
+그러니까
+그러면
+그러므로
+그러한즉
+그런 까닭에
+그런데
+그런즉
+그럼
+그럼에도 불구하고
+그렇게 함으로써
+그렇지
+그렇지 않다면
+그렇지 않으면
+그렇지만
+그렇지않으면
+그리고
+그리하여
+그만이다
+그에 따르는
+그위에
+그저
+그중에서
+그치지 않다
+근거로
+근거하여
+기대여
+기점으로
+기준으로
+기타
+까닭으로
+까악
+까지
+까지 미치다
+까지도
+꽈당
+끙끙
+끼익
+나
+나머지는
+남들
+남짓
+너
+너희
+너희들
+네
+넷
+년
+논하지 않다
+놀라다
+누가 알겠는가
+누구
+다른
+다른 방면으로
+다만
+다섯
+다소
+다수
+다시 말하자면
+다시말하면
+다음
+다음에
+다음으로
+단지
+답다
+당신
+당장
+대로 하다
+대하면
+대하여
+대해 말하자면
+대해서
+댕그
+더구나
+더군다나
+더라도
+더불어
+더욱더
+더욱이는
+도달하다
+도착하다
+동시에
+동안
+된바에야
+된이상
+두번째로
+둘
+둥둥
+뒤따라
+뒤이어
+든간에
+들
+등
+등등
+딩동
+따라
+따라서
+따위
+따지지 않다
+딱
+때
+때가 되어
+때문에
+또
+또한
+뚝뚝
+라 해도
+령
+로
+로 인하여
+로부터
+로써
+륙
+를
+마음대로
+마저
+마저도
+마치
+막론하고
+만 못하다
+만약
+만약에
+만은 아니다
+만이 아니다
+만일
+만큼
+말하자면
+말할것도 없고
+매
+매번
+메쓰겁다
+몇
+모
+모두
+무렵
+무릎쓰고
+무슨
+무엇
+무엇때문에
+물론
+및
+바꾸어말하면
+바꾸어말하자면
+바꾸어서 말하면
+바꾸어서 한다면
+바꿔 말하면
+바로
+바와같이
+밖에 안된다
+반대로
+반대로 말하자면
+반드시
+버금
+보는데서
+보다더
+보드득
+본대로
+봐
+봐라
+부류의 사람들
+부터
+불구하고
+불문하고
+붕붕
+비걱거리다
+비교적
+비길수 없다
+비로소
+비록
+비슷하다
+비추어 보아
+비하면
+뿐만 아니라
+뿐만아니라
+뿐이다
+삐걱
+삐걱거리다
+사
+삼
+상대적으로 말하자면
+생각한대로
+설령
+설마
+설사
+셋
+소생
+소인
+솨
+쉿
+습니까
+습니다
+시각
+시간
+시작하여
+시초에
+시키다
+실로
+심지어
+아
+아니
+아니나다를가
+아니라면
+아니면
+아니었다면
+아래윗
+아무거나
+아무도
+아야
+아울러
+아이
+아이고
+아이구
+아이야
+아이쿠
+아하
+아홉
+안 그러면
+않기 위하여
+않기 위해서
+알 수 있다
+알았어
+앗
+앞에서
+앞의것
+야
+약간
+양자
+어
+어기여차
+어느
+어느 년도
+어느것
+어느곳
+어느때
+어느쪽
+어느해
+어디
+어때
+어떠한
+어떤
+어떤것
+어떤것들
+어떻게
+어떻해
+어이
+어째서
+어쨋든
+어쩔수 없다
+어찌
+어찌됏든
+어찌됏어
+어찌하든지
+어찌하여
+언제
+언젠가
+얼마
+얼마 안 되는 것
+얼마간
+얼마나
+얼마든지
+얼마만큼
+얼마큼
+엉엉
+에
+에 가서
+에 달려 있다
+에 대해
+에 있다
+에 한하다
+에게
+에서
+여
+여기
+여덟
+여러분
+여보시오
+여부
+여섯
+여전히
+여차
+연관되다
+연이서
+영
+영차
+옆사람
+예
+예를 들면
+예를 들자면
+예컨대
+예하면
+오
+오로지
+오르다
+오자마자
+오직
+오호
+오히려
+와
+와 같은 사람들
+와르르
+와아
+왜
+왜냐하면
+외에도
+요만큼
+요만한 것
+요만한걸
+요컨대
+우르르
+우리
+우리들
+우선
+우에 종합한것과같이
+운운
+월
+위에서 서술한바와같이
+위하여
+위해서
+윙윙
+육
+으로
+으로 인하여
+으로서
+으로써
+을
+응
+응당
+의
+의거하여
+의지하여
+의해
+의해되다
+의해서
+이
+이 되다
+이 때문에
+이 밖에
+이 외에
+이 정도의
+이것
+이곳
+이때
+이라면
+이래
+이러이러하다
+이러한
+이런
+이럴정도로
+이렇게 많은 것
+이렇게되면
+이렇게말하자면
+이렇구나
+이로 인하여
+이르기까지
+이리하여
+이만큼
+이번
+이봐
+이상
+이어서
+이었다
+이와 같다
+이와 같은
+이와 반대로
+이와같다면
+이외에도
+이용하여
+이유만으로
+이젠
+이지만
+이쪽
+이천구
+이천육
+이천칠
+이천팔
+인 듯하다
+인젠
+일
+일것이다
+일곱
+일단
+일때
+일반적으로
+일지라도
+임에 틀림없다
+입각하여
+입장에서
+잇따라
+있다
+자
+자기
+자기집
+자마자
+자신
+잠깐
+잠시
+저
+저것
+저것만큼
+저기
+저쪽
+저희
+전부
+전자
+전후
+점에서 보아
+정도에 이르다
+제
+제각기
+제외하고
+조금
+조차
+조차도
+졸졸
+좀
+좋아
+좍좍
+주룩주룩
+주저하지 않고
+줄은 몰랏다
+줄은모른다
+중에서
+중의하나
+즈음하여
+즉
+즉시
+지든지
+지만
+지말고
+진짜로
+쪽으로
+차라리
+참
+참나
+첫번째로
+쳇
+총적으로
+총적으로 말하면
+총적으로 보면
+칠
+콸콸
+쾅쾅
+쿵
+타다
+타인
+탕탕
+토하다
+통하여
+툭
+퉤
+틈타
+팍
+팔
+퍽
+펄렁
+하
+하게될것이다
+하게하다
+하겠는가
+하고 있다
+하고있었다
+하곤하였다
+하구나
+하기 때문에
+하기 위하여
+하기는한데
+하기만 하면
+하기보다는
+하기에
+하나
+하느니
+하는 김에
+하는 편이 낫다
+하는것도
+하는것만 못하다
+하는것이 낫다
+하는바
+하더라도
+하도다
+하도록시키다
+하도록하다
+하든지
+하려고하다
+하마터면
+하면 할수록
+하면된다
+하면서
+하물며
+하여금
+하여야
+하자마자
+하지 않는다면
+하지 않도록
+하지마
+하지마라
+하지만
+하하
+한 까닭에
+한 이유는
+한 후
+한다면
+한다면 몰라도
+한데
+한마디
+한적이있다
+한켠으로는
+한항목
+할 따름이다
+할 생각이다
+할 줄 안다
+할 지경이다
+할 힘이 있다
+할때
+할만하다
+할망정
+할뿐
+할수있다
+할수있어
+할줄알다
+할지라도
+할지언정
+함께
+해도된다
+해도좋다
+해봐요
+해서는 안된다
+해야한다
+해요
+했어요
+향하다
+향하여
+향해서
+허
+허걱
+허허
+헉
+헉헉
+헐떡헐떡
+형식으로 쓰여
+혹시
+혹은
+혼자
+훨씬
+휘익
+휴
+흐흐
+흥
+힘입어
+︿
+!
+#
+$
+%
+&
+(
+)
+*
++
+,
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+>
+?
+@
+[
+]
+{
+|
+}
+~
+¥
\ No newline at end of file
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 480c5e9..658f605 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -63,6 +63,24 @@ public class KuromojiUDFTest {
         udf.close();
     }
 
+
+    @Test
+    public void testShowHelp() throws IOException {
+        GenericUDF udf = new KuromojiUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-help");
+        try {
+            udf.initialize(argOIs);
+            Assert.fail("should not reach here");
+        } catch (UDFArgumentException e) {
+            String errmsg = e.getMessage();
+            Assert.assertTrue(errmsg.contains("usage:"));
+        } finally {
+            udf.close();
+        }
+    }
+
     @Test
     public void testTwoArgument() throws UDFArgumentException, IOException {
         GenericUDF udf = new KuromojiUDF();
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
index a5b7288..e0bb30c 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -84,8 +84,7 @@ public class SmartcnUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
         Assert.assertTrue(tokens.size() >= 2);
@@ -110,8 +109,7 @@ public class SmartcnUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
         Assert.assertTrue(tokens.size() >= 2);
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
index 5365de0..8b97747 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
@@ -18,6 +18,8 @@
  */
 package hivemall.nlp.tokenizer;
 
+import hivemall.utils.hadoop.HiveUtils;
+
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
@@ -30,7 +32,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.io.BooleanWritable;
 import org.apache.hadoop.io.Text;
 import org.junit.Assert;
 import org.junit.Before;
@@ -56,7 +57,24 @@ public class TokenizeKoUDFTest {
     }
 
     @Test
-    public void test() throws HiveException, IOException {
+    public void testShowHelp() throws IOException {
+        GenericUDF udf = new TokenizeKoUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-help");
+        try {
+            udf.initialize(argOIs);
+            Assert.fail("should not reach here");
+        } catch (UDFArgumentException e) {
+            String errmsg = e.getMessage();
+            Assert.assertTrue(errmsg.contains("usage:"));
+        } finally {
+            udf.close();
+        }
+    }
+
+    @Test
+    public void testOneArgument() throws HiveException, IOException {
         ObjectInspector[] argOIs = new ObjectInspector[1];
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
         udf.initialize(argOIs);
@@ -68,8 +86,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -81,15 +98,27 @@ public class TokenizeKoUDFTest {
     }
 
     @Test
-    public void testNullUserList() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[2];
+    public void testNullUserDict() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[5];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        // mode
+        PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+        stringType.setTypeName("string");
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+            stringType, null);
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // stopTags
+        argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+        argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         udf.initialize(argOIs);
 
+
         GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
         args[0] = new GenericUDF.DeferredObject() {
             public Text get() throws HiveException {
@@ -97,8 +126,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -111,33 +139,27 @@ public class TokenizeKoUDFTest {
 
     @Test
     public void testNullMode() throws UDFArgumentException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[3];
+        ObjectInspector[] argOIs = new ObjectInspector[2];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
-            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, null);
         udf.initialize(argOIs);
         udf.close();
     }
 
     @Test
-    public void testMode() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[3];
+    public void testModeMixed() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[2];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
-            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, new Text("mixed"));
         udf.initialize(argOIs);
 
@@ -148,8 +170,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -162,34 +183,37 @@ public class TokenizeKoUDFTest {
 
     @Test(expected = UDFArgumentException.class)
     public void testInvalidMode() throws IOException, HiveException {
-        ObjectInspector[] argOIs = new ObjectInspector[3];
+        ObjectInspector[] argOIs = new ObjectInspector[2];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
-            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, new Text("unsupported mode"));
         udf.initialize(argOIs);
         udf.close();
     }
 
     @Test
-    public void testNonnullUserList() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[3];
+    public void testUserDictArray() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[5];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
-            PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("C++"));
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, new Text("mixed"));
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // stopTags
+        argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // userDict
+        argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("C++"));
         udf.initialize(argOIs);
 
         GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -199,8 +223,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -212,21 +235,70 @@ public class TokenizeKoUDFTest {
     }
 
     @Test
-    public void testStopTags() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[4];
+    public void testUserDictUrl() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[5];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+        // mode
+        PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+        stringType.setTypeName("string");
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+            stringType, new Text("discard"));
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // stopTags
+        argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // userDict
+        argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+            stringType, new Text(
+                "https://raw.githubusercontent.com/apache/lucene/044d152d954f1e22aac5a53792011da54c680617/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt"));
+
+        udf.initialize(argOIs);
+
+        GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+        args[0] = new GenericUDF.DeferredObject() {
+            public Text get() throws HiveException {
+                return new Text("나는 c++ 프로그래밍을 즐긴다");
+            }
+
+            @Override
+            public void prepare(int arg) throws HiveException {}
+        };
+        List<Text> tokens = udf.evaluate(args);
+
+        Assert.assertNotNull(tokens);
+        Assert.assertEquals(4, tokens.size());
+        Assert.assertEquals("나 c++ 프로그래밍 즐기", getString(tokens));
+
+        udf.close();
+    }
+
+    @Test
+    public void testStopTags() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[5];
+        // line
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, null);
+
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
         // stopTags
         argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "VV"));
+
+        // userDict
+        argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
         udf.initialize(argOIs);
 
         GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -236,8 +308,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -248,19 +319,57 @@ public class TokenizeKoUDFTest {
         udf.close();
     }
 
+    @Test
+    public void testWithoutDictCplusplus() throws HiveException, IOException {
+        ObjectInspector[] argOIs = new ObjectInspector[4];
+        // line
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        // mode
+        PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+        stringType.setTypeName("string");
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+            stringType, new Text("discard"));
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        // stopTags
+        argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
+        udf.initialize(argOIs);
+
+        GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+        args[0] = new GenericUDF.DeferredObject() {
+            public Text get() throws HiveException {
+                return new Text("나는 c++ 프로그래밍을 즐긴다");
+            }
+
+            @Override
+            public void prepare(int arg) throws HiveException {}
+        };
+        List<Text> tokens = udf.evaluate(args);
+
+        Assert.assertNotNull(tokens);
+        Assert.assertEquals(4, tokens.size());
+        Assert.assertEquals("나 c 프로그래밍 즐기", getString(tokens));
+
+        udf.close();
+    }
+
     @Test(expected = UDFArgumentException.class)
     public void testInvalidStopTag() throws UDFArgumentException, IOException {
         ObjectInspector[] argOIs = new ObjectInspector[4];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
-            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // mode
         PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
         stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+        argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
             stringType, null);
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
         // stopTags
         argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "?"));
@@ -270,25 +379,21 @@ public class TokenizeKoUDFTest {
 
     @Test
     public void testOutputUnknownUnigramsTrue() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[5];
+        ObjectInspector[] argOIs = new ObjectInspector[4];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+
+        // opts
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode discard -outputUnknownUnigrams");
+
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
-        // mode
-        PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
-        stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-            stringType, null);
+
         // stopTags
         argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
-        // outputUnknowUnigrams
-        PrimitiveTypeInfo booleanType = new PrimitiveTypeInfo();
-        booleanType.setTypeName("boolean");
-        argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-            booleanType, new BooleanWritable(true));
+
         udf.initialize(argOIs);
 
         GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -298,8 +403,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
@@ -312,25 +416,21 @@ public class TokenizeKoUDFTest {
 
     @Test
     public void testOutputUnknownUnigramsFalse() throws HiveException, IOException {
-        ObjectInspector[] argOIs = new ObjectInspector[5];
+        ObjectInspector[] argOIs = new ObjectInspector[4];
         // line
         argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-        // userDict
-        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+
+        // opts
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode discard");
+
+        // stopWords
+        argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
-        // mode
-        PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
-        stringType.setTypeName("string");
-        argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-            stringType, null);
+
         // stopTags
         argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
             PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
-        // outputUnknowUnigrams
-        PrimitiveTypeInfo booleanType = new PrimitiveTypeInfo();
-        booleanType.setTypeName("boolean");
-        argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
-            booleanType, new BooleanWritable(false));
+
         udf.initialize(argOIs);
 
         GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -340,8 +440,7 @@ public class TokenizeKoUDFTest {
             }
 
             @Override
-            public void prepare(int arg) throws HiveException {
-            }
+            public void prepare(int arg) throws HiveException {}
         };
         List<Text> tokens = udf.evaluate(args);
 
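Taken together, the reworked tests above all build the same five-slot argument layout of (line, opts, stopWords, stopTags, userDict), using constant object inspectors because the UDF resolves everything except the line text at initialization time. Below is a minimal sketch of a five-argument setup with explicit stopwords, reusing the factories from the tests; the particle values 은/는 are illustrative and do not appear in the commit:

```java
ObjectInspector[] argOIs = new ObjectInspector[5];
// line (non-constant string input)
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
// opts (constant string)
argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode discard");
// stopWords (constant list; 은/는 are illustrative values)
argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
    PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("은", "는"));
// stopTags (constant list; null keeps the default stoptags)
argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
    PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// userDict (constant list; null means no user dictionary)
argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
    PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
udf.initialize(argOIs);
```

As in the tests, passing null for stopWords, stopTags, or userDict falls back to the respective default.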
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java
new file mode 100644
index 0000000..6d44c1b
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer.ext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class KoreanAnalyzerTest {
+
+    @Test
+    public void testStopwords() throws IOException {
+        KoreanAnalyzer analyzer = new KoreanAnalyzer();
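+        // the default stopword set is loaded from the bundled stopwords-ko.txt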
+        Assert.assertTrue(analyzer.getStopwordSet().size() > 10);
+
+        List<String> results = analyzeTokens(analyzer.tokenStream("", "소설 무궁화꽃이 피었습니다."));
+        Assert.assertEquals(5, results.size());
+        analyzer.close();
+    }
+
+    @Test
+    public void testUserDict() {
+        UserDictionary dict = readDict();
+        Assert.assertNotNull(dict);
+    }
+
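+    /** Loads the bundled test user dictionary (userdict-ko.txt) from the test classpath. */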
+    @Nonnull
+    public static UserDictionary readDict() {
+        InputStream is = KoreanAnalyzer.class.getResourceAsStream("userdict-ko.txt");
+        if (is == null) {
+            throw new RuntimeException("Cannot find userdict-ko.txt in test classpath!");
+        }
+        try {
+            try {
+                Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
+                return UserDictionary.open(reader);
+            } finally {
+                is.close();
+            }
+        } catch (IOException ioe) {
+            throw new RuntimeException(ioe);
+        }
+    }
+
+    @Nonnull
+    private static List<String> analyzeTokens(@Nonnull TokenStream stream) throws IOException {
+        final List<String> results = new ArrayList<String>();
+
+        // fetch the term attribute once; incrementToken() updates it in place
+        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        stream.reset();
+
+        while (stream.incrementToken()) {
+            String term = termAttr.toString();
+            results.add(term);
+        }
+        stream.end();
+        stream.close();
+        return results;
+    }
+
+}
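As a usage sketch rather than part of this commit, the dictionary loaded by readDict() can also drive Lucene's stock nori analyzer directly. The hypothetical NoriUserDictDemo below assumes the Lucene 8.x nori API (KoreanTokenizer.DecompoundMode, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS) and sits alongside the test so it can reuse readDict(); the analyzer is fully qualified to avoid the name clash with hivemall's own KoreanAnalyzer in this package:

```java
package hivemall.nlp.tokenizer.ext;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NoriUserDictDemo {

    public static void main(String[] args) throws IOException {
        // reuse the test helper above to load userdict-ko.txt from the classpath
        UserDictionary dict = KoreanAnalyzerTest.readDict();
        // fully qualified to avoid hivemall's KoreanAnalyzer in this package
        try (Analyzer analyzer = new org.apache.lucene.analysis.ko.KoreanAnalyzer(dict,
                KoreanTokenizer.DecompoundMode.DISCARD,
                KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
                TokenStream stream = analyzer.tokenStream("", "나는 c++ 프로그래밍을 즐긴다")) {
            CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // with the user dictionary in place, "c++" survives as a single token
                System.out.println(termAttr.toString());
            }
            stream.end();
        }
    }
}
```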
diff --git a/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt b/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt
new file mode 100644
index 0000000..ffd3613
--- /dev/null
+++ b/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt
@@ -0,0 +1,11 @@
+# Additional nouns
+c++
+C샤프
+세종
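+# compound entries give the surface form followed by its space-separated segmentation, e.g. 세종시 = 세종 + 시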
+세종시 세종 시
+대한민국날씨
+대한민국
+날씨
+21세기대한민국
+세기
+