Posted to commits@hivemall.apache.org by my...@apache.org on 2019/04/19 07:04:08 UTC

[incubator-hivemall] branch master updated: [HIVEMALL-251] Add option to return PartOfSpeech information for tokenize_ja

This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new 054e967  [HIVEMALL-251] Add option to return PartOfSpeech information for tokenize_ja
054e967 is described below

commit 054e9672d8cbc13eb7bd330e25dc3f362701f0bf
Author: Makoto Yui <my...@apache.org>
AuthorDate: Fri Apr 19 16:04:01 2019 +0900

    [HIVEMALL-251] Add option to return PartOfSpeech information for tokenize_ja
    
    ## What changes were proposed in this pull request?
    
    Add an option to return PartOfSpeech information for the `tokenize_ja` UDF.
    
    ## What type of PR is it?
    
    Feature, Improvement
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-251
    
    ## How was this patch tested?
    
    Unit tests and manual tests on EMR.
    
    ## How to use this feature?
    
    ```sql
    WITH tmp as (
      select
        tokenize_ja('kuromojiを使った分かち書きのテストです。','-mode search -pos') as r
    )
    select
      r.tokens,
      r.pos,
      r.tokens[0] as token0,
      r.pos[0] as pos0
    from
      tmp;
    ```
    
    | tokens |pos | token0 | pos0 |
    |:-:|:-:|:-:|:-:|
    | ["kuromoji","使う","分かち書き","テスト"] | ["名詞-一般","動詞-自立","名詞-一般","名詞-サ変接続"] | kuromoji | 名詞-一般 |
    
    ## Checklist
    
    - [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
    - [x] Did you run system tests on Hive (or Spark)?
    
    Author: Makoto Yui <my...@apache.org>
    
    Closes #191 from myui/HIVEMALL-251.
---
 .../java/hivemall/UDAFEvaluatorWithOptions.java    |   8 +-
 core/src/main/java/hivemall/UDFWithOptions.java    |  12 +-
 core/src/main/java/hivemall/UDTFWithOptions.java   |   7 +-
 .../java/hivemall/smile/data/AttributeType.java    |   2 +-
 .../java/hivemall/utils/lang/CommandLineUtils.java |   3 +
 docs/gitbook/misc/tokenizer.md                     |  36 ++++++
 .../java/hivemall/nlp/tokenizer/KuromojiUDF.java   | 138 +++++++++++++++++----
 .../hivemall/nlp/tokenizer/KuromojiUDFTest.java    | 104 ++++++++++++++--
 8 files changed, 277 insertions(+), 33 deletions(-)

diff --git a/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java b/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
index de1564c..7817beb 100644
--- a/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
+++ b/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
@@ -88,7 +88,13 @@ public abstract class UDAFEvaluatorWithOptions extends GenericUDAFEvaluator {
         String[] args = optionValue.split("\\s+");
         Options opts = getOptions();
         opts.addOption("help", false, "Show function help");
-        CommandLine cl = CommandLineUtils.parseOptions(args, opts);
+
+        final CommandLine cl;
+        try {
+            cl = CommandLineUtils.parseOptions(args, opts);
+        } catch (IllegalArgumentException e) {
+            throw new UDFArgumentException(e);
+        }
 
         if (cl.hasOption("help")) {
             Description funcDesc = getClass().getAnnotation(Description.class);
diff --git a/core/src/main/java/hivemall/UDFWithOptions.java b/core/src/main/java/hivemall/UDFWithOptions.java
index f8272ce..4fb80a5 100644
--- a/core/src/main/java/hivemall/UDFWithOptions.java
+++ b/core/src/main/java/hivemall/UDFWithOptions.java
@@ -86,7 +86,13 @@ public abstract class UDFWithOptions extends GenericUDF {
         String[] args = optionValue.split("\\s+");
         Options opts = getOptions();
         opts.addOption("help", false, "Show function help");
-        CommandLine cl = CommandLineUtils.parseOptions(args, opts);
+
+        final CommandLine cl;
+        try {
+            cl = CommandLineUtils.parseOptions(args, opts);
+        } catch (IllegalArgumentException e) {
+            throw new UDFArgumentException(e);
+        }
 
         if (cl.hasOption("help")) {
             showHelp(opts);
@@ -95,6 +101,10 @@ public abstract class UDFWithOptions extends GenericUDF {
         return cl;
     }
 
+    protected void showHelp() throws UDFArgumentException {
+        showHelp(getOptions(), null);
+    }
+
     protected void showHelp(@Nullable String errMsg) throws UDFArgumentException {
         showHelp(getOptions(), errMsg);
     }
diff --git a/core/src/main/java/hivemall/UDTFWithOptions.java b/core/src/main/java/hivemall/UDTFWithOptions.java
index 43d9023..a71395e 100644
--- a/core/src/main/java/hivemall/UDTFWithOptions.java
+++ b/core/src/main/java/hivemall/UDTFWithOptions.java
@@ -94,8 +94,13 @@ public abstract class UDTFWithOptions extends GenericUDTF {
         String[] args = optionValue.split("\\s+");
         Options opts = getOptions();
         opts.addOption("help", false, "Show function help");
-        CommandLine cl = CommandLineUtils.parseOptions(args, opts);
 
+        final CommandLine cl;
+        try {
+            cl = CommandLineUtils.parseOptions(args, opts);
+        } catch (IllegalArgumentException e) {
+            throw new UDFArgumentException(e);
+        }
         if (cl.hasOption("help")) {
             showHelp(opts);
         }
diff --git a/core/src/main/java/hivemall/smile/data/AttributeType.java b/core/src/main/java/hivemall/smile/data/AttributeType.java
index 559ad2d..7aa0ef0 100644
--- a/core/src/main/java/hivemall/smile/data/AttributeType.java
+++ b/core/src/main/java/hivemall/smile/data/AttributeType.java
@@ -64,4 +64,4 @@ public enum AttributeType {
         return type;
     }
 
-}
\ No newline at end of file
+}
diff --git a/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java b/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
index 61e844e..a8d406f 100644
--- a/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
@@ -18,6 +18,8 @@
  */
 package hivemall.utils.lang;
 
+import javax.annotation.Nonnull;
+
 import org.apache.commons.cli.BasicParser;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.Options;
@@ -27,6 +29,7 @@ public final class CommandLineUtils {
 
     private CommandLineUtils() {}
 
+    @Nonnull
     public static CommandLine parseOptions(final String[] args, final Options opts) {
         final BasicParser parser = new BasicParser();
         final CommandLine cl;
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index e578198..0c0e97e 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -110,8 +110,44 @@ select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, nu
 
 > ["日本","経済","新聞","関西","国際","空港"]
 
+The dictionary SHOULD be accessible via the http/https protocol, and it SHOULD be compressed with gzip (`.gz` suffix) because the maximum dictionary size is limited to 32MB and the read timeout is set to 60 sec. Also, the connection must be established within 10 sec.
+
+If you want to use HTTP Basic Authentication, use the following form: `https://user:password@www.siteurl.com/my_dict.txt.gz` (see Section 3.1 of [RFC 1738](https://www.ietf.org/rfc/rfc1738.txt)).
+
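+For example, a user dictionary hosted at such a URL can be passed as the 5th argument (a sketch; the URL below is a placeholder):
+
+```sql
+select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, null,
+  "https://example.com/user_dict.txt.gz");
+```
+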
 For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
 
+## Part-of-speech
+
+The second argument can also accept options in the following format:
+
+```
+ -mode <arg>   The tokenization mode. One of ['normal', 'search',
+               'extended', 'default' (normal)]
+ -pos          Return part-of-speech information
+```
+
+Then, you can get part-of-speech information as follows:
+
+```sql
+WITH tmp as (
+  select
+    tokenize_ja('kuromojiを使った分かち書きのテストです。','-mode search -pos') as r
+)
+select
+  r.tokens,
+  r.pos,
+  r.tokens[0] as token0,
+  r.pos[0] as pos0
+from
+  tmp;
+```
+
+| tokens |pos | token0 | pos0 |
+|:-:|:-:|:-:|:-:|
+| ["kuromoji","使う","分かち書き","テスト"] | ["名詞-一般","動詞-自立","名詞-一般","名詞-サ変接続"] | kuromoji | 名詞-一般 |
+
+Note that when the `-pos` option is specified, `tokenize_ja` returns a struct record containing `array<string> tokens` and `array<string> pos` as its fields.
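+
+To get one row per token paired with its part-of-speech tag, the struct can be unnested with `posexplode` (an illustrative sketch; `idx` and `token` are column aliases):
+
+```sql
+WITH tmp as (
+  select
+    tokenize_ja('kuromojiを使った分かち書きのテストです。', '-mode search -pos') as r
+)
+select
+  t.token,
+  r.pos[t.idx] as pos
+from
+  tmp
+  LATERAL VIEW posexplode(r.tokens) t as idx, token;
+```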
+
 ## Chinese Tokenizer
 
 Chinese text tokenizer UDF uses [SmartChineseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 48b566f..0317d2a 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -18,6 +18,7 @@
  */
 package hivemall.nlp.tokenizer;
 
+import hivemall.UDFWithOptions;
 import hivemall.utils.hadoop.HiveUtils;
 import hivemall.utils.io.HttpUtils;
 import hivemall.utils.io.IOUtils;
@@ -37,16 +38,18 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Objects;
 import java.util.Set;
 
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFType;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -56,9 +59,12 @@ import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 
+import com.clearspring.analytics.util.Preconditions;
+
 @Description(name = "tokenize_ja",
         value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
                 + " - returns tokenized strings in array<string>",
@@ -66,12 +72,14 @@ import org.apache.lucene.analysis.util.CharArraySet;
                 + "\n"
                 + "> [\"kuromoji\",\"使う\",\"分かち書き\",\"テスト\",\"第\",\"二\",\"引数\",\"normal\",\"search\",\"extended\",\"指定\",\"デフォルト\",\"normal\",\" モード\"]\n")
 @UDFType(deterministic = true, stateful = false)
-public final class KuromojiUDF extends GenericUDF {
+public final class KuromojiUDF extends UDFWithOptions {
     private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec
     private static final int READ_TIMEOUT_MS = 60000; // 60 sec
     private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB
 
     private Mode _mode;
+    private boolean _returnPos;
+    private transient Object[] _result;
     @Nullable
     private String[] _stopWordsArray;
     private Set<String> _stopTags;
@@ -82,14 +90,43 @@ public final class KuromojiUDF extends GenericUDF {
     private transient JapaneseAnalyzer _analyzer;
 
     @Override
+    protected Options getOptions() {
+        Options opts = new Options();
+        opts.addOption("mode", true,
+            "The tokenization mode. One of ['normal', 'search', 'extended', 'default' (normal)]");
+        opts.addOption("pos", false, "Return part-of-speech information");
+        return opts;
+    }
+
+    @Override
+    protected CommandLine processOptions(String optionValue) throws UDFArgumentException {
+        CommandLine cl = parseOptions(optionValue);
+        if (cl.hasOption("mode")) {
+            String modeStr = cl.getOptionValue("mode");
+            this._mode = tokenizationMode(modeStr);
+        }
+        this._returnPos = cl.hasOption("pos");
+        return cl;
+    }
+
+    @Override
     public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
         final int arglen = arguments.length;
         if (arglen < 1 || arglen > 5) {
-            throw new UDFArgumentException(
-                "Invalid number of arguments for `tokenize_ja`: " + arglen);
+            showHelp("Invalid number of arguments for `tokenize_ja`: " + arglen);
         }
 
-        this._mode = (arglen >= 2) ? tokenizationMode(arguments[1]) : Mode.NORMAL;
+        this._mode = Mode.NORMAL;
+        if (arglen >= 2) {
+            String arg1 = HiveUtils.getConstString(arguments[1]);
+            if (arg1 != null) {
+                if (arg1.startsWith("-")) {
+                    processOptions(arg1);
+                } else {
+                    this._mode = tokenizationMode(arg1);
+                }
+            }
+        }
 
         if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) {
             this._stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
@@ -111,12 +148,25 @@ public final class KuromojiUDF extends GenericUDF {
 
         this._analyzer = null;
 
-        return ObjectInspectorFactory.getStandardListObjectInspector(
-            PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+        if (_returnPos) {
+            this._result = new Object[2];
+            ArrayList<String> fieldNames = new ArrayList<String>();
+            ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
+            fieldNames.add("tokens");
+            fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+                PrimitiveObjectInspectorFactory.writableStringObjectInspector));
+            fieldNames.add("pos");
+            fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+                PrimitiveObjectInspectorFactory.writableStringObjectInspector));
+            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
+        } else {
+            return ObjectInspectorFactory.getStandardListObjectInspector(
+                PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+        }
     }
 
     @Override
-    public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
         if (_analyzer == null) {
             CharArraySet stopWords = stopWords(_stopWordsArray);
 
@@ -136,20 +186,55 @@ public final class KuromojiUDF extends GenericUDF {
         }
         String line = arg0.toString();
 
-        final List<Text> results = new ArrayList<Text>(32);
+        if (_returnPos) {
+            return parseLine(_analyzer, line, _result);
+        } else {
+            return parseLine(_analyzer, line);
+        }
+    }
+
+    @Nonnull
+    private static Object[] parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line,
+            @Nonnull Object[] result) throws HiveException {
+        Objects.requireNonNull(result);
+        Preconditions.checkArgument(result.length == 2);
+
+        final List<Text> tokens = new ArrayList<Text>(32);
+        final List<Text> pos = new ArrayList<Text>(32);
         TokenStream stream = null;
         try {
-            stream = _analyzer.tokenStream("", line);
+            stream = analyzer.tokenStream("", line);
             if (stream != null) {
-                analyzeTokens(stream, results);
+                analyzeTokens(stream, tokens, pos);
             }
         } catch (IOException e) {
-            IOUtils.closeQuietly(_analyzer);
+            IOUtils.closeQuietly(analyzer);
             throw new HiveException(e);
         } finally {
             IOUtils.closeQuietly(stream);
         }
-        return results;
+        result[0] = tokens;
+        result[1] = pos;
+        return result;
+    }
+
+    @Nonnull
+    private static List<Text> parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line)
+            throws HiveException {
+        final List<Text> tokens = new ArrayList<Text>(32);
+        TokenStream stream = null;
+        try {
+            stream = analyzer.tokenStream("", line);
+            if (stream != null) {
+                analyzeTokens(stream, tokens);
+            }
+        } catch (IOException e) {
+            IOUtils.closeQuietly(analyzer);
+            throw new HiveException(e);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+        return tokens;
     }
 
     @Override
@@ -158,12 +243,7 @@ public final class KuromojiUDF extends GenericUDF {
     }
 
     @Nonnull
-    private static Mode tokenizationMode(@Nonnull final ObjectInspector oi)
-            throws UDFArgumentException {
-        String arg = HiveUtils.getConstString(oi);
-        if (arg == null) {
-            return Mode.NORMAL;
-        }
+    private static Mode tokenizationMode(@Nonnull final String arg) throws UDFArgumentException {
         final Mode mode;
         if ("NORMAL".equalsIgnoreCase(arg)) {
             mode = Mode.NORMAL;
@@ -292,15 +372,31 @@ public final class KuromojiUDF extends GenericUDF {
         }
     }
 
-    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+    private static void analyzeTokens(@Nonnull final TokenStream stream,
+            @Nonnull final List<Text> tokens) throws IOException {
+        // instantiate an attribute placeholder once
+        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        stream.reset();
+
+        while (stream.incrementToken()) {
+            String term = termAttr.toString();
+            tokens.add(new Text(term));
+        }
+    }
+
+    private static void analyzeTokens(@Nonnull final TokenStream stream,
+            @Nonnull final List<Text> tokenResult, @Nonnull final List<Text> posResult)
             throws IOException {
         // instantiate an attribute placeholder once
         CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
         stream.reset();
 
         while (stream.incrementToken()) {
             String term = termAttr.toString();
-            results.add(new Text(term));
+            tokenResult.add(new Text(term));
+            String pos = posAttr.getPartOfSpeech();
+            posResult.add(new Text(pos));
         }
     }
 
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 244075d..2a3de26 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -19,9 +19,12 @@
 package hivemall.nlp.tokenizer;
 
 import hivemall.TestUtils;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.PrivilegedAccessor;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
@@ -33,6 +36,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
+import org.hamcrest.CoreMatchers;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -208,12 +213,14 @@ public class KuromojiUDFTest {
             @Override
             public void prepare(int arg) throws HiveException {}
         };
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
         Assert.assertEquals(5, tokens.size());
         udf.close();
     }
 
+    @SuppressWarnings("unchecked")
     @Test
     public void testEvaluateTwoRows() throws IOException, HiveException {
         KuromojiUDF udf = new KuromojiUDF();
@@ -231,7 +238,7 @@ public class KuromojiUDFTest {
             @Override
             public void prepare(int arg) throws HiveException {}
         };
-        List<Text> tokens = udf.evaluate(args);
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
         Assert.assertEquals(5, tokens.size());
 
@@ -243,7 +250,7 @@ public class KuromojiUDFTest {
             @Override
             public void prepare(int arg) throws HiveException {}
         };
-        tokens = udf.evaluate(args);
+        tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
         Assert.assertEquals(4, tokens.size());
 
@@ -268,7 +275,8 @@ public class KuromojiUDFTest {
             @Override
             public void prepare(int arg) throws HiveException {}
         };
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
         Assert.assertEquals(182, tokens.size());
         udf.close();
@@ -309,7 +317,8 @@ public class KuromojiUDFTest {
             public void prepare(int arg) throws HiveException {}
         };
 
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
 
         Assert.assertNotNull(tokens);
         Assert.assertEquals(3, tokens.size());
@@ -349,7 +358,8 @@ public class KuromojiUDFTest {
             public void prepare(int arg) throws HiveException {}
         };
 
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
 
         udf.close();
@@ -389,7 +399,8 @@ public class KuromojiUDFTest {
             public void prepare(int arg) throws HiveException {}
         };
 
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
 
         Assert.assertNotNull(tokens);
         Assert.assertEquals(7, tokens.size());
@@ -417,7 +428,8 @@ public class KuromojiUDFTest {
             @Override
             public void prepare(int arg) throws HiveException {}
         };
-        List<Text> tokens = udf.evaluate(args);
+        @SuppressWarnings("unchecked")
+        List<Text> tokens = (List<Text>) udf.evaluate(args);
         Assert.assertNotNull(tokens);
 
         // serialization after evaluation
@@ -426,4 +438,80 @@ public class KuromojiUDFTest {
 
         udf.close();
     }
+
+    @Test
+    public void testNormalModeWithOption()
+            throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+        GenericUDF udf = new KuromojiUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal"); // mode
+        udf.initialize(argOIs);
+
+        Object mode = PrivilegedAccessor.getValue(udf, "_mode");
+        Assert.assertEquals(Mode.NORMAL, mode);
+
+        DeferredObject[] args = new DeferredObject[1];
+        args[0] = new DeferredObject() {
+            public Text get() throws HiveException {
+                return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+            }
+
+            @Override
+            public void prepare(int arg) throws HiveException {}
+        };
+        Object result = udf.evaluate(args);
+        Assert.assertThat(Arrays.asList(new Text("クロモジ"), new Text("japaneseanalyzer"),
+            new Text("使う"), new Text("みる"), new Text("テスト")), CoreMatchers.is(result));
+
+        udf.close();
+    }
+
+    @Test
+    public void testNormalModeWithPosOptions()
+            throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+        GenericUDF udf = new KuromojiUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal -pos"); // mode
+        udf.initialize(argOIs);
+
+        Object mode = PrivilegedAccessor.getValue(udf, "_mode");
+        Assert.assertEquals(Mode.NORMAL, mode);
+
+        DeferredObject[] args = new DeferredObject[1];
+        args[0] = new DeferredObject() {
+            public Text get() throws HiveException {
+                return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+            }
+
+            @Override
+            public void prepare(int arg) throws HiveException {}
+        };
+
+        Object[] result = (Object[]) udf.evaluate(args);
+        Assert.assertEquals(2, result.length);
+
+        Assert.assertEquals(Arrays.asList(new Text("クロモジ"), new Text("japaneseanalyzer"),
+            new Text("使う"), new Text("みる"), new Text("テスト")), result[0]);
+        Assert.assertEquals(Arrays.asList(new Text("名詞-一般"), new Text("名詞-一般"), new Text("動詞-自立"),
+            new Text("動詞-非自立"), new Text("名詞-サ変接続")), result[1]);
+
+        udf.close();
+    }
+
+    @Test(expected = UDFArgumentException.class)
+    public void testUnsupportedOptionArgs()
+            throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+        GenericUDF udf = new KuromojiUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+        argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal -unsupported_option"); // mode
+        udf.initialize(argOIs);
+
+        udf.close();
+    }
 }