You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/04/19 07:04:08 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-251] Add
option to return PartOfSpeech information for tokenize_ja
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 054e967 [HIVEMALL-251] Add option to return PartOfSpeech information for tokenize_ja
054e967 is described below
commit 054e9672d8cbc13eb7bd330e25dc3f362701f0bf
Author: Makoto Yui <my...@apache.org>
AuthorDate: Fri Apr 19 16:04:01 2019 +0900
[HIVEMALL-251] Add option to return PartOfSpeech information for tokenize_ja
## What changes were proposed in this pull request?
Add option to return PartOfSpeech information for `tokenize_ja` UDF.
## What type of PR is it?
Feature, Improvement
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-251
## How was this patch tested?
unit tests and manual tests on EMR
## How to use this feature?
```sql
WITH tmp as (
select
tokenize_ja('kuromojiを使った分かち書きのテストです。','-mode search -pos') as r
)
select
r.tokens,
r.pos,
r.tokens[0] as token0,
r.pos[0] as pos0
from
tmp;
```
| tokens |pos | token0 | pos0 |
|:-:|:-:|:-:|:-:|
| ["kuromoji","使う","分かち書き","テスト"] | ["名詞-一般","動詞-自立","名詞-一般","名詞-サ変接続"] | kuromoji | 名詞-一般 |
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <my...@apache.org>
Closes #191 from myui/HIVEMALL-251.
---
.../java/hivemall/UDAFEvaluatorWithOptions.java | 8 +-
core/src/main/java/hivemall/UDFWithOptions.java | 12 +-
core/src/main/java/hivemall/UDTFWithOptions.java | 7 +-
.../java/hivemall/smile/data/AttributeType.java | 2 +-
.../java/hivemall/utils/lang/CommandLineUtils.java | 3 +
docs/gitbook/misc/tokenizer.md | 36 ++++++
.../java/hivemall/nlp/tokenizer/KuromojiUDF.java | 138 +++++++++++++++++----
.../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 104 ++++++++++++++--
8 files changed, 277 insertions(+), 33 deletions(-)
diff --git a/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java b/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
index de1564c..7817beb 100644
--- a/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
+++ b/core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java
@@ -88,7 +88,13 @@ public abstract class UDAFEvaluatorWithOptions extends GenericUDAFEvaluator {
String[] args = optionValue.split("\\s+");
Options opts = getOptions();
opts.addOption("help", false, "Show function help");
- CommandLine cl = CommandLineUtils.parseOptions(args, opts);
+
+ final CommandLine cl;
+ try {
+ cl = CommandLineUtils.parseOptions(args, opts);
+ } catch (IllegalArgumentException e) {
+ throw new UDFArgumentException(e);
+ }
if (cl.hasOption("help")) {
Description funcDesc = getClass().getAnnotation(Description.class);
diff --git a/core/src/main/java/hivemall/UDFWithOptions.java b/core/src/main/java/hivemall/UDFWithOptions.java
index f8272ce..4fb80a5 100644
--- a/core/src/main/java/hivemall/UDFWithOptions.java
+++ b/core/src/main/java/hivemall/UDFWithOptions.java
@@ -86,7 +86,13 @@ public abstract class UDFWithOptions extends GenericUDF {
String[] args = optionValue.split("\\s+");
Options opts = getOptions();
opts.addOption("help", false, "Show function help");
- CommandLine cl = CommandLineUtils.parseOptions(args, opts);
+
+ final CommandLine cl;
+ try {
+ cl = CommandLineUtils.parseOptions(args, opts);
+ } catch (IllegalArgumentException e) {
+ throw new UDFArgumentException(e);
+ }
if (cl.hasOption("help")) {
showHelp(opts);
@@ -95,6 +101,10 @@ public abstract class UDFWithOptions extends GenericUDF {
return cl;
}
+ protected void showHelp() throws UDFArgumentException {
+ showHelp(getOptions(), null);
+ }
+
protected void showHelp(@Nullable String errMsg) throws UDFArgumentException {
showHelp(getOptions(), errMsg);
}
diff --git a/core/src/main/java/hivemall/UDTFWithOptions.java b/core/src/main/java/hivemall/UDTFWithOptions.java
index 43d9023..a71395e 100644
--- a/core/src/main/java/hivemall/UDTFWithOptions.java
+++ b/core/src/main/java/hivemall/UDTFWithOptions.java
@@ -94,8 +94,13 @@ public abstract class UDTFWithOptions extends GenericUDTF {
String[] args = optionValue.split("\\s+");
Options opts = getOptions();
opts.addOption("help", false, "Show function help");
- CommandLine cl = CommandLineUtils.parseOptions(args, opts);
+ final CommandLine cl;
+ try {
+ cl = CommandLineUtils.parseOptions(args, opts);
+ } catch (IllegalArgumentException e) {
+ throw new UDFArgumentException(e);
+ }
if (cl.hasOption("help")) {
showHelp(opts);
}
diff --git a/core/src/main/java/hivemall/smile/data/AttributeType.java b/core/src/main/java/hivemall/smile/data/AttributeType.java
index 559ad2d..7aa0ef0 100644
--- a/core/src/main/java/hivemall/smile/data/AttributeType.java
+++ b/core/src/main/java/hivemall/smile/data/AttributeType.java
@@ -64,4 +64,4 @@ public enum AttributeType {
return type;
}
-}
\ No newline at end of file
+}
diff --git a/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java b/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
index 61e844e..a8d406f 100644
--- a/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/CommandLineUtils.java
@@ -18,6 +18,8 @@
*/
package hivemall.utils.lang;
+import javax.annotation.Nonnull;
+
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
@@ -27,6 +29,7 @@ public final class CommandLineUtils {
private CommandLineUtils() {}
+ @Nonnull
public static CommandLine parseOptions(final String[] args, final Options opts) {
final BasicParser parser = new BasicParser();
final CommandLine cl;
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index e578198..0c0e97e 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -110,8 +110,44 @@ select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, nu
> ["日本","経済","新聞","関西","国際","空港"]
+Dictionary SHOULD be accessible through the http/https protocol, and it SHOULD be gzip-compressed with a `.gz` suffix because the maximum dictionary size is limited to 32MB and the read timeout is set to 60 sec. Also, the connection must be established within 10 sec.
+
+If you want to use HTTP Basic Authentication, please use the following form: `https://user:password@www.siteurl.com/my_dict.txt.gz` (see Sec 3.1 of [rfc1738](https://www.ietf.org/rfc/rfc1738.txt))
+
For detailed APIs, please refer to the Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
+## Part-of-speech
+
+The second argument can also accept the following option format:
+
+```
+ -mode <arg> The tokenization mode. One of ['normal', 'search',
+ 'extended', 'default' (normal)]
+ -pos Return part-of-speech information
+```
+
+Then, you can get part-of-speech information as follows:
+
+```sql
+WITH tmp as (
+ select
+ tokenize_ja('kuromojiを使った分かち書きのテストです。','-mode search -pos') as r
+)
+select
+ r.tokens,
+ r.pos,
+ r.tokens[0] as token0,
+ r.pos[0] as pos0
+from
+ tmp;
+```
+
+| tokens |pos | token0 | pos0 |
+|:-:|:-:|:-:|:-:|
+| ["kuromoji","使う","分かち書き","テスト"] | ["名詞-一般","動詞-自立","名詞-一般","名詞-サ変接続"] | kuromoji | 名詞-一般 |
+
+Note that when `-pos` option is specified, `tokenize_ja` returns a struct record containing `array<string> tokens` and `array<string> pos` as the elements.
+
## Chinese Tokenizer
Chinese text tokenizer UDF uses [SmartChineseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 48b566f..0317d2a 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -18,6 +18,7 @@
*/
package hivemall.nlp.tokenizer;
+import hivemall.UDFWithOptions;
import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.HttpUtils;
import hivemall.utils.io.IOUtils;
@@ -37,16 +38,18 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
+import java.util.Objects;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -56,9 +59,12 @@ import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
+import com.clearspring.analytics.util.Preconditions;
+
@Description(name = "tokenize_ja",
value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
+ " - returns tokenized strings in array<string>",
@@ -66,12 +72,14 @@ import org.apache.lucene.analysis.util.CharArraySet;
+ "\n"
+ "> [\"kuromoji\",\"使う\",\"分かち書き\",\"テスト\",\"第\",\"二\",\"引数\",\"normal\",\"search\",\"extended\",\"指定\",\"デフォルト\",\"normal\",\" モード\"]\n")
@UDFType(deterministic = true, stateful = false)
-public final class KuromojiUDF extends GenericUDF {
+public final class KuromojiUDF extends UDFWithOptions {
private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec
private static final int READ_TIMEOUT_MS = 60000; // 60 sec
private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB
private Mode _mode;
+ private boolean _returnPos;
+ private transient Object[] _result;
@Nullable
private String[] _stopWordsArray;
private Set<String> _stopTags;
@@ -82,14 +90,43 @@ public final class KuromojiUDF extends GenericUDF {
private transient JapaneseAnalyzer _analyzer;
@Override
+ protected Options getOptions() {
+ Options opts = new Options();
+ opts.addOption("mode", true,
+ "The tokenization mode. One of ['normal', 'search', 'extended', 'default' (normal)]");
+ opts.addOption("pos", false, "Return part-of-speech information");
+ return opts;
+ }
+
+ @Override
+ protected CommandLine processOptions(String optionValue) throws UDFArgumentException {
+ CommandLine cl = parseOptions(optionValue);
+ if (cl.hasOption("mode")) {
+ String modeStr = cl.getOptionValue("mode");
+ this._mode = tokenizationMode(modeStr);
+ }
+ this._returnPos = cl.hasOption("pos");
+ return cl;
+ }
+
+ @Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
final int arglen = arguments.length;
if (arglen < 1 || arglen > 5) {
- throw new UDFArgumentException(
- "Invalid number of arguments for `tokenize_ja`: " + arglen);
+ showHelp("Invalid number of arguments for `tokenize_ja`: " + arglen);
}
- this._mode = (arglen >= 2) ? tokenizationMode(arguments[1]) : Mode.NORMAL;
+ this._mode = Mode.NORMAL;
+ if (arglen >= 2) {
+ String arg1 = HiveUtils.getConstString(arguments[1]);
+ if (arg1 != null) {
+ if (arg1.startsWith("-")) {
+ processOptions(arg1);
+ } else {
+ this._mode = tokenizationMode(arg1);
+ }
+ }
+ }
if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) {
this._stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
@@ -111,12 +148,25 @@ public final class KuromojiUDF extends GenericUDF {
this._analyzer = null;
- return ObjectInspectorFactory.getStandardListObjectInspector(
- PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+ if (_returnPos) {
+ this._result = new Object[2];
+ ArrayList<String> fieldNames = new ArrayList<String>();
+ ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
+ fieldNames.add("tokens");
+ fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector));
+ fieldNames.add("pos");
+ fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector));
+ return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
+ } else {
+ return ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+ }
}
@Override
- public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
if (_analyzer == null) {
CharArraySet stopWords = stopWords(_stopWordsArray);
@@ -136,20 +186,55 @@ public final class KuromojiUDF extends GenericUDF {
}
String line = arg0.toString();
- final List<Text> results = new ArrayList<Text>(32);
+ if (_returnPos) {
+ return parseLine(_analyzer, line, _result);
+ } else {
+ return parseLine(_analyzer, line);
+ }
+ }
+
+ @Nonnull
+ private static Object[] parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line,
+ @Nonnull Object[] result) throws HiveException {
+ Objects.requireNonNull(result);
+ Preconditions.checkArgument(result.length == 2);
+
+ final List<Text> tokens = new ArrayList<Text>(32);
+ final List<Text> pos = new ArrayList<Text>(32);
TokenStream stream = null;
try {
- stream = _analyzer.tokenStream("", line);
+ stream = analyzer.tokenStream("", line);
if (stream != null) {
- analyzeTokens(stream, results);
+ analyzeTokens(stream, tokens, pos);
}
} catch (IOException e) {
- IOUtils.closeQuietly(_analyzer);
+ IOUtils.closeQuietly(analyzer);
throw new HiveException(e);
} finally {
IOUtils.closeQuietly(stream);
}
- return results;
+ result[0] = tokens;
+ result[1] = pos;
+ return result;
+ }
+
+ @Nonnull
+ private static List<Text> parseLine(@Nonnull JapaneseAnalyzer analyzer, @Nonnull String line)
+ throws HiveException {
+ final List<Text> tokens = new ArrayList<Text>(32);
+ TokenStream stream = null;
+ try {
+ stream = analyzer.tokenStream("", line);
+ if (stream != null) {
+ analyzeTokens(stream, tokens);
+ }
+ } catch (IOException e) {
+ IOUtils.closeQuietly(analyzer);
+ throw new HiveException(e);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return tokens;
}
@Override
@@ -158,12 +243,7 @@ public final class KuromojiUDF extends GenericUDF {
}
@Nonnull
- private static Mode tokenizationMode(@Nonnull final ObjectInspector oi)
- throws UDFArgumentException {
- String arg = HiveUtils.getConstString(oi);
- if (arg == null) {
- return Mode.NORMAL;
- }
+ private static Mode tokenizationMode(@Nonnull final String arg) throws UDFArgumentException {
final Mode mode;
if ("NORMAL".equalsIgnoreCase(arg)) {
mode = Mode.NORMAL;
@@ -292,15 +372,31 @@ public final class KuromojiUDF extends GenericUDF {
}
}
- private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+ private static void analyzeTokens(@Nonnull final TokenStream stream,
+ @Nonnull final List<Text> tokens) throws IOException {
+ // instantiate an attribute placeholder once
+ CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+ stream.reset();
+
+ while (stream.incrementToken()) {
+ String term = termAttr.toString();
+ tokens.add(new Text(term));
+ }
+ }
+
+ private static void analyzeTokens(@Nonnull final TokenStream stream,
+ @Nonnull final List<Text> tokenResult, @Nonnull final List<Text> posResult)
throws IOException {
// instantiate an attribute placeholder once
CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+ PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
stream.reset();
while (stream.incrementToken()) {
String term = termAttr.toString();
- results.add(new Text(term));
+ tokenResult.add(new Text(term));
+ String pos = posAttr.getPartOfSpeech();
+ posResult.add(new Text(pos));
}
}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 244075d..2a3de26 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -19,9 +19,12 @@
package hivemall.nlp.tokenizer;
import hivemall.TestUtils;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.PrivilegedAccessor;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
@@ -33,6 +36,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
+import org.hamcrest.CoreMatchers;
import org.junit.Assert;
import org.junit.Test;
@@ -208,12 +213,14 @@ public class KuromojiUDFTest {
@Override
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(5, tokens.size());
udf.close();
}
+ @SuppressWarnings("unchecked")
@Test
public void testEvaluateTwoRows() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
@@ -231,7 +238,7 @@ public class KuromojiUDFTest {
@Override
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(5, tokens.size());
@@ -243,7 +250,7 @@ public class KuromojiUDFTest {
@Override
public void prepare(int arg) throws HiveException {}
};
- tokens = udf.evaluate(args);
+ tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(4, tokens.size());
@@ -268,7 +275,8 @@ public class KuromojiUDFTest {
@Override
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(182, tokens.size());
udf.close();
@@ -309,7 +317,8 @@ public class KuromojiUDFTest {
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(3, tokens.size());
@@ -349,7 +358,8 @@ public class KuromojiUDFTest {
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
udf.close();
@@ -389,7 +399,8 @@ public class KuromojiUDFTest {
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
Assert.assertEquals(7, tokens.size());
@@ -417,7 +428,8 @@ public class KuromojiUDFTest {
@Override
public void prepare(int arg) throws HiveException {}
};
- List<Text> tokens = udf.evaluate(args);
+ @SuppressWarnings("unchecked")
+ List<Text> tokens = (List<Text>) udf.evaluate(args);
Assert.assertNotNull(tokens);
// serialization after evaluation
@@ -426,4 +438,80 @@ public class KuromojiUDFTest {
udf.close();
}
+
+ @Test
+ public void testNormalModeWithOption()
+ throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+ GenericUDF udf = new KuromojiUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal"); // mode
+ udf.initialize(argOIs);
+
+ Object mode = PrivilegedAccessor.getValue(udf, "_mode");
+ Assert.assertEquals(Mode.NORMAL, mode);
+
+ DeferredObject[] args = new DeferredObject[1];
+ args[0] = new DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {}
+ };
+ Object result = udf.evaluate(args);
+ Assert.assertThat(Arrays.asList(new Text("クロモジ"), new Text("japaneseanalyzer"),
+ new Text("使う"), new Text("みる"), new Text("テスト")), CoreMatchers.is(result));
+
+ udf.close();
+ }
+
+ @Test
+ public void testNormalModeWithPosOptions()
+ throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+ GenericUDF udf = new KuromojiUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal -pos"); // mode
+ udf.initialize(argOIs);
+
+ Object mode = PrivilegedAccessor.getValue(udf, "_mode");
+ Assert.assertEquals(Mode.NORMAL, mode);
+
+ DeferredObject[] args = new DeferredObject[1];
+ args[0] = new DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("クロモジのJapaneseAnalyzerを使ってみる。テスト。");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {}
+ };
+
+ Object[] result = (Object[]) udf.evaluate(args);
+ Assert.assertEquals(2, result.length);
+
+ Assert.assertEquals(Arrays.asList(new Text("クロモジ"), new Text("japaneseanalyzer"),
+ new Text("使う"), new Text("みる"), new Text("テスト")), result[0]);
+ Assert.assertEquals(Arrays.asList(new Text("名詞-一般"), new Text("名詞-一般"), new Text("動詞-自立"),
+ new Text("動詞-非自立"), new Text("名詞-サ変接続")), result[1]);
+
+ udf.close();
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testUnsupportedOptionArgs()
+ throws IOException, HiveException, IllegalAccessException, NoSuchFieldException {
+ GenericUDF udf = new KuromojiUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; // line
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode normal -unsupported_option"); // mode
+ udf.initialize(argOIs);
+
+ udf.close();
+ }
}