You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2021/04/23 10:17:21 UTC
[incubator-hivemall] branch master updated: [HIVEMALL-309] Enhance
tokenize_ko to support stopwords and external user dict
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 782c6e6 [HIVEMALL-309] Enhance tokenize_ko to support stopwords and external user dict
782c6e6 is described below
commit 782c6e6025d582ba7a5e49f76ab6b8848098319d
Author: Makoto Yui <my...@apache.org>
AuthorDate: Fri Apr 23 19:17:14 2021 +0900
[HIVEMALL-309] Enhance tokenize_ko to support stopwords and external user dict
## What changes were proposed in this pull request?
Enhance tokenize_ko to support stopwords and external user dict
## What type of PR is it?
Improvement
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-309
## How was this patch tested?
unit tests, manual tests on EMR
## How to use this feature?
```sql
-- default stopword (null), default stoptags (null), custom dict
select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null, array('C++'));
> ["나","c++","언어","프로그래밍","언어","사랑"]
select tokenize_ko('나는 c++ 프로그래밍을 즐긴다.', '-mode discard', null, null, 'https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt');
> ["나","c++","프로그래밍","즐기"]
```
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <my...@apache.org>
Closes #238 from myui/korean-enhancement.
---
LICENSE | 8 +
docs/gitbook/misc/funcs.md | 6 +-
docs/gitbook/misc/tokenizer.md | 100 ++-
.../hivemall/nlp/tokenizer/KuromojiNEologdUDF.java | 11 +-
.../java/hivemall/nlp/tokenizer/KuromojiUDF.java | 11 +-
.../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 7 +-
.../java/hivemall/nlp/tokenizer/TokenizeKoUDF.java | 274 +++++++--
.../hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java | 118 ++++
.../resources/META-INF/LICENSE-stopwords-ko.txt | 21 +
.../hivemall/nlp/tokenizer/ext/stopwords-ko.txt | 680 +++++++++++++++++++++
.../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 18 +
.../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 6 +-
.../hivemall/nlp/tokenizer/TokenizeKoUDFTest.java | 251 +++++---
.../nlp/tokenizer/ext/KoreanAnalyzerTest.java | 88 +++
.../hivemall/nlp/tokenizer/ext/userdict-ko.txt | 11 +
15 files changed, 1445 insertions(+), 165 deletions(-)
diff --git a/LICENSE b/LICENSE
index 5d99f40..b0940b8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -296,6 +296,14 @@ For details, see http://fontawesome.io/
src/site/resources/LICENSE-font_awesome-css.txt
+This product bundles collection of stopwords for Korean language
+which is licensed under the MIT license, specifically for tokenize_ko UDF.
+For details, see https://github.com/stopwords-iso/stopwords-ko
+
+ You can find a copy of the License at
+
+ nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
+
---------------------------------------------------------------------------
The SIL Open Font License (https://opensource.org/licenses/OFL-1.1)
---------------------------------------------------------------------------
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index b40019a..3470695 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -1059,7 +1059,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
- `tokenize_cn(String line [, const list<string> stopWords])` - returns tokenized strings in array<string>
-- `tokenize_ja(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)`]) - returns tokenized strings in array<string>
+- `tokenize_ja(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)`]) - returns tokenized strings in array<string>
```sql
select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
@@ -1067,7 +1067,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
```
-- `tokenize_ja_neologd(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)`]) - returns tokenized strings in array<string>
+- `tokenize_ja_neologd(String line [, const string mode = "normal", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)`]) - returns tokenized strings in array<string>
```sql
select tokenize_ja_neologd("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
@@ -1075,7 +1075,7 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
```
-- `tokenize_ko(String line [, const array<string> userDict, const string mode = "discard", const array<string> stopTags, boolean outputUnknownUnigrams])` - returns tokenized strings in array<string>
+- `tokenize_ko(String line [, const string mode = "discard" (or const string opts)`, const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)]) - returns tokenized strings in array<string>
```sql
select tokenize_ko("소설 무궁화꽃이 피었습니다.");
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 5ddc93e..3992e2c 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -105,6 +105,8 @@ select stoptags_exclude(array("名詞-固有名詞"));
詞-形容詞接続","接頭詞-数接","未知語","記号","記号-アルファベット","記号-一般","記号-句点","記号-括弧閉
","記号-括弧開","記号-空白","記号-読点","語断片","連体詞","非言語音"]
+### Custom dictionary
+
Moreover, the fifth argument `userDict` enables you to register a user-defined custom dictionary in [Kuromoji official format](https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt):
```sql
@@ -136,8 +138,7 @@ select tokenize_ja("日本経済新聞&関西国際空港", "normal", null, nu
For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
-
-## Part-of-speech
+### Part-of-speech
From Hivemall v0.6.0, the second argument can also accept the following option format:
@@ -196,12 +197,32 @@ Korean tokenizer internally uses [lucene-analyzers-nori](analyzers-nori: Korean M
The signature of the UDF is as follows:
```sql
-tokenize_ko(String line [,
- const array<string> userDict,
- const string mode = "discard",
- const array<string> stopTags,
- boolean outputUnknownUnigrams
- ]) - returns tokenized strings in array<string>
+tokenize_ko(
+ String line [, const string mode = "discard" (or const string opts),
+ const array<string> stopWords,
+ const array<string>
+ stopTags,
+ const array<string> userDict (or const string userDictURL)]
+) - returns tokenized strings in array<string>
+```
+
+> #### Note
+> Instead of mode, the 2nd argument can take options starting with `-`.
+
+You can get usage as follows:
+
+```sql
+select tokenize_ko("", "-help");
+
+usage: tokenize_ko(String line [, const string mode = "discard" (or const
+ string opts), const array<string> stopWords, const array<string>
+ stopTags, const array<string> userDict (or const string
+ userDictURL)]) - returns tokenized strings in array<string> [-help]
+ [-mode <arg>] [-outputUnknownUnigrams]
+ -help Show function help
+ -mode <arg> The tokenization mode. One of ['none', 'discard'
+ (default), 'mixed']
+ -outputUnknownUnigrams outputs unigrams for unknown words.
```
> #### Note
@@ -214,24 +235,69 @@ See the following examples for the usage.
select tokenize_ko();
> 8.8.2
-select tokenize_ko("소설 무궁화꽃이 피었습니다.");
+select tokenize_ko('소설 무궁화꽃이 피었습니다.');
> ["소설","무궁","화","꽃","피"]
-select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "mixed");
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode discard');
+> ["소설","무궁","화","꽃","피"]
+
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'mixed');
> ["소설","무궁화","무궁","화","꽃","피"]
-select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "discard", array("E", "VV"));
-> ["소설","무궁","화","꽃","이"]
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode mixed');
+> ["소설","무궁화","무궁","화","꽃","피"]
-select tokenize_ko("Hello, world.", null, "none", array(), true);
-> ["h","e","l","l","o","w","o","r","l","d"]
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode none');
+> ["소설","무궁화","꽃","피"]
-select tokenize_ko("Hello, world.", null, "none", array(), false);
+select tokenize_ko('Hello, world.', '-mode none');
> ["hello","world"]
-select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", null, "discard", array());
+select tokenize_ko('Hello, world.', '-mode none -outputUnknownUnigrams');
+> ["h","e","l","l","o","w","o","r","l","d"]
+
+-- default stopword (null), with stoptags
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E'));
+> ["소설","무궁","화","꽃","이","피"]
+
+select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E', 'VV'));
+> ["소설","무궁","화","꽃","이"]
+
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard');
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), null);
> ["나","는","c","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
-select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", array("C++"), "discard", array());
+-- default stopword (null), default stoptags (null)
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard');
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null);
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+-- no stopword (empty array), default stoptags (null)
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array());
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), null);
+> ["나","c","언어","프로그래밍","언어","사랑"]
+
+-- no stopword (empty array), no stoptags (empty array), custom dict
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', array(), array(), array('C++'));
> ["나","는","c++","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
+
+-- default stopword (null), default stoptags (null), custom dict
+select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard', null, null, array('C++'));
+> ["나","c++","언어","프로그래밍","언어","사랑"]
+```
+
+### Custom dictionary
+
+Moreover, the fifth argument `userDictURL` enables you to register a user-defined custom dictionary placed on an http/https-accessible external site. Find the dictionary format [here from Lucene's one](https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt).
+
+
+```sql
+select tokenize_ko('나는 c++ 프로그래밍을 즐긴다.', '-mode discard', null, null, 'https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt');
+
+> ["나","c++","프로그래밍","즐기"]
```
+
+> #### Note
+> Dictionary SHOULD be accessible through http/https protocol. And, it SHOULD be compressed using gzip with `.gz` suffix because the maximum dictionary size is limited to 32MB and read timeout is set to 60 sec. Also, connection must be established in 10 sec.
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index e7e4ace..b41d4dc 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -66,7 +66,7 @@ import org.apache.lucene.analysis.ja.neologd.tokenattributes.PartOfSpeechAttribu
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@Description(name = "tokenize_ja_neologd",
- value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
+ value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
+ " - returns tokenized strings in array<string>",
extended = "select tokenize_ja_neologd(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
+ "\n"
@@ -274,13 +274,15 @@ public final class KuromojiNEologdUDF extends UDFWithOptions {
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
throws UDFArgumentException {
+ CharArraySet stopWords = JapaneseAnalyzer.getDefaultStopSet();
if (array == null) {
- return JapaneseAnalyzer.getDefaultStopSet();
+ return stopWords;
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
- return new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+ stopWords.addAll(Arrays.asList(array));
+ return stopWords;
}
@Nonnull
@@ -313,6 +315,9 @@ public final class KuromojiNEologdUDF extends UDFWithOptions {
if (userDictArray == null) {
return null;
}
+ if (userDictArray.length == 0) {
+ return null;
+ }
final StringBuilder builder = new StringBuilder();
for (String row : userDictArray) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 8f05782..07059b2 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -66,7 +66,7 @@ import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@Description(name = "tokenize_ja",
- value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])"
+ value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
+ " - returns tokenized strings in array<string>",
extended = "select tokenize_ja(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n"
+ "\n"
@@ -274,13 +274,15 @@ public final class KuromojiUDF extends UDFWithOptions {
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
throws UDFArgumentException {
+ CharArraySet stopWords = JapaneseAnalyzer.getDefaultStopSet();
if (array == null) {
- return JapaneseAnalyzer.getDefaultStopSet();
+ return stopWords;
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
- return new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+ stopWords.addAll(Arrays.asList(array));
+ return stopWords;
}
@Nonnull
@@ -313,6 +315,9 @@ public final class KuromojiUDF extends UDFWithOptions {
if (userDictArray == null) {
return null;
}
+ if (userDictArray.length == 0) {
+ return null;
+ }
final StringBuilder builder = new StringBuilder();
for (String row : userDictArray) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
index 8bb5db9..aec8c3a 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -119,14 +119,15 @@ public final class SmartcnUDF extends GenericUDF {
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
throws UDFArgumentException {
+ CharArraySet stopWords = SmartChineseAnalyzer.getDefaultStopSet();
if (array == null) {
- return SmartChineseAnalyzer.getDefaultStopSet();
+ return stopWords;
}
if (array.length == 0) {
return CharArraySet.EMPTY_SET;
}
- CharArraySet results = new CharArraySet(Arrays.asList(array), true /* ignoreCase */);
- return results;
+ stopWords.addAll(Arrays.asList(array));
+ return stopWords;
}
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index 8c2a939..fb61633 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -18,12 +18,22 @@
*/
package hivemall.nlp.tokenizer;
+import hivemall.UDFWithOptions;
+import hivemall.nlp.tokenizer.ext.KoreanAnalyzer;
import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.HttpUtils;
import hivemall.utils.io.IOUtils;
+import hivemall.utils.lang.ExceptionUtils;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
+import java.net.HttpURLConnection;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -36,18 +46,18 @@ import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ko.KoreanAnalyzer;
-import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
import org.apache.lucene.analysis.ko.POS;
@@ -55,36 +65,83 @@ import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@Description(name = "tokenize_ko",
- value = "_FUNC_(String line [, const array<string> userDict, const string mode = \"discard\", const array<string> stopTags, boolean outputUnknownUnigrams])"
+ value = "_FUNC_(String line [, const string mode = \"discard\" (or const string opts), const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or const string userDictURL)])"
+ " - returns tokenized strings in array<string>",
extended = "select tokenize_ko(\"소설 무궁화꽃이 피었습니다.\");\n" + "\n"
+ "> [\"소설\",\"무궁\",\"화\",\"꽃\",\"피\"]\n")
@UDFType(deterministic = true, stateful = false)
-public final class TokenizeKoUDF extends GenericUDF {
-
- @Nullable
- private UserDictionary userDict;
+public final class TokenizeKoUDF extends UDFWithOptions {
+ private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec
+ private static final int READ_TIMEOUT_MS = 60000; // 60 sec
+ private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB
private DecompoundMode mode;
+ @Nullable
+ private String[] stopWordsArray;
private Set<POS.Tag> stopTags;
- private boolean outputUnknownUnigrams;
+ private boolean outputUnknownUnigrams = false;
+
+ @Nullable
+ private Object userDictObj; // String[] or String
private transient KoreanAnalyzer analyzer;
@Override
+ protected Options getOptions() {
+ Options opts = new Options();
+ opts.addOption("mode", true,
+ "The tokenization mode. One of ['none', 'discard' (default), 'mixed']");
+ opts.addOption("outputUnknownUnigrams", false, "outputs unigrams for unknown words.");
+ return opts;
+ }
+
+ @Override
+ protected CommandLine processOptions(String optionValue) throws UDFArgumentException {
+ CommandLine cl = parseOptions(optionValue);
+ if (cl.hasOption("mode")) {
+ String modeStr = cl.getOptionValue("mode");
+ this.mode = decompoundMode(modeStr);
+ }
+ this.outputUnknownUnigrams = cl.hasOption("outputUnknownUnigrams");
+ return cl;
+ }
+
+ @Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
final int arglen = arguments.length;
- if (arglen > 5) {
- throw new UDFArgumentException(
- "Invalid number of arguments for `tokenize_ko`: " + arglen);
+ if (arglen > 6) {
+ showHelp("Invalid number of arguments for `tokenize_ko`: " + arglen);
}
- this.userDict = (arglen >= 3) ? parseUserDict(arguments[1]) : null;
- this.mode = (arglen >= 3) ? parseDecompoundMode(arguments[2])
- : KoreanTokenizer.DEFAULT_DECOMPOUND;
- this.stopTags = (arglen >= 4) ? parseStopTags(arguments[3])
- : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
- this.outputUnknownUnigrams = (arglen >= 5) && HiveUtils.getConstBoolean(arguments[4]);
+ this.mode = KoreanTokenizer.DEFAULT_DECOMPOUND;
+ if (arglen >= 2) {
+ String arg1 = HiveUtils.getConstString(arguments[1]);
+ if (arg1 != null) {
+ if (arg1.startsWith("-")) {
+ processOptions(arg1);
+ } else {
+ this.mode = decompoundMode(arg1);
+ }
+ }
+ }
+
+ if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) {
+ this.stopWordsArray = HiveUtils.getConstStringArray(arguments[2]);
+ }
+
+ this.stopTags =
+ (arglen >= 4) ? stopTags(arguments[3]) : KoreanAnalyzer.getDefaultStopTags();
+
+ if (arglen >= 5) {
+ if (HiveUtils.isConstListOI(arguments[4])) {
+ this.userDictObj = HiveUtils.getConstStringArray(arguments[4]);
+ } else if (HiveUtils.isConstString(arguments[4])) {
+ this.userDictObj = HiveUtils.getConstString(arguments[4]);
+ } else {
+ throw new UDFArgumentException(
+ "User dictionary MUST be given as an array of constant string or constant string (URL)");
+ }
+ }
this.analyzer = null;
@@ -106,7 +163,17 @@ public final class TokenizeKoUDF extends GenericUDF {
}
if (analyzer == null) {
- this.analyzer = new KoreanAnalyzer(userDict, mode, stopTags, outputUnknownUnigrams);
+ CharArraySet stopWords = stopWords(stopWordsArray);
+
+ UserDictionary userDict = null;
+ if (userDictObj instanceof String[]) {
+ userDict = userDictionary((String[]) userDictObj);
+ } else if (userDictObj instanceof String) {
+ userDict = userDictionary((String) userDictObj);
+ }
+
+ this.analyzer =
+ new KoreanAnalyzer(userDict, mode, stopWords, stopTags, outputUnknownUnigrams);
}
Object arg0 = arguments[0].get();
@@ -137,42 +204,9 @@ public final class TokenizeKoUDF extends GenericUDF {
IOUtils.closeQuietly(analyzer);
}
- @Nullable
- private static UserDictionary parseUserDict(@Nonnull final ObjectInspector oi)
- throws UDFArgumentException {
- if (HiveUtils.isVoidOI(oi)) {
- return null;
- }
- final String[] array = HiveUtils.getConstStringArray(oi);
- if (array == null) {
- return null;
- }
- final int length = array.length;
- if (length == 0) {
- return null;
- }
- final StringBuilder builder = new StringBuilder();
- for (int i = 0; i < length; i++) {
- String row = array[i];
- if (row != null) {
- builder.append(row).append('\n');
- }
- }
-
- final Reader reader = new StringReader(builder.toString());
- try {
- return UserDictionary.open(reader); // return null if empty
- } catch (Throwable e) {
- throw new UDFArgumentException(
- "Failed to create user dictionary based on the given array<string>: "
- + builder.toString());
- }
- }
-
@Nonnull
- private static DecompoundMode parseDecompoundMode(@Nonnull final ObjectInspector oi)
+ private static DecompoundMode decompoundMode(@Nullable final String arg)
throws UDFArgumentException {
- String arg = HiveUtils.getConstString(oi);
if (arg == null) {
return KoreanTokenizer.DEFAULT_DECOMPOUND;
}
@@ -191,14 +225,28 @@ public final class TokenizeKoUDF extends GenericUDF {
}
@Nonnull
- private static Set<POS.Tag> parseStopTags(@Nonnull final ObjectInspector oi)
+ private static CharArraySet stopWords(@Nullable final String[] array)
+ throws UDFArgumentException {
+ final CharArraySet stopWords = KoreanAnalyzer.getDefaultStopSet();
+ if (array == null) {
+ return stopWords;
+ }
+ if (array.length == 0) {
+ return CharArraySet.EMPTY_SET;
+ }
+ stopWords.addAll(Arrays.asList(array));
+ return stopWords;
+ }
+
+ @Nonnull
+ private static Set<POS.Tag> stopTags(@Nonnull final ObjectInspector oi)
throws UDFArgumentException {
if (HiveUtils.isVoidOI(oi)) {
- return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ return KoreanAnalyzer.getDefaultStopTags();
}
final String[] array = HiveUtils.getConstStringArray(oi);
if (array == null) {
- return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ return KoreanAnalyzer.getDefaultStopTags();
}
final int length = array.length;
if (length == 0) {
@@ -219,6 +267,120 @@ public final class TokenizeKoUDF extends GenericUDF {
return stopTags;
}
+
+ @Nullable
+ private static UserDictionary userDictionary(@Nullable final String[] userDictArray)
+ throws UDFArgumentException {
+ if (userDictArray == null) {
+ return null;
+ }
+ if (userDictArray.length == 0) {
+ return null;
+ }
+
+ final StringBuilder builder = new StringBuilder();
+ for (String row : userDictArray) {
+ builder.append(row).append('\n');
+ }
+ final Reader reader = new StringReader(builder.toString());
+ try {
+ return UserDictionary.open(reader); // return null if empty
+ } catch (Throwable e) {
+ throw new UDFArgumentException(
+ "Failed to create user dictionary based on the given array<string>: "
+ + builder.toString() + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ }
+ }
+
+ @Nullable
+ private static UserDictionary userDictionary(@Nonnull final ObjectInspector oi)
+ throws UDFArgumentException {
+ if (HiveUtils.isVoidOI(oi)) {
+ return null;
+ }
+ final String[] array = HiveUtils.getConstStringArray(oi);
+ if (array == null) {
+ return null;
+ }
+ final int length = array.length;
+ if (length == 0) {
+ return null;
+ }
+ final StringBuilder builder = new StringBuilder();
+ for (int i = 0; i < length; i++) {
+ String row = array[i];
+ if (row != null) {
+ builder.append(row).append('\n');
+ }
+ }
+
+ final Reader reader = new StringReader(builder.toString());
+ try {
+ return UserDictionary.open(reader); // return null if empty
+ } catch (Throwable e) {
+ throw new UDFArgumentException(
+ "Failed to create user dictionary based on the given array<string>: "
+ + builder.toString());
+ }
+ }
+
+
+ @Nullable
+ private static UserDictionary userDictionary(@Nullable final String userDictURL)
+ throws UDFArgumentException {
+ if (userDictURL == null) {
+ return null;
+ }
+
+ final HttpURLConnection conn;
+ try {
+ conn = HttpUtils.getHttpURLConnection(userDictURL);
+ } catch (IllegalArgumentException | IOException e) {
+ throw new UDFArgumentException("Failed to create HTTP connection to the URL: "
+ + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ }
+
+ // allow to read as a compressed GZIP file for efficiency
+ conn.setRequestProperty("Accept-Encoding", "gzip");
+
+ conn.setConnectTimeout(CONNECT_TIMEOUT_MS); // throw exception from connect()
+ conn.setReadTimeout(READ_TIMEOUT_MS); // throw exception from getXXX() methods
+
+ final int responseCode;
+ try {
+ responseCode = conn.getResponseCode();
+ } catch (IOException e) {
+ throw new UDFArgumentException("Failed to get response code: " + userDictURL + '\n'
+ + ExceptionUtils.prettyPrintStackTrace(e));
+ }
+ if (responseCode != 200) {
+ throw new UDFArgumentException("Got invalid response code: " + responseCode);
+ }
+
+ final InputStream is;
+ try {
+ is = IOUtils.decodeInputStream(
+ HttpUtils.getLimitedInputStream(conn, MAX_INPUT_STREAM_SIZE));
+ } catch (NullPointerException | IOException e) {
+ throw new UDFArgumentException("Failed to get input stream from the connection: "
+ + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ }
+
+ CharsetDecoder decoder =
+ StandardCharsets.UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ final Reader reader = new InputStreamReader(is, decoder);
+ try {
+ return UserDictionary.open(reader); // return null if empty
+ } catch (Throwable e) {
+ throw new UDFArgumentException(
+ "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
+ + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ }
+ }
+
+
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
throws IOException {
// instantiate an attribute placeholder once
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java b/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java
new file mode 100644
index 0000000..48a6569
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzer.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer.ext;
+
+import static org.apache.lucene.analysis.TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
+
+import java.io.IOException;
+import java.util.Set;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.KoreanReadingFormFilter;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
+import org.apache.lucene.analysis.ko.POS;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+
+/**
+ * Korean analyzer supporting stopwords.
+ */
+public final class KoreanAnalyzer extends StopwordAnalyzerBase {
+
+ private final UserDictionary userDict;
+ private final KoreanTokenizer.DecompoundMode mode;
+ private final Set<POS.Tag> stopTags;
+ private final boolean outputUnknownUnigrams;
+
+ /**
+ * Creates a new KoreanAnalyzer.
+ */
+ public KoreanAnalyzer() {
+ this(null, KoreanTokenizer.DEFAULT_DECOMPOUND, DefaultSetHolder.DEFAULT_STOP_SET, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
+ }
+
+ /**
+ * Creates a new KoreanAnalyzer.
+ *
+ * @param userDict Optional: if non-null, user dictionary.
+ * @param mode Decompound mode.
+ * @param stopTags The set of part of speech that should be filtered.
+ * @param outputUnknownUnigrams If true outputs unigrams for unknown words.
+ */
+ public KoreanAnalyzer(@Nullable UserDictionary userDict, @Nonnull DecompoundMode mode,
+ @Nullable CharArraySet stopwords, @Nonnull Set<POS.Tag> stopTags,
+ boolean outputUnknownUnigrams) {
+ super(stopwords);
+ this.userDict = userDict;
+ this.mode = mode;
+ this.stopTags = stopTags;
+ this.outputUnknownUnigrams = outputUnknownUnigrams;
+ }
+
+ @Nonnull
+ public static CharArraySet getDefaultStopSet() {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ @Nonnull
+ public static Set<POS.Tag> getDefaultStopTags() {
+ return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ }
+
+ private static class DefaultSetHolder {
+ static final CharArraySet DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET =
+ loadStopwordSet(true, KoreanAnalyzer.class, "stopwords-ko.txt", "#");
+ } catch (IOException ex) {
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode,
+ outputUnknownUnigrams);
+ TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
+ stream = new KoreanReadingFormFilter(stream);
+ stream = new LowerCaseFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
+
+ @Nonnull
+ public static TokenStream normalize(@Nonnull TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
+}
diff --git a/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt b/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
new file mode 100644
index 0000000..866a3b4
--- /dev/null
+++ b/nlp/src/main/resources/META-INF/LICENSE-stopwords-ko.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Gene Diaz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt b/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
new file mode 100644
index 0000000..0a72b07
--- /dev/null
+++ b/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
@@ -0,0 +1,680 @@
+# derived from https://github.com/stopwords-iso/stopwords-ko
+!
+"
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+;
+<
+=
+>
+?
+@
+\
+^
+_
+`
+|
+~
+·
+—
+——
+‘
+’
+“
+”
+…
+、
+。
+〈
+〉
+《
+》
+가
+가까스로
+가령
+각
+각각
+각자
+각종
+갖고말하자면
+같다
+같이
+개의치않고
+거니와
+거바
+거의
+것
+것과 같이
+것들
+게다가
+게우다
+겨우
+견지에서
+결과에 이르다
+결국
+결론을 낼 수 있다
+겸사겸사
+고려하면
+고로
+곧
+공동으로
+과
+과연
+관계가 있다
+관계없이
+관련이 있다
+관하여
+관한
+관해서는
+구
+구체적으로
+구토하다
+그
+그들
+그때
+그래
+그래도
+그래서
+그러나
+그러니
+그러니까
+그러면
+그러므로
+그러한즉
+그런 까닭에
+그런데
+그런즉
+그럼
+그럼에도 불구하고
+그렇게 함으로써
+그렇지
+그렇지 않다면
+그렇지 않으면
+그렇지만
+그렇지않으면
+그리고
+그리하여
+그만이다
+그에 따르는
+그위에
+그저
+그중에서
+그치지 않다
+근거로
+근거하여
+기대여
+기점으로
+기준으로
+기타
+까닭으로
+까악
+까지
+까지 미치다
+까지도
+꽈당
+끙끙
+끼익
+나
+나머지는
+남들
+남짓
+너
+너희
+너희들
+네
+넷
+년
+논하지 않다
+놀라다
+누가 알겠는가
+누구
+다른
+다른 방면으로
+다만
+다섯
+다소
+다수
+다시 말하자면
+다시말하면
+다음
+다음에
+다음으로
+단지
+답다
+당신
+당장
+대로 하다
+대하면
+대하여
+대해 말하자면
+대해서
+댕그
+더구나
+더군다나
+더라도
+더불어
+더욱더
+더욱이는
+도달하다
+도착하다
+동시에
+동안
+된바에야
+된이상
+두번째로
+둘
+둥둥
+뒤따라
+뒤이어
+든간에
+들
+등
+등등
+딩동
+따라
+따라서
+따위
+따지지 않다
+딱
+때
+때가 되어
+때문에
+또
+또한
+뚝뚝
+라 해도
+령
+로
+로 인하여
+로부터
+로써
+륙
+를
+마음대로
+마저
+마저도
+마치
+막론하고
+만 못하다
+만약
+만약에
+만은 아니다
+만이 아니다
+만일
+만큼
+말하자면
+말할것도 없고
+매
+매번
+메쓰겁다
+몇
+모
+모두
+무렵
+무릎쓰고
+무슨
+무엇
+무엇때문에
+물론
+및
+바꾸어말하면
+바꾸어말하자면
+바꾸어서 말하면
+바꾸어서 한다면
+바꿔 말하면
+바로
+바와같이
+밖에 안된다
+반대로
+반대로 말하자면
+반드시
+버금
+보는데서
+보다더
+보드득
+본대로
+봐
+봐라
+부류의 사람들
+부터
+불구하고
+불문하고
+붕붕
+비걱거리다
+비교적
+비길수 없다
+비로소
+비록
+비슷하다
+비추어 보아
+비하면
+뿐만 아니라
+뿐만아니라
+뿐이다
+삐걱
+삐걱거리다
+사
+삼
+상대적으로 말하자면
+생각한대로
+설령
+설마
+설사
+셋
+소생
+소인
+솨
+쉿
+습니까
+습니다
+시각
+시간
+시작하여
+시초에
+시키다
+실로
+심지어
+아
+아니
+아니나다를가
+아니라면
+아니면
+아니었다면
+아래윗
+아무거나
+아무도
+아야
+아울러
+아이
+아이고
+아이구
+아이야
+아이쿠
+아하
+아홉
+안 그러면
+않기 위하여
+않기 위해서
+알 수 있다
+알았어
+앗
+앞에서
+앞의것
+야
+약간
+양자
+어
+어기여차
+어느
+어느 년도
+어느것
+어느곳
+어느때
+어느쪽
+어느해
+어디
+어때
+어떠한
+어떤
+어떤것
+어떤것들
+어떻게
+어떻해
+어이
+어째서
+어쨋든
+어쩔수 없다
+어찌
+어찌됏든
+어찌됏어
+어찌하든지
+어찌하여
+언제
+언젠가
+얼마
+얼마 안 되는 것
+얼마간
+얼마나
+얼마든지
+얼마만큼
+얼마큼
+엉엉
+에
+에 가서
+에 달려 있다
+에 대해
+에 있다
+에 한하다
+에게
+에서
+여
+여기
+여덟
+여러분
+여보시오
+여부
+여섯
+여전히
+여차
+연관되다
+연이서
+영
+영차
+옆사람
+예
+예를 들면
+예를 들자면
+예컨대
+예하면
+오
+오로지
+오르다
+오자마자
+오직
+오호
+오히려
+와
+와 같은 사람들
+와르르
+와아
+왜
+왜냐하면
+외에도
+요만큼
+요만한 것
+요만한걸
+요컨대
+우르르
+우리
+우리들
+우선
+우에 종합한것과같이
+운운
+월
+위에서 서술한바와같이
+위하여
+위해서
+윙윙
+육
+으로
+으로 인하여
+으로서
+으로써
+을
+응
+응당
+의
+의거하여
+의지하여
+의해
+의해되다
+의해서
+이
+이 되다
+이 때문에
+이 밖에
+이 외에
+이 정도의
+이것
+이곳
+이때
+이라면
+이래
+이러이러하다
+이러한
+이런
+이럴정도로
+이렇게 많은 것
+이렇게되면
+이렇게말하자면
+이렇구나
+이로 인하여
+이르기까지
+이리하여
+이만큼
+이번
+이봐
+이상
+이어서
+이었다
+이와 같다
+이와 같은
+이와 반대로
+이와같다면
+이외에도
+이용하여
+이유만으로
+이젠
+이지만
+이쪽
+이천구
+이천육
+이천칠
+이천팔
+인 듯하다
+인젠
+일
+일것이다
+일곱
+일단
+일때
+일반적으로
+일지라도
+임에 틀림없다
+입각하여
+입장에서
+잇따라
+있다
+자
+자기
+자기집
+자마자
+자신
+잠깐
+잠시
+저
+저것
+저것만큼
+저기
+저쪽
+저희
+전부
+전자
+전후
+점에서 보아
+정도에 이르다
+제
+제각기
+제외하고
+조금
+조차
+조차도
+졸졸
+좀
+좋아
+좍좍
+주룩주룩
+주저하지 않고
+줄은 몰랏다
+줄은모른다
+중에서
+중의하나
+즈음하여
+즉
+즉시
+지든지
+지만
+지말고
+진짜로
+쪽으로
+차라리
+참
+참나
+첫번째로
+쳇
+총적으로
+총적으로 말하면
+총적으로 보면
+칠
+콸콸
+쾅쾅
+쿵
+타다
+타인
+탕탕
+토하다
+통하여
+툭
+퉤
+틈타
+팍
+팔
+퍽
+펄렁
+하
+하게될것이다
+하게하다
+하겠는가
+하고 있다
+하고있었다
+하곤하였다
+하구나
+하기 때문에
+하기 위하여
+하기는한데
+하기만 하면
+하기보다는
+하기에
+하나
+하느니
+하는 김에
+하는 편이 낫다
+하는것도
+하는것만 못하다
+하는것이 낫다
+하는바
+하더라도
+하도다
+하도록시키다
+하도록하다
+하든지
+하려고하다
+하마터면
+하면 할수록
+하면된다
+하면서
+하물며
+하여금
+하여야
+하자마자
+하지 않는다면
+하지 않도록
+하지마
+하지마라
+하지만
+하하
+한 까닭에
+한 이유는
+한 후
+한다면
+한다면 몰라도
+한데
+한마디
+한적이있다
+한켠으로는
+한항목
+할 따름이다
+할 생각이다
+할 줄 안다
+할 지경이다
+할 힘이 있다
+할때
+할만하다
+할망정
+할뿐
+할수있다
+할수있어
+할줄알다
+할지라도
+할지언정
+함께
+해도된다
+해도좋다
+해봐요
+해서는 안된다
+해야한다
+해요
+했어요
+향하다
+향하여
+향해서
+허
+허걱
+허허
+헉
+헉헉
+헐떡헐떡
+형식으로 쓰여
+혹시
+혹은
+혼자
+훨씬
+휘익
+휴
+흐흐
+흥
+힘입어
+︿
+!
+#
+$
+%
+&
+(
+)
+*
++
+,
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+>
+?
+@
+[
+]
+{
+|
+}
+~
+¥
\ No newline at end of file
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 480c5e9..658f605 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -63,6 +63,24 @@ public class KuromojiUDFTest {
udf.close();
}
+
+ @Test
+ public void testShowHelp() throws IOException {
+ GenericUDF udf = new KuromojiUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-help");
+ try {
+ udf.initialize(argOIs);
+ Assert.fail("should not reach here");
+ } catch (UDFArgumentException e) {
+ String errmsg = e.getMessage();
+ Assert.assertTrue(errmsg.contains("usage:"));
+ } finally {
+ udf.close();
+ }
+ }
+
@Test
public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
index a5b7288..e0bb30c 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -84,8 +84,7 @@ public class SmartcnUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
Assert.assertTrue(tokens.size() >= 2);
@@ -110,8 +109,7 @@ public class SmartcnUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
Assert.assertTrue(tokens.size() >= 2);
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
index 5365de0..8b97747 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
@@ -18,6 +18,8 @@
*/
package hivemall.nlp.tokenizer;
+import hivemall.utils.hadoop.HiveUtils;
+
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
@@ -30,7 +32,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
-import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Before;
@@ -56,7 +57,24 @@ public class TokenizeKoUDFTest {
}
@Test
- public void test() throws HiveException, IOException {
+ public void testShowHelp() throws IOException {
+ GenericUDF udf = new TokenizeKoUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-help");
+ try {
+ udf.initialize(argOIs);
+ Assert.fail("should not reach here");
+ } catch (UDFArgumentException e) {
+ String errmsg = e.getMessage();
+ Assert.assertTrue(errmsg.contains("usage:"));
+ } finally {
+ udf.close();
+ }
+ }
+
+ @Test
+ public void testOneArgument() throws HiveException, IOException {
ObjectInspector[] argOIs = new ObjectInspector[1];
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
udf.initialize(argOIs);
@@ -68,8 +86,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -81,15 +98,27 @@ public class TokenizeKoUDFTest {
}
@Test
- public void testNullUserList() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[2];
+ public void testNullUserDict() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, null);
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // stopTags
+ argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
udf.initialize(argOIs);
+
GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
args[0] = new GenericUDF.DeferredObject() {
public Text get() throws HiveException {
@@ -97,8 +126,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -111,33 +139,27 @@ public class TokenizeKoUDFTest {
@Test
public void testNullMode() throws UDFArgumentException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[3];
+ ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
- PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, null);
udf.initialize(argOIs);
udf.close();
}
@Test
- public void testMode() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[3];
+ public void testModeMixed() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
- PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, new Text("mixed"));
udf.initialize(argOIs);
@@ -148,8 +170,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -162,34 +183,37 @@ public class TokenizeKoUDFTest {
@Test(expected = UDFArgumentException.class)
public void testInvalidMode() throws IOException, HiveException {
- ObjectInspector[] argOIs = new ObjectInspector[3];
+ ObjectInspector[] argOIs = new ObjectInspector[2];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
- PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, new Text("unsupported mode"));
udf.initialize(argOIs);
udf.close();
}
@Test
- public void testNonnullUserList() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[3];
+ public void testUserDictArray() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
- PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("C++"));
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, new Text("mixed"));
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // stopTags
+ argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // userDict
+ argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("C++"));
udf.initialize(argOIs);
GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -199,8 +223,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -212,21 +235,70 @@ public class TokenizeKoUDFTest {
}
@Test
- public void testStopTags() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[4];
+ public void testUserDictUrl() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[5];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text("discard"));
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // stopTags
+ argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // userDict
+ argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text(
+ "https://raw.githubusercontent.com/apache/lucene/044d152d954f1e22aac5a53792011da54c680617/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt"));
+
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("나는 c++ 프로그래밍을 즐긴다");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {}
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(4, tokens.size());
+ Assert.assertEquals("나 c++ 프로그래밍 즐기", getString(tokens));
+
+ udf.close();
+ }
+
+ @Test
+ public void testStopTags() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[5];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, null);
+
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
// stopTags
argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "VV"));
+
+ // userDict
+ argOIs[4] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
udf.initialize(argOIs);
GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -236,8 +308,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -248,19 +319,57 @@ public class TokenizeKoUDFTest {
udf.close();
}
+
+ @Test
+ public void testWithoutDictCplusplus() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[4];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text("discard"));
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // stopTags
+ argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("나는 c++ 프로그래밍을 즐긴다");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {}
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(4, tokens.size());
+ Assert.assertEquals("나 c 프로그래밍 즐기", getString(tokens));
+
+ udf.close();
+ }
+
@Test(expected = UDFArgumentException.class)
public void testInvalidStopTag() throws UDFArgumentException, IOException {
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
- PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// mode
PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ argOIs[1] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
stringType, null);
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
// stopTags
argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "?"));
@@ -270,25 +379,21 @@ public class TokenizeKoUDFTest {
@Test
public void testOutputUnknownUnigramsTrue() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[5];
+ ObjectInspector[] argOIs = new ObjectInspector[4];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+
+ // opts
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode discard -outputUnknownUnigrams"); // opts: mode + outputUnknownUnigrams
+
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
- // mode
- PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
- stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
- stringType, null);
+
// stopTags
argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
- // outputUnknowUnigrams
- PrimitiveTypeInfo booleanType = new PrimitiveTypeInfo();
- booleanType.setTypeName("boolean");
- argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
- booleanType, new BooleanWritable(true));
+
udf.initialize(argOIs);
GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -298,8 +403,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
@@ -312,25 +416,21 @@ public class TokenizeKoUDFTest {
@Test
public void testOutputUnknownUnigramsFalse() throws HiveException, IOException {
- ObjectInspector[] argOIs = new ObjectInspector[5];
+ ObjectInspector[] argOIs = new ObjectInspector[4];
// line
argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- // userDict
- argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+
+ // opts
+ argOIs[1] = HiveUtils.getConstStringObjectInspector("-mode discard"); // mode
+
+ // stopWords
+ argOIs[2] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
- // mode
- PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
- stringType.setTypeName("string");
- argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
- stringType, null);
+
// stopTags
argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
- // outputUnknowUnigrams
- PrimitiveTypeInfo booleanType = new PrimitiveTypeInfo();
- booleanType.setTypeName("boolean");
- argOIs[4] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
- booleanType, new BooleanWritable(false));
+
udf.initialize(argOIs);
GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
@@ -340,8 +440,7 @@ public class TokenizeKoUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {
- }
+ public void prepare(int arg) throws HiveException {}
};
List<Text> tokens = udf.evaluate(args);
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java
new file mode 100644
index 0000000..6d44c1b
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/ext/KoreanAnalyzerTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer.ext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class KoreanAnalyzerTest {
+
+ @Test
+ public void testStopwords() throws IOException {
+ KoreanAnalyzer analyzer = new KoreanAnalyzer();
+ Assert.assertTrue(analyzer.getStopwordSet().size() > 10);
+
+ List<String> results = analyzeTokens(analyzer.tokenStream("", "소설 무궁화꽃이 피었습니다."));
+ Assert.assertEquals(5, results.size());
+ analyzer.close();
+ }
+
+ @Test
+ public void testUserDict() {
+ UserDictionary dict = readDict();
+ Assert.assertNotNull(dict);
+ }
+
+ @Nonnull
+ public static UserDictionary readDict() {
+ InputStream is = KoreanAnalyzer.class.getResourceAsStream("userdict-ko.txt");
+ if (is == null) {
+ throw new RuntimeException("Cannot find userdict-ko.txt in test classpath!");
+ }
+ try {
+ try {
+ Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
+ return UserDictionary.open(reader);
+ } finally {
+ is.close();
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+
+ @Nonnull
+ private static List<String> analyzeTokens(@Nonnull TokenStream stream) throws IOException {
+ final List<String> results = new ArrayList<String>();
+
+ // instantiate an attribute placeholder once
+ CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+ stream.reset();
+
+ while (stream.incrementToken()) {
+ String term = termAttr.toString();
+ results.add(term);
+ }
+ return results;
+ }
+
+}
diff --git a/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt b/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt
new file mode 100644
index 0000000..ffd3613
--- /dev/null
+++ b/nlp/src/test/resources/hivemall/nlp/tokenizer/ext/userdict-ko.txt
@@ -0,0 +1,11 @@
+# Additional nouns
+c++
+C샤프
+세종
+세종시 세종 시
+대한민국날씨
+대한민국
+날씨
+21세기대한민국
+세기
+