You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2021/04/22 14:53:17 UTC
[incubator-hivemall] branch master updated: Implement Korean text
tokenizer
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 0ee33d2 Implement Korean text tokenizer
0ee33d2 is described below
commit 0ee33d2f59a0c5e0a47abbecfd56fb864b831462
Author: Makoto Yui <my...@apache.org>
AuthorDate: Thu Apr 22 23:53:10 2021 +0900
Implement Korean text tokenizer
## What changes were proposed in this pull request?
Implement Korean text tokenizer
## What type of PR is it?
Feature
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-307
## How was this patch tested?
unit tests and manual tests on EMR
## How to use this feature?
```sql
-- show version of lucene-analyzers-nori
select tokenize_ko();
> 8.8.2
select tokenize_ko("소설 무궁화꽃이 피었습니다.");
> ["소설","무궁","화","꽃","피"]
select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "mixed");
> ["소설","무궁화","무궁","화","꽃","피"]
select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "discard", array("E", "VV"));
> ["소설","무궁","화","꽃","이"]
select tokenize_ko("Hello, world.", null, "none", array(), true);
> ["h","e","l","l","o","w","o","r","l","d"]
select tokenize_ko("Hello, world.", null, "none", array(), false);
> ["hello","world"]
select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", null, "discard", array());
> ["나","는","c","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", array("C++"), "discard", array());
> ["나","는","c++","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
```
## Checklist
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [x] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <my...@apache.org>
Closes #237 from myui/korean_tokenizer.
---
dist/pom.xml | 7 +-
docs/Dockerfile | 19 ++
docs/gitbook/misc/funcs.md | 8 +
docs/gitbook/misc/tokenizer.md | 52 ++-
nlp/pom.xml | 19 +-
.../hivemall/nlp/tokenizer/KuromojiNEologdUDF.java | 2 +-
.../java/hivemall/nlp/tokenizer/KuromojiUDF.java | 2 +-
.../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 17 +-
.../java/hivemall/nlp/tokenizer/TokenizeKoUDF.java | 239 ++++++++++++++
.../hivemall/nlp/tokenizer/tokenizer.properties | 6 +-
.../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 23 +-
.../hivemall/nlp/tokenizer/TokenizeKoUDFTest.java | 363 +++++++++++++++++++++
resources/ddl/define-all-as-permanent.hive | 3 +
resources/ddl/define-all.hive | 3 +
resources/ddl/define-all.spark | 3 +
15 files changed, 745 insertions(+), 21 deletions(-)
diff --git a/dist/pom.xml b/dist/pom.xml
index 80104ac..f70b6fe 100644
--- a/dist/pom.xml
+++ b/dist/pom.xml
@@ -118,11 +118,12 @@
<include>com.clearspring.analytics:stream</include>
<!-- hivemall-nlp -->
<include>org.apache.hivemall:hivemall-nlp</include>
- <include>org.apache.lucene:lucene-analyzers-kuromoji</include>
- <include>org.apache.lucene:lucene-analyzers-smartcn</include>
- <include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
+ <include>org.apache.lucene:lucene-analyzers-common</include>
+ <include>org.apache.lucene:lucene-analyzers-kuromoji</include>
<include>io.github.myui:lucene-analyzers-kuromoji-neologd</include>
+ <include>org.apache.lucene:lucene-analyzers-smartcn</include>
+ <include>org.apache.lucene:lucene-analyzers-nori</include>
<!-- hivemall-xgboost -->
<include>org.apache.hivemall:hivemall-xgboost</include>
<include>io.github.myui:xgboost4j</include>
diff --git a/docs/Dockerfile b/docs/Dockerfile
index c764938..9b25cb8 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,3 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
FROM node:8-buster-slim
ARG GITBOOK_VERSION=3.2.3
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index 84f167c..b40019a 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -1075,6 +1075,14 @@ Reference: <a href="https://papers.nips.cc/paper/3848-adaptive-regularization-of
```
+- `tokenize_ko(String line [, const array<string> userDict, const string mode = "discard", const array<string> stopTags, boolean outputUnknownUnigrams])` - returns tokenized strings in array<string>
+ ```sql
+ select tokenize_ko("소설 무궁화꽃이 피었습니다.");
+
+ > ["소설","무궁","화","꽃","피"]
+
+ ```
+
# Others
- `hivemall_version()` - Returns the version of Hivemall
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 78d9970..5ddc93e 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -174,14 +174,64 @@ Note that when `-pos` option is specified, `tokenize_ja` returns a struct record
Chinese text tokenizer UDF uses [SmartChineseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
The signature of the UDF is as follows:
+
```sql
tokenize_cn(string line, optional const array<string> stopWords)
```
Its basic usage is as follows:
+
```sql
select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
-```
+
> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
+```
For detailed APIs, please refer Javadoc of [SmartChineseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well.
+
+## Korean Tokenizer
+
+Korean tokenizer internally uses [lucene-analyzers-nori](https://lucene.apache.org/core/8_8_2/analyzers-nori/index.html) (analyzers-nori: Korean Morphological Analyzer) for tokenization.
+
+The signature of the UDF is as follows:
+
+```sql
+tokenize_ko(String line [,
+ const array<string> userDict,
+ const string mode = "discard",
+ const array<string> stopTags,
+ boolean outputUnknownUnigrams
+ ]) - returns tokenized strings in array<string>
+```
+
+> #### Note
+> For detailed options, please refer to the [Lucene API document](https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/KoreanAnalyzer.html). `none`, `discard` (default), or `mixed` are supported for the mode argument.
+
+See the following examples for the usage.
+
+```sql
+-- show version of lucene-analyzers-nori
+select tokenize_ko();
+> 8.8.2
+
+select tokenize_ko("소설 무궁화꽃이 피었습니다.");
+> ["소설","무궁","화","꽃","피"]
+
+select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "mixed");
+> ["소설","무궁화","무궁","화","꽃","피"]
+
+select tokenize_ko("소설 무궁화꽃이 피었습니다.", null, "discard", array("E", "VV"));
+> ["소설","무궁","화","꽃","이"]
+
+select tokenize_ko("Hello, world.", null, "none", array(), true);
+> ["h","e","l","l","o","w","o","r","l","d"]
+
+select tokenize_ko("Hello, world.", null, "none", array(), false);
+> ["hello","world"]
+
+select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", null, "discard", array());
+> ["나","는","c","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
+
+select tokenize_ko("나는 C++ 언어를 프로그래밍 언어로 사랑한다.", array("C++"), "discard", array());
+> ["나","는","c++","언어","를","프로그래밍","언어","로","사랑","하","ᆫ다"]
+```
diff --git a/nlp/pom.xml b/nlp/pom.xml
index 55afab7..fdc8c4a 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -107,17 +107,23 @@
<scope>compile</scope>
</dependency>
<dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-smartcn</artifactId>
- <version>${lucene.version}</version>
- <scope>compile</scope>
- </dependency>
- <dependency>
<groupId>io.github.myui</groupId>
<artifactId>lucene-analyzers-kuromoji-neologd</artifactId>
<version>${lucene-analyzers-kuromoji-neologd.version}</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-smartcn</artifactId>
+ <version>${lucene.version}</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-nori</artifactId>
+ <version>${lucene.version}</version>
+ <scope>compile</scope>
+ </dependency>
<!-- test scope -->
<dependency>
@@ -135,6 +141,7 @@
</dependencies>
<build>
+ <!-- mvn resources:resources to process properties -->
<resources>
<resource>
<directory>src/main/resources</directory>
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index f3303a0..e7e4ace 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -175,7 +175,7 @@ public final class KuromojiNEologdUDF extends UDFWithOptions {
throw new HiveException("Failed to read tokenizer.properties");
}
return Collections.singletonList(
- new Text(properties.getProperty("tokenizer_ja_neologd.version")));
+ new Text(properties.getProperty("tokenize_ja_neologd.version")));
}
if (_analyzer == null) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 7902f60..8f05782 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -175,7 +175,7 @@ public final class KuromojiUDF extends UDFWithOptions {
throw new HiveException("Failed to read tokenizer.properties");
}
return Collections.singletonList(
- new Text(properties.getProperty("tokenizer_ja.version")));
+ new Text(properties.getProperty("tokenize_ja.version")));
}
if (_analyzer == null) {
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
index 93c8620..8bb5db9 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -24,7 +24,9 @@ import hivemall.utils.io.IOUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
+import java.util.Properties;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
@@ -38,10 +40,10 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.CharArraySet;
@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+ " - returns tokenized strings in array<string>")
@@ -55,7 +57,7 @@ public final class SmartcnUDF extends GenericUDF {
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
final int arglen = arguments.length;
- if (arglen < 1 || arglen > 2) {
+ if (arglen > 2) {
throw new UDFArgumentException(
"Invalid number of arguments for `tokenize_cn`: " + arglen);
}
@@ -69,6 +71,17 @@ public final class SmartcnUDF extends GenericUDF {
@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+ if (arguments.length == 0) {
+ final Properties properties = new Properties();
+ try {
+ properties.load(this.getClass().getResourceAsStream("tokenizer.properties"));
+ } catch (IOException e) {
+ throw new HiveException("Failed to read tokenizer.properties");
+ }
+ return Collections.singletonList(
+ new Text(properties.getProperty("tokenize_cn.version")));
+ }
+
SmartChineseAnalyzer analyzer = _analyzer;
if (analyzer == null) {
CharArraySet stopwords = stopWords(_stopWordsArray);
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
new file mode 100644
index 0000000..8c2a939
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.IOUtils;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.Set;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
+import org.apache.lucene.analysis.ko.POS;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+@Description(name = "tokenize_ko",
+ value = "_FUNC_(String line [, const array<string> userDict, const string mode = \"discard\", const array<string> stopTags, boolean outputUnknownUnigrams])"
+ + " - returns tokenized strings in array<string>",
+ extended = "select tokenize_ko(\"소설 무궁화꽃이 피었습니다.\");\n" + "\n"
+ + "> [\"소설\",\"무궁\",\"화\",\"꽃\",\"피\"]\n")
+@UDFType(deterministic = true, stateful = false)
+public final class TokenizeKoUDF extends GenericUDF {
+
+ @Nullable
+ private UserDictionary userDict;
+
+ private DecompoundMode mode;
+ private Set<POS.Tag> stopTags;
+ private boolean outputUnknownUnigrams;
+
+ private transient KoreanAnalyzer analyzer;
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+ final int arglen = arguments.length;
+ if (arglen > 5) {
+ throw new UDFArgumentException(
+ "Invalid number of arguments for `tokenize_ko`: " + arglen);
+ }
+
+ this.userDict = (arglen >= 3) ? parseUserDict(arguments[1]) : null;
+ this.mode = (arglen >= 3) ? parseDecompoundMode(arguments[2])
+ : KoreanTokenizer.DEFAULT_DECOMPOUND;
+ this.stopTags = (arglen >= 4) ? parseStopTags(arguments[3])
+ : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ this.outputUnknownUnigrams = (arglen >= 5) && HiveUtils.getConstBoolean(arguments[4]);
+
+ this.analyzer = null;
+
+ return ObjectInspectorFactory.getStandardListObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+ }
+
+ @Override
+ public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+ if (arguments.length == 0) {
+ final Properties properties = new Properties();
+ try {
+ properties.load(this.getClass().getResourceAsStream("tokenizer.properties"));
+ } catch (IOException e) {
+ throw new HiveException("Failed to read tokenizer.properties");
+ }
+ return Collections.singletonList(
+ new Text(properties.getProperty("tokenize_ko.version")));
+ }
+
+ if (analyzer == null) {
+ this.analyzer = new KoreanAnalyzer(userDict, mode, stopTags, outputUnknownUnigrams);
+ }
+
+ Object arg0 = arguments[0].get();
+ if (arg0 == null) {
+ return null;
+ }
+
+ String line = arg0.toString();
+
+ final List<Text> tokens = new ArrayList<Text>(32);
+ TokenStream stream = null;
+ try {
+ stream = analyzer.tokenStream("", line);
+ if (stream != null) {
+ analyzeTokens(stream, tokens);
+ }
+ } catch (IOException e) {
+ IOUtils.closeQuietly(analyzer);
+ throw new HiveException(e);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return tokens;
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.closeQuietly(analyzer);
+ }
+
+ @Nullable
+ private static UserDictionary parseUserDict(@Nonnull final ObjectInspector oi)
+ throws UDFArgumentException {
+ if (HiveUtils.isVoidOI(oi)) {
+ return null;
+ }
+ final String[] array = HiveUtils.getConstStringArray(oi);
+ if (array == null) {
+ return null;
+ }
+ final int length = array.length;
+ if (length == 0) {
+ return null;
+ }
+ final StringBuilder builder = new StringBuilder();
+ for (int i = 0; i < length; i++) {
+ String row = array[i];
+ if (row != null) {
+ builder.append(row).append('\n');
+ }
+ }
+
+ final Reader reader = new StringReader(builder.toString());
+ try {
+ return UserDictionary.open(reader); // return null if empty
+ } catch (Throwable e) {
+ throw new UDFArgumentException(
+ "Failed to create user dictionary based on the given array<string>: "
+ + builder.toString());
+ }
+ }
+
+ @Nonnull
+ private static DecompoundMode parseDecompoundMode(@Nonnull final ObjectInspector oi)
+ throws UDFArgumentException {
+ String arg = HiveUtils.getConstString(oi);
+ if (arg == null) {
+ return KoreanTokenizer.DEFAULT_DECOMPOUND;
+ }
+ final DecompoundMode mode;
+ try {
+ mode = DecompoundMode.valueOf(arg.toUpperCase(Locale.ENGLISH));
+ } catch (IllegalArgumentException e) {
+ final StringBuilder sb = new StringBuilder();
+ for (DecompoundMode v : DecompoundMode.values()) {
+ sb.append(v.toString()).append(", ");
+ }
+ throw new UDFArgumentException(
+ "Expected either " + sb.toString() + "but got an unexpected mode: " + arg);
+ }
+ return mode;
+ }
+
+ @Nonnull
+ private static Set<POS.Tag> parseStopTags(@Nonnull final ObjectInspector oi)
+ throws UDFArgumentException {
+ if (HiveUtils.isVoidOI(oi)) {
+ return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ }
+ final String[] array = HiveUtils.getConstStringArray(oi);
+ if (array == null) {
+ return KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+ }
+ final int length = array.length;
+ if (length == 0) {
+ return Collections.emptySet();
+ }
+ final Set<POS.Tag> stopTags = new HashSet<POS.Tag>(length);
+ for (int i = 0; i < length; i++) {
+ String s = array[i];
+ if (s != null) {
+ try {
+ stopTags.add(POS.resolveTag(s));
+ } catch (IllegalArgumentException e) {
+ throw new UDFArgumentException(
+ "Unrecognized POS tag has been specified as a stop tag: " + e.getMessage());
+ }
+ }
+ }
+ return stopTags;
+ }
+
+ private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+ throws IOException {
+ // instantiate an attribute placeholder once
+ CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+ stream.reset();
+
+ while (stream.incrementToken()) {
+ String term = termAttr.toString();
+ results.add(new Text(term));
+ }
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return "tokenize_ko(" + Arrays.toString(children) + ')';
+ }
+
+}
diff --git a/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties b/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
index 46e5acd..4118d1d 100644
--- a/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
+++ b/nlp/src/main/resources/hivemall/nlp/tokenizer/tokenizer.properties
@@ -1,2 +1,4 @@
-tokenizer_ja.version=${lucene.version}
-tokenizer_ja_neologd.version=${lucene-analyzers-kuromoji-neologd.version}
+tokenize_ja.version=${lucene.version}
+tokenize_ja_neologd.version=${lucene-analyzers-kuromoji-neologd.version}
+tokenize_cn.version=${lucene.version}
+tokenize_ko.version=${lucene.version}
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
index 342e48a..a5b7288 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -18,11 +18,11 @@
*/
package hivemall.nlp.tokenizer;
+import hivemall.TestUtils;
+
import java.io.IOException;
import java.util.List;
-import hivemall.TestUtils;
-
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
@@ -37,6 +37,16 @@ import org.junit.Test;
public class SmartcnUDFTest {
@Test
+ public void testNoArgument() throws IOException, HiveException {
+ GenericUDF udf = new SmartcnUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[0];
+ udf.initialize(argOIs);
+ Object result = udf.evaluate(new DeferredObject[0]);
+ Assert.assertNotNull(result);
+ udf.close();
+ }
+
+ @Test
public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
@@ -74,10 +84,11 @@ public class SmartcnUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {}
+ public void prepare(int arg) throws HiveException {
+ }
};
List<Text> tokens = udf.evaluate(args);
- Assert.assertNotNull(tokens);
+ Assert.assertTrue(tokens.size() >= 2);
udf.close();
}
@@ -99,9 +110,11 @@ public class SmartcnUDFTest {
}
@Override
- public void prepare(int arg) throws HiveException {}
+ public void prepare(int arg) throws HiveException {
+ }
};
List<Text> tokens = udf.evaluate(args);
+ Assert.assertTrue(tokens.size() >= 2);
// serialization after evaluation
serialized = TestUtils.serializeObjectByKryo(udf);
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
new file mode 100644
index 0000000..5365de0
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/TokenizeKoUDFTest.java
@@ -0,0 +1,363 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TokenizeKoUDFTest {
+
+ private TokenizeKoUDF udf;
+
+ @Before
+ public void setUp() {
+ this.udf = new TokenizeKoUDF();
+ }
+
+ @Test
+ public void testNoArgument() throws IOException, HiveException {
+ GenericUDF udf = new TokenizeKoUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[0];
+ udf.initialize(argOIs);
+ Object result = udf.evaluate(new DeferredObject[0]);
+ Assert.assertNotNull(result);
+ udf.close();
+ }
+
+ @Test
+ public void test() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[1];
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("소설 무궁화꽃이 피었습니다.");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(5, tokens.size());
+ Assert.assertEquals("소설 무궁 화 꽃 피", getString(tokens));
+
+ udf.close();
+ }
+
+ @Test
+ public void testNullUserList() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("I enjoy C++ programming.");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(4, tokens.size());
+ Assert.assertEquals("i enjoy c programming", getString(tokens));
+
+ udf.close();
+ }
+
+ @Test
+ public void testNullMode() throws UDFArgumentException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[3];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, null);
+ udf.initialize(argOIs);
+ udf.close();
+ }
+
+ @Test
+ public void testMode() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[3];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text("mixed"));
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("소설 무궁화꽃이 피었습니다.");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(6, tokens.size());
+ Assert.assertEquals("소설 무궁화 무궁 화 꽃 피", getString(tokens));
+
+ udf.close();
+ }
+
+ @Test(expected = UDFArgumentException.class)
+ public void testInvalidMode() throws IOException, HiveException {
+ ObjectInspector[] argOIs = new ObjectInspector[3];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text("unsupported mode"));
+ udf.initialize(argOIs);
+ udf.close();
+ }
+
+ @Test
+ public void testNonnullUserList() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[3];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("C++"));
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, new Text("mixed"));
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("I enjoy C++ programming.");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(4, tokens.size());
+ Assert.assertEquals("i enjoy c++ programming", getString(tokens));
+
+ udf.close();
+ }
+
+ @Test
+ public void testStopTags() throws HiveException, IOException {
+ ObjectInspector[] argOIs = new ObjectInspector[4];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // userDict
+ argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+ // mode
+ PrimitiveTypeInfo stringType = new PrimitiveTypeInfo();
+ stringType.setTypeName("string");
+ argOIs[2] = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+ stringType, null);
+ // stopTags
+ argOIs[3] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "VV"));
+ udf.initialize(argOIs);
+
+ GenericUDF.DeferredObject[] args = new GenericUDF.DeferredObject[1];
+ args[0] = new GenericUDF.DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text("소설 무궁화꽃이 피었습니다.");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+
+ Assert.assertNotNull(tokens);
+ Assert.assertEquals(5, tokens.size());
+ Assert.assertEquals("소설 무궁 화 꽃 이", getString(tokens));
+
+ udf.close();
+ }
+
+ /**
+  * Passes {@code ["E", "?"]} as the constant stopTags argument and expects a
+  * {@link UDFArgumentException}. Note that {@code udf.close()} is reached only if
+  * {@code initialize} returns normally.
+  */
+ @Test(expected = UDFArgumentException.class)
+ public void testInvalidStopTag() throws UDFArgumentException, IOException {
+     // 1st argument: the line to tokenize
+     ObjectInspector lineOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+     // 2nd argument: userDict, a constant list whose value is null
+     ObjectInspector userDictOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+     // 3rd argument: mode, a constant string whose value is null
+     PrimitiveTypeInfo stringTypeInfo = new PrimitiveTypeInfo();
+     stringTypeInfo.setTypeName("string");
+     ObjectInspector modeOI =
+             PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+                 stringTypeInfo, null);
+     // 4th argument: stopTags, a constant list containing the tag "?"
+     ObjectInspector stopTagsOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, Arrays.asList("E", "?"));
+
+     udf.initialize(new ObjectInspector[] {lineOI, userDictOI, modeOI, stopTagsOI});
+     udf.close();
+ }
+
+ /**
+  * Tokenizes "Hello, world." with the constant outputUnknownUnigrams argument set
+  * to true; userDict, mode and stopTags are supplied as null constants.
+  * Expects ten single-character tokens.
+  */
+ @Test
+ public void testOutputUnknownUnigramsTrue() throws HiveException, IOException {
+     // 1st argument: the line to tokenize
+     ObjectInspector lineOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+     // 2nd argument: userDict, a constant list whose value is null
+     ObjectInspector userDictOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+     // 3rd argument: mode, a constant string whose value is null
+     PrimitiveTypeInfo stringTypeInfo = new PrimitiveTypeInfo();
+     stringTypeInfo.setTypeName("string");
+     ObjectInspector modeOI =
+             PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+                 stringTypeInfo, null);
+     // 4th argument: stopTags, a constant list whose value is null
+     ObjectInspector stopTagsOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+     // 5th argument: outputUnknownUnigrams, a constant boolean set to true
+     PrimitiveTypeInfo booleanTypeInfo = new PrimitiveTypeInfo();
+     booleanTypeInfo.setTypeName("boolean");
+     ObjectInspector unigramsOI =
+             PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+                 booleanTypeInfo, new BooleanWritable(true));
+     udf.initialize(
+         new ObjectInspector[] {lineOI, userDictOI, modeOI, stopTagsOI, unigramsOI});
+
+     // Only the line is passed at evaluation time; the other arguments are constants.
+     GenericUDF.DeferredObject line = new GenericUDF.DeferredObject() {
+         @Override
+         public void prepare(int arg) throws HiveException {
+         }
+
+         public Text get() throws HiveException {
+             return new Text("Hello, world.");
+         }
+     };
+     List<Text> tokens = udf.evaluate(new GenericUDF.DeferredObject[] {line});
+
+     Assert.assertNotNull(tokens);
+     Assert.assertEquals(10, tokens.size());
+     Assert.assertEquals("h e l l o w o r l d", getString(tokens));
+
+     udf.close();
+ }
+
+ /**
+  * Tokenizes "Hello, world." with the constant outputUnknownUnigrams argument set
+  * to false; userDict, mode and stopTags are supplied as null constants.
+  * Expects the two tokens "hello" and "world".
+  */
+ @Test
+ public void testOutputUnknownUnigramsFalse() throws HiveException, IOException {
+     // 1st argument: the line to tokenize
+     ObjectInspector lineOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+     // 2nd argument: userDict, a constant list whose value is null
+     ObjectInspector userDictOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+     // 3rd argument: mode, a constant string whose value is null
+     PrimitiveTypeInfo stringTypeInfo = new PrimitiveTypeInfo();
+     stringTypeInfo.setTypeName("string");
+     ObjectInspector modeOI =
+             PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+                 stringTypeInfo, null);
+     // 4th argument: stopTags, a constant list whose value is null
+     ObjectInspector stopTagsOI = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+         PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+     // 5th argument: outputUnknownUnigrams, a constant boolean set to false
+     PrimitiveTypeInfo booleanTypeInfo = new PrimitiveTypeInfo();
+     booleanTypeInfo.setTypeName("boolean");
+     ObjectInspector unigramsOI =
+             PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(
+                 booleanTypeInfo, new BooleanWritable(false));
+     udf.initialize(
+         new ObjectInspector[] {lineOI, userDictOI, modeOI, stopTagsOI, unigramsOI});
+
+     // Only the line is passed at evaluation time; the other arguments are constants.
+     GenericUDF.DeferredObject line = new GenericUDF.DeferredObject() {
+         @Override
+         public void prepare(int arg) throws HiveException {
+         }
+
+         public Text get() throws HiveException {
+             return new Text("Hello, world.");
+         }
+     };
+     List<Text> tokens = udf.evaluate(new GenericUDF.DeferredObject[] {line});
+
+     Assert.assertNotNull(tokens);
+     Assert.assertEquals(2, tokens.size());
+     Assert.assertEquals("hello world", getString(tokens));
+
+     udf.close();
+ }
+
+ private static String getString(List<Text> tokens) {
+ StringBuilder sb = new StringBuilder();
+ for (Text token : tokens) {
+ sb.append(token.toString()).append(" ");
+ }
+ return sb.toString().trim();
+ }
+
+}
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 209f161..91e0693 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -915,6 +915,9 @@ CREATE FUNCTION tokenize_ja_neologd as 'hivemall.nlp.tokenizer.KuromojiNEologdUD
DROP FUNCTION IF EXISTS tokenize_cn;
CREATE FUNCTION tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF' USING JAR '${hivemall_jar}';
+DROP FUNCTION IF EXISTS tokenize_ko;
+CREATE FUNCTION tokenize_ko as 'hivemall.nlp.tokenizer.TokenizeKoUDF' USING JAR '${hivemall_jar}';
+
DROP FUNCTION IF EXISTS stoptags_exclude;
CREATE FUNCTION stoptags_exclude as 'hivemall.nlp.tokenizer.StoptagsExcludeUDF' USING JAR '${hivemall_jar}';
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 0e89507..c6c1a20 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -907,6 +907,9 @@ create temporary function tokenize_ja_neologd as 'hivemall.nlp.tokenizer.Kuromoj
drop temporary function if exists tokenize_cn;
create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
+drop temporary function if exists tokenize_ko;
+create temporary function tokenize_ko as 'hivemall.nlp.tokenizer.TokenizeKoUDF';
+
drop temporary function if exists stoptags_exclude;
create temporary function stoptags_exclude as 'hivemall.nlp.tokenizer.StoptagsExcludeUDF';
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 6eb2921..c9e7efc 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -893,6 +893,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_ja_neologd AS 'hivemall.nlp.t
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tokenize_cn")
sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_cn AS 'hivemall.nlp.tokenizer.SmartcnUDF'")
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tokenize_ko")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize_ko AS 'hivemall.nlp.tokenizer.TokenizeKoUDF'")
+
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS stoptags_exclude")
sqlContext.sql("CREATE TEMPORARY FUNCTION stoptags_exclude AS 'hivemall.nlp.tokenizer.StoptagsExcludeUDF'")