You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by ta...@apache.org on 2017/07/01 13:14:25 UTC
[1/5] incubator-hivemall git commit: Added tokenize_cn UDF,
using org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer
Repository: incubator-hivemall
Updated Branches:
refs/heads/master 9876d0631 -> ec6d945fe
Added tokenize_cn UDF, using org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5eb80373
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5eb80373
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5eb80373
Branch: refs/heads/master
Commit: 5eb80373641920428d9f95f54b726995e89e8443
Parents: 9f01ebf
Author: partyyoung <pa...@126.com>
Authored: Thu Jun 29 18:32:05 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Thu Jun 29 18:32:05 2017 +0800
----------------------------------------------------------------------
nlp/pom.xml | 13 ++
.../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 137 +++++++++++++++++++
.../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 85 ++++++++++++
3 files changed, 235 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/pom.xml
----------------------------------------------------------------------
diff --git a/nlp/pom.xml b/nlp/pom.xml
index b6ea409..021cd6d 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -117,6 +117,12 @@
<version>5.3.1</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-smartcn</artifactId>
+ <version>5.3.1</version>
+ <scope>compile</scope>
+ </dependency>
<!-- test scope -->
<dependency>
@@ -171,6 +177,7 @@
<includes>
<include>io.github.myui:hivemall-core</include>
<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
+ <include>org.apache.lucene:lucene-analyzers-smartcn</include>
<include>org.apache.lucene:lucene-analyzers-common</include>
<include>org.apache.lucene:lucene-core</include>
</includes>
@@ -183,6 +190,12 @@
</includes>
</filter>
<filter>
+ <artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
+ <includes>
+ <include>**</include>
+ </includes>
+ </filter>
+ <filter>
<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
<includes>
<include>**</include>
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
new file mode 100644
index 0000000..3d148c9
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.IOUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+@Description(
+ name = "tokenize_cn",
+ value = "_FUNC_(String line [, const list<string> stopWords])"
+ + " - returns tokenized strings in array<string>")
+@UDFType(deterministic = true, stateful = false)
+public final class SmartcnUDF extends GenericUDF {
+
+ private String[] _stopWordsArray;
+
+ private transient SmartChineseAnalyzer _analyzer;
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+ final int arglen = arguments.length;
+ if (arglen < 1 || arglen > 2) {
+ throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: "
+ + arglen);
+ }
+
+ this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null;
+ this._analyzer = null;
+
+ return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+ }
+
+ @Override
+ public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+ SmartChineseAnalyzer analyzer = _analyzer;
+ if (analyzer == null) {
+ CharArraySet stopwords = stopWords(_stopWordsArray);
+ analyzer= new SmartChineseAnalyzer(stopwords);
+ this._analyzer = analyzer;
+ }
+
+ Object arg0 = arguments[0].get();
+ if (arg0 == null) {
+ return null;
+ }
+ String line = arg0.toString();
+
+ final List<Text> results = new ArrayList<Text>(32);
+ TokenStream stream = null;
+ try {
+ stream = analyzer.tokenStream("", line);
+ if (stream != null) {
+ analyzeTokens(stream, results);
+ }
+ } catch (IOException e) {
+ IOUtils.closeQuietly(analyzer);
+ throw new HiveException(e);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return results;
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.closeQuietly(_analyzer);
+ }
+
+
+ @Nonnull
+ private static CharArraySet stopWords(@Nonnull final String[] array)
+ throws UDFArgumentException {
+ if (array == null) {
+ return SmartChineseAnalyzer.getDefaultStopSet();
+ }
+ if (array.length == 0) {
+ return CharArraySet.EMPTY_SET;
+ }
+ CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+ return results;
+ }
+
+ private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+ throws IOException {
+ // instantiate an attribute placeholder once
+ CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+ stream.reset();
+
+ while (stream.incrementToken()) {
+ String term = termAttr.toString();
+ results.add(new Text(term));
+ }
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return "tokenize_cn(" + Arrays.toString(children) + ')';
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
new file mode 100644
index 0000000..720e532
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class SmartcnUDFTest {
+
+ @Test
+ public void testOneArgment() throws UDFArgumentException, IOException {
+ GenericUDF udf = new SmartcnUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[1];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ udf.initialize(argOIs);
+ udf.close();
+ }
+
+ @Test
+ public void testTwoArgment() throws UDFArgumentException, IOException {
+ GenericUDF udf = new SmartcnUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[2];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ // stopWords
+ argOIs[1] = ObjectInspectorFactory
+ .getStandardConstantListObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+ null);
+ udf.initialize(argOIs);
+ udf.close();
+ }
+
+ @Test
+ public void testEvalauteOneRow() throws IOException, HiveException {
+ SmartcnUDF udf = new SmartcnUDF();
+ ObjectInspector[] argOIs = new ObjectInspector[1];
+ // line
+ argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+ udf.initialize(argOIs);
+
+ DeferredObject[] args = new DeferredObject[1];
+ args[0] = new DeferredObject() {
+ public Text get() throws HiveException {
+ return new Text(
+ "Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
+ }
+
+ @Override
+ public void prepare(int arg) throws HiveException {
+ }
+ };
+ List<Text> tokens = udf.evaluate(args);
+ Assert.assertNotNull(tokens);
+ udf.close();
+ }
+}
[4/5] incubator-hivemall git commit: fixed some typos about NLP module
Posted by ta...@apache.org.
fixed some typos about NLP module
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/efc3a6de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/efc3a6de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/efc3a6de
Branch: refs/heads/master
Commit: efc3a6deecdc65eebf6946c6b1efb253debdca1b
Parents: e24c4fc
Author: partyyoung <pa...@126.com>
Authored: Sat Jul 1 19:13:13 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Sat Jul 1 19:13:13 2017 +0800
----------------------------------------------------------------------
docs/gitbook/misc/tokenizer.md | 16 +-
.../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 167 ++++++++++---------
.../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 12 +-
.../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 6 +-
resources/ddl/define-udfs.td.hql | 1 +
5 files changed, 102 insertions(+), 100 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/docs/gitbook/misc/tokenizer.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index a2d3820..99f281d 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -24,9 +24,9 @@ Hivemall provides simple English text tokenizer UDF that has following syntax:
tokenize(text input, optional boolean toLowerCase = false)
```
-# Tokenizer for Japanese Texts
+# Tokenizer for Non-English Texts
-Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji).
+Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows.
First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases).
@@ -34,6 +34,10 @@ First of all, you need to issue the following DDLs to use the NLP module. Note N
> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
+## Japanese Tokenizer
+
+Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji).
+
The signature of the UDF is as follows:
```sql
tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags)
@@ -48,13 +52,9 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第
For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
-# Tokenizer for Chinese Texts
-
-Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
+## Chinese Tokenizer
-> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
-
-> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
+Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
The signature of the UDF is as follows:
```sql
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
index 3d148c9..a016c7e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -42,96 +42,97 @@ import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
-@Description(
- name = "tokenize_cn",
- value = "_FUNC_(String line [, const list<string> stopWords])"
- + " - returns tokenized strings in array<string>")
+@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+ + " - returns tokenized strings in array<string>")
@UDFType(deterministic = true, stateful = false)
public final class SmartcnUDF extends GenericUDF {
- private String[] _stopWordsArray;
+ private String[] _stopWordsArray;
- private transient SmartChineseAnalyzer _analyzer;
+ private transient SmartChineseAnalyzer _analyzer;
- @Override
- public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
- final int arglen = arguments.length;
- if (arglen < 1 || arglen > 2) {
- throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: "
- + arglen);
- }
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments)
+ throws UDFArgumentException {
+ final int arglen = arguments.length;
+ if (arglen < 1 || arglen > 2) {
+ throw new UDFArgumentException(
+ "Invalid number of arguments for `tokenize_cn`: " + arglen);
+ }
- this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null;
- this._analyzer = null;
+ this._stopWordsArray = (arglen >= 2) ? HiveUtils
+ .getConstStringArray(arguments[1]) : null;
+ this._analyzer = null;
- return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
- }
+ return ObjectInspectorFactory
+ .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+ }
- @Override
- public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
- SmartChineseAnalyzer analyzer = _analyzer;
- if (analyzer == null) {
+ @Override
+ public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+ SmartChineseAnalyzer analyzer = _analyzer;
+ if (analyzer == null) {
CharArraySet stopwords = stopWords(_stopWordsArray);
- analyzer= new SmartChineseAnalyzer(stopwords);
- this._analyzer = analyzer;
- }
-
- Object arg0 = arguments[0].get();
- if (arg0 == null) {
- return null;
- }
- String line = arg0.toString();
-
- final List<Text> results = new ArrayList<Text>(32);
- TokenStream stream = null;
- try {
- stream = analyzer.tokenStream("", line);
- if (stream != null) {
- analyzeTokens(stream, results);
- }
- } catch (IOException e) {
- IOUtils.closeQuietly(analyzer);
- throw new HiveException(e);
- } finally {
- IOUtils.closeQuietly(stream);
- }
- return results;
- }
-
- @Override
- public void close() throws IOException {
- IOUtils.closeQuietly(_analyzer);
- }
-
-
- @Nonnull
- private static CharArraySet stopWords(@Nonnull final String[] array)
- throws UDFArgumentException {
- if (array == null) {
- return SmartChineseAnalyzer.getDefaultStopSet();
- }
- if (array.length == 0) {
- return CharArraySet.EMPTY_SET;
- }
- CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
- return results;
- }
-
- private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
- throws IOException {
- // instantiate an attribute placeholder once
- CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
- stream.reset();
-
- while (stream.incrementToken()) {
- String term = termAttr.toString();
- results.add(new Text(term));
- }
- }
-
- @Override
- public String getDisplayString(String[] children) {
- return "tokenize_cn(" + Arrays.toString(children) + ')';
- }
-
+ analyzer = new SmartChineseAnalyzer(stopwords);
+ this._analyzer = analyzer;
+ }
+
+ Object arg0 = arguments[0].get();
+ if (arg0 == null) {
+ return null;
+ }
+ String line = arg0.toString();
+
+ final List<Text> results = new ArrayList<Text>(32);
+ TokenStream stream = null;
+ try {
+ stream = analyzer.tokenStream("", line);
+ if (stream != null) {
+ analyzeTokens(stream, results);
+ }
+ } catch (IOException e) {
+ IOUtils.closeQuietly(analyzer);
+ throw new HiveException(e);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return results;
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.closeQuietly(_analyzer);
+ }
+
+ @Nonnull
+ private static CharArraySet stopWords(@Nonnull final String[] array)
+ throws UDFArgumentException {
+ if (array == null) {
+ return SmartChineseAnalyzer.getDefaultStopSet();
+ }
+ if (array.length == 0) {
+ return CharArraySet.EMPTY_SET;
+ }
+ CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */
+ true);
+ return results;
+ }
+
+ private static void analyzeTokens(@Nonnull TokenStream stream,
+ @Nonnull List<Text> results) throws IOException {
+ // instantiate an attribute placeholder once
+ CharTermAttribute termAttr = stream
+ .getAttribute(CharTermAttribute.class);
+ stream.reset();
+
+ while (stream.incrementToken()) {
+ String term = termAttr.toString();
+ results.add(new Text(term));
+ }
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return "tokenize_cn(" + Arrays.toString(children) + ')';
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 005e689..7bbaed7 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -40,7 +40,7 @@ import com.esotericsoftware.kryo.io.Output;
public class KuromojiUDFTest {
@Test
- public void testOneArgment() throws UDFArgumentException, IOException {
+ public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -50,7 +50,7 @@ public class KuromojiUDFTest {
}
@Test
- public void testTwoArgment() throws UDFArgumentException, IOException {
+ public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
@@ -94,7 +94,7 @@ public class KuromojiUDFTest {
}
@Test
- public void testThreeArgment() throws UDFArgumentException, IOException {
+ public void testThreeArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[3];
// line
@@ -112,7 +112,7 @@ public class KuromojiUDFTest {
}
@Test
- public void testFourArgment() throws UDFArgumentException, IOException {
+ public void testFourArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[4];
// line
@@ -133,7 +133,7 @@ public class KuromojiUDFTest {
}
@Test
- public void testEvalauteOneRow() throws IOException, HiveException {
+ public void testEvaluateOneRow() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -156,7 +156,7 @@ public class KuromojiUDFTest {
}
@Test
- public void testEvalauteTwoRows() throws IOException, HiveException {
+ public void testEvaluateTwoRows() throws IOException, HiveException {
KuromojiUDF udf = new KuromojiUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
index 720e532..67c2283 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -35,7 +35,7 @@ import org.junit.Test;
public class SmartcnUDFTest {
@Test
- public void testOneArgment() throws UDFArgumentException, IOException {
+ public void testOneArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
@@ -45,7 +45,7 @@ public class SmartcnUDFTest {
}
@Test
- public void testTwoArgment() throws UDFArgumentException, IOException {
+ public void testTwoArgument() throws UDFArgumentException, IOException {
GenericUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[2];
// line
@@ -60,7 +60,7 @@ public class SmartcnUDFTest {
}
@Test
- public void testEvalauteOneRow() throws IOException, HiveException {
+ public void testEvaluateOneRow() throws IOException, HiveException {
SmartcnUDF udf = new SmartcnUDF();
ObjectInspector[] argOIs = new ObjectInspector[1];
// line
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 1d11d1a..953a6ac 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres
-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
+create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
-- Backward compatibilities
create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
[5/5] incubator-hivemall git commit: Merge #91: [HIVEMALL-122] Added
tokenize_cn UDF
Posted by ta...@apache.org.
Merge #91: [HIVEMALL-122] Added tokenize_cn UDF
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ec6d945f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ec6d945f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ec6d945f
Branch: refs/heads/master
Commit: ec6d945fe48d64cd5175a2ad0156b8bd387c6138
Parents: 9876d06 efc3a6d
Author: Takuya Kitazawa <k....@gmail.com>
Authored: Sat Jul 1 06:13:00 2017 -0700
Committer: Takuya Kitazawa <k....@gmail.com>
Committed: Sat Jul 1 06:13:00 2017 -0700
----------------------------------------------------------------------
docs/gitbook/SUMMARY.md | 2 +-
docs/gitbook/misc/tokenizer.md | 27 +++-
nlp/pom.xml | 13 ++
.../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 138 +++++++++++++++++++
.../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 12 +-
.../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 85 ++++++++++++
resources/ddl/define-additional.hive | 3 +
resources/ddl/define-udfs.td.hql | 1 +
8 files changed, 271 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ec6d945f/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ec6d945f/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
[2/5] incubator-hivemall git commit: Added usage of tokenize_cn
Posted by ta...@apache.org.
Added usage of tokenize_cn
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1f819536
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1f819536
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1f819536
Branch: refs/heads/master
Commit: 1f819536b294920d5629d59b4524f2a6d6a0d014
Parents: 5eb8037
Author: partyyoung <pa...@126.com>
Authored: Fri Jun 30 17:49:59 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Fri Jun 30 17:49:59 2017 +0800
----------------------------------------------------------------------
docs/gitbook/misc/tokenizer.md | 23 ++++++++++++++++++++++-
resources/ddl/define-additional.hive | 3 +++
2 files changed, 25 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1f819536/docs/gitbook/misc/tokenizer.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 47f07e0..a2d3820 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -46,4 +46,25 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第
```
> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
-For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
\ No newline at end of file
+For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
+
+# Tokenizer for Chinese Texts
+
+Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html).
+
+> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
+
+> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
+
+The signature of the UDF is as follows:
+```sql
+tokenize_cn(string line, optional const array<string> stopWords)
+```
+
+It's basic usage is as follows:
+```sql
+select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
+```
+> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
+
+For detailed APIs, please refer Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1f819536/resources/ddl/define-additional.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-additional.hive b/resources/ddl/define-additional.hive
index 7bbfcf4..af5cf82 100644
--- a/resources/ddl/define-additional.hive
+++ b/resources/ddl/define-additional.hive
@@ -9,6 +9,9 @@
drop temporary function if exists tokenize_ja;
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
+drop temporary function if exists tokenize_cn;
+create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
+
------------------------------
-- XGBoost related features --
------------------------------
[3/5] incubator-hivemall git commit: the user guide: section
<2.3. English/Japanese Text Tokenizer> renamed to <2.3. Text Tokenizer>
Posted by ta...@apache.org.
the user guide: section <2.3. English/Japanese Text Tokenizer> renamed to <2.3. Text Tokenizer>
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e24c4fcc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e24c4fcc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e24c4fcc
Branch: refs/heads/master
Commit: e24c4fcc7c76d78ca0d8f2a18a5e7316318d0819
Parents: 1f81953
Author: partyyoung <pa...@126.com>
Authored: Fri Jun 30 17:59:08 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Fri Jun 30 17:59:08 2017 +0800
----------------------------------------------------------------------
docs/gitbook/SUMMARY.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e24c4fcc/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 32b0150..b9d4e8a 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -49,7 +49,7 @@
* [List of generic Hivemall functions](misc/generic_funcs.md)
* [Efficient Top-K query processing](misc/topk.md)
-* [English/Japanese Text Tokenizer](misc/tokenizer.md)
+* [Text Tokenizer](misc/tokenizer.md)
## Part III - Feature Engineering