You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by ta...@apache.org on 2017/07/01 13:14:25 UTC

[1/5] incubator-hivemall git commit: Added tokenize_cn UDF, using org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 9876d0631 -> ec6d945fe


Added tokenize_cn UDF, using org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5eb80373
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5eb80373
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5eb80373

Branch: refs/heads/master
Commit: 5eb80373641920428d9f95f54b726995e89e8443
Parents: 9f01ebf
Author: partyyoung <pa...@126.com>
Authored: Thu Jun 29 18:32:05 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Thu Jun 29 18:32:05 2017 +0800

----------------------------------------------------------------------
 nlp/pom.xml                                     |  13 ++
 .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 137 +++++++++++++++++++
 .../hivemall/nlp/tokenizer/SmartcnUDFTest.java  |  85 ++++++++++++
 3 files changed, 235 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/pom.xml
----------------------------------------------------------------------
diff --git a/nlp/pom.xml b/nlp/pom.xml
index b6ea409..021cd6d 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -117,6 +117,12 @@
 			<version>5.3.1</version>
 			<scope>compile</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-analyzers-smartcn</artifactId>
+			<version>5.3.1</version>
+			<scope>compile</scope>
+		</dependency>
 
 		<!-- test scope -->
 		<dependency>
@@ -171,6 +177,7 @@
 								<includes>
 									<include>io.github.myui:hivemall-core</include>
 									<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
+									<include>org.apache.lucene:lucene-analyzers-smartcn</include>
 									<include>org.apache.lucene:lucene-analyzers-common</include>
 									<include>org.apache.lucene:lucene-core</include>
 								</includes>
@@ -183,6 +190,12 @@
 									</includes>
 								</filter>
 								<filter>
+									<artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
+									<includes>
+										<include>**</include>
+									</includes>
+								</filter>
+								<filter>
 									<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
 									<includes>
 										<include>**</include>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
new file mode 100644
index 0000000..3d148c9
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.IOUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+@Description(
+        name = "tokenize_cn",
+        value = "_FUNC_(String line [, const list<string> stopWords])"
+                + " - returns tokenized strings in array<string>")
+@UDFType(deterministic = true, stateful = false)
+public final class SmartcnUDF extends GenericUDF {
+
+    private String[] _stopWordsArray;
+
+    private transient SmartChineseAnalyzer _analyzer;
+
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+        final int arglen = arguments.length;
+        if (arglen < 1 || arglen > 2) {
+            throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: "
+                    + arglen);
+        }
+
+        this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null;
+        this._analyzer = null;
+
+        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+    }
+
+    @Override
+    public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+        SmartChineseAnalyzer analyzer = _analyzer;
+        if (analyzer == null) {
+			CharArraySet stopwords = stopWords(_stopWordsArray);
+            analyzer= new SmartChineseAnalyzer(stopwords);
+            this._analyzer = analyzer;
+        }
+
+        Object arg0 = arguments[0].get();
+        if (arg0 == null) {
+            return null;
+        }
+        String line = arg0.toString();
+
+        final List<Text> results = new ArrayList<Text>(32);
+        TokenStream stream = null;
+        try {
+            stream = analyzer.tokenStream("", line);
+            if (stream != null) {
+                analyzeTokens(stream, results);
+            }
+        } catch (IOException e) {
+            IOUtils.closeQuietly(analyzer);
+            throw new HiveException(e);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+        return results;
+    }
+
+    @Override
+    public void close() throws IOException {
+        IOUtils.closeQuietly(_analyzer);
+    }
+
+
+    @Nonnull
+    private static CharArraySet stopWords(@Nonnull final String[] array)
+            throws UDFArgumentException {
+        if (array == null) {
+            return SmartChineseAnalyzer.getDefaultStopSet();
+        }
+        if (array.length == 0) {
+            return CharArraySet.EMPTY_SET;
+        }
+        CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+        return results;
+    }
+
+    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+            throws IOException {
+        // instantiate an attribute placeholder once
+        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        stream.reset();
+
+        while (stream.incrementToken()) {
+            String term = termAttr.toString();
+            results.add(new Text(term));
+        }
+    }
+
+    @Override
+    public String getDisplayString(String[] children) {
+        return "tokenize_cn(" + Arrays.toString(children) + ')';
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
new file mode 100644
index 0000000..720e532
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class SmartcnUDFTest {
+
+	@Test
+	public void testOneArgment() throws UDFArgumentException, IOException {
+		GenericUDF udf = new SmartcnUDF();
+		ObjectInspector[] argOIs = new ObjectInspector[1];
+		// line
+		argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+		udf.initialize(argOIs);
+		udf.close();
+	}
+
+	@Test
+	public void testTwoArgment() throws UDFArgumentException, IOException {
+		GenericUDF udf = new SmartcnUDF();
+		ObjectInspector[] argOIs = new ObjectInspector[2];
+		// line
+		argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+		// stopWords
+		argOIs[1] = ObjectInspectorFactory
+				.getStandardConstantListObjectInspector(
+						PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+						null);
+		udf.initialize(argOIs);
+		udf.close();
+	}
+
+	@Test
+	public void testEvalauteOneRow() throws IOException, HiveException {
+		SmartcnUDF udf = new SmartcnUDF();
+		ObjectInspector[] argOIs = new ObjectInspector[1];
+		// line
+		argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+		udf.initialize(argOIs);
+
+		DeferredObject[] args = new DeferredObject[1];
+		args[0] = new DeferredObject() {
+			public Text get() throws HiveException {
+				return new Text(
+						"Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
+			}
+
+			@Override
+			public void prepare(int arg) throws HiveException {
+			}
+		};
+		List<Text> tokens = udf.evaluate(args);
+		Assert.assertNotNull(tokens);
+		udf.close();
+	}
+}


[4/5] incubator-hivemall git commit: fixed some typos about NLP module

Posted by ta...@apache.org.
fixed some typos about NLP module


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/efc3a6de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/efc3a6de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/efc3a6de

Branch: refs/heads/master
Commit: efc3a6deecdc65eebf6946c6b1efb253debdca1b
Parents: e24c4fc
Author: partyyoung <pa...@126.com>
Authored: Sat Jul 1 19:13:13 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Sat Jul 1 19:13:13 2017 +0800

----------------------------------------------------------------------
 docs/gitbook/misc/tokenizer.md                  |  16 +-
 .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 167 ++++++++++---------
 .../hivemall/nlp/tokenizer/KuromojiUDFTest.java |  12 +-
 .../hivemall/nlp/tokenizer/SmartcnUDFTest.java  |   6 +-
 resources/ddl/define-udfs.td.hql                |   1 +
 5 files changed, 102 insertions(+), 100 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/docs/gitbook/misc/tokenizer.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index a2d3820..99f281d 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -24,9 +24,9 @@ Hivemall provides simple English text tokenizer UDF that has following syntax:
 tokenize(text input, optional boolean toLowerCase = false)
 ```
 
-# Tokenizer for Japanese Texts
+# Tokenizer for Non-English Texts
 
-Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). 
+Hivemall-NLP module provides some Non-English Text tokenizer UDFs as follows.
 
 First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases).
 
@@ -34,6 +34,10 @@ First of all, you need to issue the following DDLs to use the NLP module. Note N
 
 > source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
 
+## Japanese Tokenizer
+
+Japanese text tokenizer UDF uses [Kuromoji](https://github.com/atilika/kuromoji). 
+
 The signature of the UDF is as follows:
 ```sql
 tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags)
@@ -48,13 +52,9 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第
 
 For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
 
-# Tokenizer for Chinese Texts
-
-Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
+## Chinese Tokenizer
 
-> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
-
-> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
+Chinese text tokenizer UDF uses [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
 
 The signature of the UDF is as follows:
 ```sql

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
index 3d148c9..a016c7e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -42,96 +42,97 @@ import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 
-@Description(
-        name = "tokenize_cn",
-        value = "_FUNC_(String line [, const list<string> stopWords])"
-                + " - returns tokenized strings in array<string>")
+@Description(name = "tokenize_cn", value = "_FUNC_(String line [, const list<string> stopWords])"
+		+ " - returns tokenized strings in array<string>")
 @UDFType(deterministic = true, stateful = false)
 public final class SmartcnUDF extends GenericUDF {
 
-    private String[] _stopWordsArray;
+	private String[] _stopWordsArray;
 
-    private transient SmartChineseAnalyzer _analyzer;
+	private transient SmartChineseAnalyzer _analyzer;
 
-    @Override
-    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
-        final int arglen = arguments.length;
-        if (arglen < 1 || arglen > 2) {
-            throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: "
-                    + arglen);
-        }
+	@Override
+	public ObjectInspector initialize(ObjectInspector[] arguments)
+			throws UDFArgumentException {
+		final int arglen = arguments.length;
+		if (arglen < 1 || arglen > 2) {
+			throw new UDFArgumentException(
+					"Invalid number of arguments for `tokenize_cn`: " + arglen);
+		}
 
-        this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null;
-        this._analyzer = null;
+		this._stopWordsArray = (arglen >= 2) ? HiveUtils
+				.getConstStringArray(arguments[1]) : null;
+		this._analyzer = null;
 
-        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
-    }
+		return ObjectInspectorFactory
+				.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+	}
 
-    @Override
-    public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
-        SmartChineseAnalyzer analyzer = _analyzer;
-        if (analyzer == null) {
+	@Override
+	public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+		SmartChineseAnalyzer analyzer = _analyzer;
+		if (analyzer == null) {
 			CharArraySet stopwords = stopWords(_stopWordsArray);
-            analyzer= new SmartChineseAnalyzer(stopwords);
-            this._analyzer = analyzer;
-        }
-
-        Object arg0 = arguments[0].get();
-        if (arg0 == null) {
-            return null;
-        }
-        String line = arg0.toString();
-
-        final List<Text> results = new ArrayList<Text>(32);
-        TokenStream stream = null;
-        try {
-            stream = analyzer.tokenStream("", line);
-            if (stream != null) {
-                analyzeTokens(stream, results);
-            }
-        } catch (IOException e) {
-            IOUtils.closeQuietly(analyzer);
-            throw new HiveException(e);
-        } finally {
-            IOUtils.closeQuietly(stream);
-        }
-        return results;
-    }
-
-    @Override
-    public void close() throws IOException {
-        IOUtils.closeQuietly(_analyzer);
-    }
-
-
-    @Nonnull
-    private static CharArraySet stopWords(@Nonnull final String[] array)
-            throws UDFArgumentException {
-        if (array == null) {
-            return SmartChineseAnalyzer.getDefaultStopSet();
-        }
-        if (array.length == 0) {
-            return CharArraySet.EMPTY_SET;
-        }
-        CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
-        return results;
-    }
-
-    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
-            throws IOException {
-        // instantiate an attribute placeholder once
-        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
-        stream.reset();
-
-        while (stream.incrementToken()) {
-            String term = termAttr.toString();
-            results.add(new Text(term));
-        }
-    }
-
-    @Override
-    public String getDisplayString(String[] children) {
-        return "tokenize_cn(" + Arrays.toString(children) + ')';
-    }
-
+			analyzer = new SmartChineseAnalyzer(stopwords);
+			this._analyzer = analyzer;
+		}
+
+		Object arg0 = arguments[0].get();
+		if (arg0 == null) {
+			return null;
+		}
+		String line = arg0.toString();
+
+		final List<Text> results = new ArrayList<Text>(32);
+		TokenStream stream = null;
+		try {
+			stream = analyzer.tokenStream("", line);
+			if (stream != null) {
+				analyzeTokens(stream, results);
+			}
+		} catch (IOException e) {
+			IOUtils.closeQuietly(analyzer);
+			throw new HiveException(e);
+		} finally {
+			IOUtils.closeQuietly(stream);
+		}
+		return results;
+	}
+
+	@Override
+	public void close() throws IOException {
+		IOUtils.closeQuietly(_analyzer);
+	}
+
+	@Nonnull
+	private static CharArraySet stopWords(@Nonnull final String[] array)
+			throws UDFArgumentException {
+		if (array == null) {
+			return SmartChineseAnalyzer.getDefaultStopSet();
+		}
+		if (array.length == 0) {
+			return CharArraySet.EMPTY_SET;
+		}
+		CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */
+				true);
+		return results;
+	}
+
+	private static void analyzeTokens(@Nonnull TokenStream stream,
+			@Nonnull List<Text> results) throws IOException {
+		// instantiate an attribute placeholder once
+		CharTermAttribute termAttr = stream
+				.getAttribute(CharTermAttribute.class);
+		stream.reset();
+
+		while (stream.incrementToken()) {
+			String term = termAttr.toString();
+			results.add(new Text(term));
+		}
+	}
+
+	@Override
+	public String getDisplayString(String[] children) {
+		return "tokenize_cn(" + Arrays.toString(children) + ')';
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
index 005e689..7bbaed7 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java
@@ -40,7 +40,7 @@ import com.esotericsoftware.kryo.io.Output;
 public class KuromojiUDFTest {
 
     @Test
-    public void testOneArgment() throws UDFArgumentException, IOException {
+    public void testOneArgument() throws UDFArgumentException, IOException {
         GenericUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[1];
         // line
@@ -50,7 +50,7 @@ public class KuromojiUDFTest {
     }
 
     @Test
-    public void testTwoArgment() throws UDFArgumentException, IOException {
+    public void testTwoArgument() throws UDFArgumentException, IOException {
         GenericUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[2];
         // line
@@ -94,7 +94,7 @@ public class KuromojiUDFTest {
     }
 
     @Test
-    public void testThreeArgment() throws UDFArgumentException, IOException {
+    public void testThreeArgument() throws UDFArgumentException, IOException {
         GenericUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[3];
         // line
@@ -112,7 +112,7 @@ public class KuromojiUDFTest {
     }
 
     @Test
-    public void testFourArgment() throws UDFArgumentException, IOException {
+    public void testFourArgument() throws UDFArgumentException, IOException {
         GenericUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[4];
         // line
@@ -133,7 +133,7 @@ public class KuromojiUDFTest {
     }
 
     @Test
-    public void testEvalauteOneRow() throws IOException, HiveException {
+    public void testEvaluateOneRow() throws IOException, HiveException {
         KuromojiUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[1];
         // line
@@ -156,7 +156,7 @@ public class KuromojiUDFTest {
     }
 
     @Test
-    public void testEvalauteTwoRows() throws IOException, HiveException {
+    public void testEvaluateTwoRows() throws IOException, HiveException {
         KuromojiUDF udf = new KuromojiUDF();
         ObjectInspector[] argOIs = new ObjectInspector[1];
         // line

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
index 720e532..67c2283 100644
--- a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -35,7 +35,7 @@ import org.junit.Test;
 public class SmartcnUDFTest {
 
 	@Test
-	public void testOneArgment() throws UDFArgumentException, IOException {
+	public void testOneArgument() throws UDFArgumentException, IOException {
 		GenericUDF udf = new SmartcnUDF();
 		ObjectInspector[] argOIs = new ObjectInspector[1];
 		// line
@@ -45,7 +45,7 @@ public class SmartcnUDFTest {
 	}
 
 	@Test
-	public void testTwoArgment() throws UDFArgumentException, IOException {
+	public void testTwoArgument() throws UDFArgumentException, IOException {
 		GenericUDF udf = new SmartcnUDF();
 		ObjectInspector[] argOIs = new ObjectInspector[2];
 		// line
@@ -60,7 +60,7 @@ public class SmartcnUDFTest {
 	}
 
 	@Test
-	public void testEvalauteOneRow() throws IOException, HiveException {
+	public void testEvaluateOneRow() throws IOException, HiveException {
 		SmartcnUDF udf = new SmartcnUDF();
 		ObjectInspector[] argOIs = new ObjectInspector[1];
 		// line

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/efc3a6de/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 1d11d1a..953a6ac 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -176,6 +176,7 @@ create temporary function train_regression as 'hivemall.regression.GeneralRegres
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
+create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
 
 -- Backward compatibilities
 create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';


[5/5] incubator-hivemall git commit: Merge #91: [HIVEMALL-122] Added tokenize_cn UDF

Posted by ta...@apache.org.
Merge #91: [HIVEMALL-122] Added tokenize_cn UDF


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ec6d945f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ec6d945f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ec6d945f

Branch: refs/heads/master
Commit: ec6d945fe48d64cd5175a2ad0156b8bd387c6138
Parents: 9876d06 efc3a6d
Author: Takuya Kitazawa <k....@gmail.com>
Authored: Sat Jul 1 06:13:00 2017 -0700
Committer: Takuya Kitazawa <k....@gmail.com>
Committed: Sat Jul 1 06:13:00 2017 -0700

----------------------------------------------------------------------
 docs/gitbook/SUMMARY.md                         |   2 +-
 docs/gitbook/misc/tokenizer.md                  |  27 +++-
 nlp/pom.xml                                     |  13 ++
 .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 138 +++++++++++++++++++
 .../hivemall/nlp/tokenizer/KuromojiUDFTest.java |  12 +-
 .../hivemall/nlp/tokenizer/SmartcnUDFTest.java  |  85 ++++++++++++
 resources/ddl/define-additional.hive            |   3 +
 resources/ddl/define-udfs.td.hql                |   1 +
 8 files changed, 271 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ec6d945f/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ec6d945f/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------


[2/5] incubator-hivemall git commit: Added usage of tokenize_cn

Posted by ta...@apache.org.
Added usage of tokenize_cn


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1f819536
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1f819536
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1f819536

Branch: refs/heads/master
Commit: 1f819536b294920d5629d59b4524f2a6d6a0d014
Parents: 5eb8037
Author: partyyoung <pa...@126.com>
Authored: Fri Jun 30 17:49:59 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Fri Jun 30 17:49:59 2017 +0800

----------------------------------------------------------------------
 docs/gitbook/misc/tokenizer.md       | 23 ++++++++++++++++++++++-
 resources/ddl/define-additional.hive |  3 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1f819536/docs/gitbook/misc/tokenizer.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 47f07e0..a2d3820 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -46,4 +46,25 @@ select tokenize_ja("kuromojiを使った分かち書きのテストです。第
 ```
 > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
 
-For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
\ No newline at end of file
+For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well.
+
+# Tokenizer for Chinese Texts
+
+Hivemall-NLP module provides a Chinese text tokenizer UDF using [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html). 
+
+> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases);
+
+> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases);
+
+The signature of the UDF is as follows:
+```sql
+tokenize_cn(string line, optional const array<string> stopWords)
+```
+
+Its basic usage is as follows:
+```sql
+select tokenize_cn("Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
+```
+> [smartcn, 为, apach, 2, 0, 协议, 的, 开源, 中文, 分词, 系统, java, 语言, 编写, 修改, 的, 中科院, 计算, 所, ictcla, 分词, 系统]
+
+For detailed APIs, please refer Javadoc of [SmartChineseAnalyzer](http://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html) as well.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1f819536/resources/ddl/define-additional.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-additional.hive b/resources/ddl/define-additional.hive
index 7bbfcf4..af5cf82 100644
--- a/resources/ddl/define-additional.hive
+++ b/resources/ddl/define-additional.hive
@@ -9,6 +9,9 @@
 drop temporary function if exists tokenize_ja;
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
 
+drop temporary function if exists tokenize_cn;
+create temporary function tokenize_cn as 'hivemall.nlp.tokenizer.SmartcnUDF';
+
 ------------------------------
 -- XGBoost related features --
 ------------------------------


[3/5] incubator-hivemall git commit: the user guide: sec <2.3.English/Japanese Text Tokenizer> be renamed as <2.3.Text Tokenizer>

Posted by ta...@apache.org.
the user guide: sec <2.3.English/Japanese Text Tokenizer> be renamed as <2.3.Text Tokenizer>


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e24c4fcc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e24c4fcc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e24c4fcc

Branch: refs/heads/master
Commit: e24c4fcc7c76d78ca0d8f2a18a5e7316318d0819
Parents: 1f81953
Author: partyyoung <pa...@126.com>
Authored: Fri Jun 30 17:59:08 2017 +0800
Committer: partyyoung <pa...@126.com>
Committed: Fri Jun 30 17:59:08 2017 +0800

----------------------------------------------------------------------
 docs/gitbook/SUMMARY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e24c4fcc/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 32b0150..b9d4e8a 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -49,7 +49,7 @@
 
 * [List of generic Hivemall functions](misc/generic_funcs.md)
 * [Efficient Top-K query processing](misc/topk.md)
-* [English/Japanese Text Tokenizer](misc/tokenizer.md)
+* [Text Tokenizer](misc/tokenizer.md)
 
 ## Part III - Feature Engineering