You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by ta...@apache.org on 2017/09/22 06:58:19 UTC
[5/7] incubator-hivemall git commit: HIVEMALL-142: Implement `singularize`
HIVEMALL-142: Implement `singularize`
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/8df2e365
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/8df2e365
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/8df2e365
Branch: refs/heads/dev/v0.4.2
Commit: 8df2e36590e5e1d7937121519c76e706915daba4
Parents: e3b2728
Author: Takuya Kitazawa <k....@gmail.com>
Authored: Wed Sep 20 16:30:16 2017 +0900
Committer: Takuya Kitazawa <ta...@apache.org>
Committed: Fri Sep 22 15:49:02 2017 +0900
----------------------------------------------------------------------
.../hivemall/tools/text/SingularizeUDF.java | 169 +++++++++++++++++++
.../java/hivemall/utils/lang/StringUtils.java | 16 ++
.../hivemall/tools/text/SingularizeUDFTest.java | 71 ++++++++
resources/ddl/define-all-as-permanent.hive | 3 +
resources/ddl/define-all.hive | 3 +
resources/ddl/define-all.spark | 3 +
resources/ddl/define-udfs.td.hql | 1 +
7 files changed, 266 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
new file mode 100644
index 0000000..629dce2
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import hivemall.utils.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+
+// Inspired by
+// https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src/main/java/io/sundr/codegen/functions/Singularize.java
+// https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623
+/**
+ * Hive UDF that converts an English word to its singular form.
+ *
+ * <p>
+ * A word is resolved by the first step that applies: the uninflected/uncountable list, hyphenated
+ * compounds whose second chunk is a preposition, a trailing apostrophe, the irregular table, and
+ * finally the ordered regexp rules. The list and table lookups are exact (case-sensitive)
+ * matches, whereas the regexp rules are matched case-insensitively.
+ */
+@Description(name = "singularize",
+        value = "_FUNC_(string word) - Returns singular form of a given English word")
+@UDFType(deterministic = true, stateful = false)
+public final class SingularizeUDF extends UDF {
+
+    // sorted by an ascending (i.e., alphabetical) order for binary search
+    // plural preposition to detect compound words like "plural-preposition-something"
+    private static final String[] prepositions = new String[] {"about", "above", "across", "after",
+            "among", "around", "at", "athwart", "before", "behind", "below", "beneath", "beside",
+            "besides", "between", "betwixt", "beyond", "but", "by", "during", "except", "for",
+            "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over", "since",
+            "till", "to", "under", "until", "unto", "upon", "with"};
+    // uninflected or uncountable words
+    // must also stay sorted in ascending order because it is looked up by binary search
+    private static final String[] unchanged = new String[] {"advice", "bison", "bread", "bream",
+            "breeches", "britches", "butter", "carp", "chassis", "cheese", "christmas", "clippers",
+            "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "electricity",
+            "elk", "equipment", "flounder", "fruit", "furniture", "gallows", "garbage", "georgia",
+            "graffiti", "gravel", "happiness", "headquarters", "herpes", "high-jinks", "homework",
+            "information", "innings", "jackanapes", "ketchup", "knowledge", "love", "luggage",
+            "mackerel", "mathematics", "mayonnaise", "measles", "meat", "mews", "mumps", "mustard",
+            "news", "pincers", "pliers", "proceedings", "progress", "rabies", "research", "rice",
+            "salmon", "sand", "scissors", "series", "shears", "software", "species", "swine",
+            "swiss", "trout", "tuna", "understanding", "water", "whiting", "wildebeest"};
+
+    // irregular plural => singular
+    private static final Map<String, String> irregular = new HashMap<String, String>();
+    static {
+        irregular.put("atlantes", "atlas");
+        irregular.put("atlases", "atlas");
+        irregular.put("axes", "axe");
+        irregular.put("beeves", "beef");
+        irregular.put("brethren", "brother");
+        irregular.put("children", "child");
+        irregular.put("corpora", "corpus");
+        irregular.put("corpuses", "corpus");
+        irregular.put("ephemerides", "ephemeris");
+        irregular.put("feet", "foot");
+        irregular.put("ganglia", "ganglion");
+        irregular.put("geese", "goose");
+        irregular.put("genera", "genus");
+        irregular.put("genii", "genie");
+        // NOTE: never consulted, because "graffiti" is also listed in `unchanged`,
+        // which is checked before this table
+        irregular.put("graffiti", "graffito");
+        irregular.put("helves", "helve");
+        irregular.put("kine", "cow");
+        irregular.put("leaves", "leaf");
+        irregular.put("loaves", "loaf");
+        irregular.put("men", "man");
+        irregular.put("mongooses", "mongoose");
+        irregular.put("monies", "money");
+        irregular.put("moves", "move");
+        irregular.put("mythoi", "mythos");
+        irregular.put("numena", "numen");
+        irregular.put("occipita", "occiput");
+        irregular.put("octopodes", "octopus");
+        irregular.put("opera", "opus");
+        irregular.put("opuses", "opus");
+        irregular.put("our", "my");
+        irregular.put("oxen", "ox");
+        irregular.put("penes", "penis");
+        irregular.put("penises", "penis");
+        irregular.put("people", "person");
+        irregular.put("sexes", "sex");
+        irregular.put("soliloquies", "soliloquy");
+        irregular.put("teeth", "tooth");
+        irregular.put("testes", "testis");
+        irregular.put("trilbys", "trilby");
+        irregular.put("turves", "turf");
+        irregular.put("zoa", "zoon");
+    }
+
+    // (regexp, replacement) pairs; they are tried in this order and the first regexp that is
+    // found in the word decides the result
+    private static final List<String> rules = Arrays.asList(
+        "(quiz)zes$", "$1",
+        "(matr)ices$", "$1ix",
+        "(vert|ind)ices$", "$1ex",
+        "^(ox)en", "$1",
+        "(alias|status)$", "$1",
+        "(alias|status)es$", "$1",
+        "(octop|vir)us$", "$1us",
+        "(octop|vir)i$", "$1us",
+        "(cris|ax|test)es$", "$1is",
+        "(cris|ax|test)is$", "$1is",
+        "(shoe)s$", "$1",
+        "(o)es$", "$1",
+        "(bus)es$", "$1",
+        // `[ml]`, not `[m|l]`: inside a character class `|` is a literal character
+        "([ml])ice$", "$1ouse",
+        "(x|ch|ss|sh)es$", "$1",
+        "(m)ovies$", "$1ovie",
+        "(s)eries$", "$1eries",
+        "([^aeiouy]|qu)ies$", "$1y",
+        "([lr])ves$", "$1f",
+        "(tive)s$", "$1",
+        "(hive)s$", "$1",
+        "([^f])ves$", "$1fe",
+        "(^analy)sis$", "$1sis",
+        "(^analy)ses$", "$1sis",
+        // only `$1` is referenced: `$2` is the nested `(a)` group and would duplicate the
+        // letter (e.g., "psychoanalyses" => "psychoanalyasis")
+        "((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1sis",
+        "([ti])a$", "$1um",
+        "(n)ews$", "$1ews",
+        "(s|si|u)s$", "$1s",
+        "s$", "");
+
+    // regexps of `rules` compiled once at class initialization instead of on every call;
+    // rulePatterns[i] belongs to the replacement rules.get(2 * i + 1)
+    private static final Pattern[] rulePatterns = new Pattern[rules.size() / 2];
+    static {
+        for (int i = 0; i < rulePatterns.length; i++) {
+            rulePatterns[i] = Pattern.compile(rules.get(2 * i), Pattern.CASE_INSENSITIVE);
+        }
+    }
+
+    /**
+     * Returns the singular form of the given word.
+     *
+     * @param word English word, possibly in plural form; may be null
+     * @return singular form of the word, or null if the given word is null
+     */
+    @Nullable
+    public String evaluate(@Nullable String word) {
+        return singularize(word);
+    }
+
+    /**
+     * Singularizes a word by applying the first step that matches; see the inline comments for
+     * the order. A word that no step matches is returned as is.
+     *
+     * @param word word to singularize; may be null
+     * @return singular form, the word itself if nothing applies, or null for a null word
+     */
+    @Nullable
+    private static String singularize(@Nullable final String word) {
+        if (word == null) {
+            return null;
+        }
+
+        if (word.isEmpty()) {
+            return word;
+        }
+
+        // 1. uninflected or uncountable words are returned untouched
+        if (Arrays.binarySearch(unchanged, word) >= 0) {
+            return word;
+        }
+
+        // 2. compound words (e.g., mothers-in-law): only the head chunk is singularized
+        if (word.contains("-")) {
+            final List<String> chunks = new ArrayList<String>();
+            Collections.addAll(chunks, word.split("-"));
+            if ((chunks.size() > 1) && (Arrays.binarySearch(prepositions, chunks.get(1)) >= 0)) {
+                final String head = chunks.remove(0);
+                return singularize(head) + "-" + StringUtils.join(chunks, "-");
+            }
+        }
+
+        // 3. plural possessive: dogs' => dog's
+        if (word.endsWith("'")) {
+            return singularize(word.substring(0, word.length() - 1)) + "'s";
+        }
+
+        // 4. irregular plurals; the table holds no null values, so a single lookup suffices
+        final String irregularSingular = irregular.get(word);
+        if (irregularSingular != null) {
+            return irregularSingular;
+        }
+
+        // 5. regexp rules in declaration order
+        for (int i = 0; i < rulePatterns.length; i++) {
+            final Matcher matcher = rulePatterns[i].matcher(word);
+            if (matcher.find()) {
+                return matcher.replaceAll(rules.get(2 * i + 1));
+            }
+        }
+
+        return word;
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/main/java/hivemall/utils/lang/StringUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/lang/StringUtils.java b/core/src/main/java/hivemall/utils/lang/StringUtils.java
index c2d17ca..87ad4fb 100644
--- a/core/src/main/java/hivemall/utils/lang/StringUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/StringUtils.java
@@ -182,6 +182,22 @@ public final class StringUtils {
return buf.toString();
}
+ @Nonnull
+ public static String join(@Nonnull final List<String> list, @Nonnull final String sep) {
+ final StringBuilder buf = new StringBuilder(128);
+ for (int i = 0, size = list.size(); i < size; i++) {
+ if (i > 0) { // append separator before each element, except for the head element
+ buf.append(sep);
+ }
+
+ final String s = list.get(i);
+ if (s != null) {
+ buf.append(s);
+ }
+ }
+ return buf.toString();
+ }
+
public static String[] split(final String str, final char separatorChar) {
return split(str, separatorChar, false);
}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
new file mode 100644
index 0000000..6ea9cc3
--- /dev/null
+++ b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests of {@link SingularizeUDF#evaluate(String)}, one test method per kind of input.
+ */
+public class SingularizeUDFTest {
+
+    // function under test; re-created before every test method
+    private SingularizeUDF singularizer;
+
+    @Before
+    public void setUp() {
+        singularizer = new SingularizeUDF();
+    }
+
+    // asserts that the UDF turns `plural` into `expectedSingular`
+    private void assertSingular(String expectedSingular, String plural) {
+        Assert.assertEquals(expectedSingular, singularizer.evaluate(plural));
+    }
+
+    @Test
+    public void testNull() {
+        Assert.assertNull(singularizer.evaluate(null));
+    }
+
+    @Test
+    public void testEmpty() {
+        assertSingular("", "");
+    }
+
+    @Test
+    public void testUnchanged() {
+        assertSingular("christmas", "christmas");
+    }
+
+    @Test
+    public void testCompound() {
+        assertSingular("mother-in-law", "mothers-in-law");
+    }
+
+    @Test
+    public void testTailSingleQuote() {
+        assertSingular("dog's", "dogs'");
+    }
+
+    @Test
+    public void testIrregular() {
+        assertSingular("child", "children");
+    }
+
+    @Test
+    public void testRule() {
+        // {plural, expected singular}
+        final String[][] cases = { {"apples", "apple"}, {"buses", "bus"}, {"candies", "candy"}};
+        for (String[] pair : cases) {
+            assertSingular(pair[1], pair[0]);
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 5f4a57c..9b2f67a 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -481,6 +481,9 @@ CREATE FUNCTION tokenize as 'hivemall.tools.text.TokenizeUDF' USING JAR '${hivem
DROP FUNCTION IF EXISTS is_stopword;
CREATE FUNCTION is_stopword as 'hivemall.tools.text.StopwordUDF' USING JAR '${hivemall_jar}';
+DROP FUNCTION IF EXISTS singularize;
+CREATE FUNCTION singularize as 'hivemall.tools.text.SingularizeUDF' USING JAR '${hivemall_jar}';
+
DROP FUNCTION IF EXISTS split_words;
CREATE FUNCTION split_words as 'hivemall.tools.text.SplitWordsUDF' USING JAR '${hivemall_jar}';
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 2dd61c7..ae91aa2 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -477,6 +477,9 @@ create temporary function tokenize as 'hivemall.tools.text.TokenizeUDF';
drop temporary function is_stopword;
create temporary function is_stopword as 'hivemall.tools.text.StopwordUDF';
+drop temporary function if exists singularize;
+create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
+
drop temporary function split_words;
create temporary function split_words as 'hivemall.tools.text.SplitWordsUDF';
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 7d6e0b2..39a0480 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -388,6 +388,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize AS 'hivemall.tools.text.Token
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS is_stopword")
sqlContext.sql("CREATE TEMPORARY FUNCTION is_stopword AS 'hivemall.tools.text.StopwordUDF'")
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS singularize")
+sqlContext.sql("CREATE TEMPORARY FUNCTION singularize AS 'hivemall.tools.text.SingularizeUDF'")
+
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS split_words")
sqlContext.sql("CREATE TEMPORARY FUNCTION split_words AS 'hivemall.tools.text.SplitWordsUDF'")
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 00ecd30..f11298c 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -144,6 +144,7 @@ create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF';
create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF';
create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUDF';
create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedList';
+create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
-- NLP features
create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';