You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by ta...@apache.org on 2017/09/22 06:58:19 UTC

[5/7] incubator-hivemall git commit: HIVEMALL-142: Implement `singularize`

HIVEMALL-142: Implement `singularize`


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/8df2e365
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/8df2e365
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/8df2e365

Branch: refs/heads/dev/v0.4.2
Commit: 8df2e36590e5e1d7937121519c76e706915daba4
Parents: e3b2728
Author: Takuya Kitazawa <k....@gmail.com>
Authored: Wed Sep 20 16:30:16 2017 +0900
Committer: Takuya Kitazawa <ta...@apache.org>
Committed: Fri Sep 22 15:49:02 2017 +0900

----------------------------------------------------------------------
 .../hivemall/tools/text/SingularizeUDF.java     | 169 +++++++++++++++++++
 .../java/hivemall/utils/lang/StringUtils.java   |  16 ++
 .../hivemall/tools/text/SingularizeUDFTest.java |  71 ++++++++
 resources/ddl/define-all-as-permanent.hive      |   3 +
 resources/ddl/define-all.hive                   |   3 +
 resources/ddl/define-all.spark                  |   3 +
 resources/ddl/define-udfs.td.hql                |   1 +
 7 files changed, 266 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
new file mode 100644
index 0000000..629dce2
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import hivemall.utils.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+
+// Inspired by
+//  https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src/main/java/io/sundr/codegen/functions/Singularize.java
+//  https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623
+@Description(name = "singularize",
+        value = "_FUNC_(string word) - Returns singular form of a given English word")
+@UDFType(deterministic = true, stateful = false)
+public final class SingularizeUDF extends UDF {
+
+    // sorted by an ascending (i.e., alphabetical) order for binary search
+    // plural preposition to detect compound words like "plural-preposition-something"
+    private static final String[] prepositions = new String[] {"about", "above", "across", "after",
+            "among", "around", "at", "athwart", "before", "behind", "below", "beneath", "beside",
+            "besides", "between", "betwixt", "beyond", "but", "by", "during", "except", "for",
+            "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over", "since",
+            "till", "to", "under", "until", "unto", "upon", "with"};
+    // uninfected or uncountable words
+    private static final String[] unchanged = new String[] {"advice", "bison", "bread", "bream",
+            "breeches", "britches", "butter", "carp", "chassis", "cheese", "christmas", "clippers",
+            "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "electricity",
+            "elk", "equipment", "flounder", "fruit", "furniture", "gallows", "garbage", "georgia",
+            "graffiti", "gravel", "happiness", "headquarters", "herpes", "high-jinks", "homework",
+            "information", "innings", "jackanapes", "ketchup", "knowledge", "love", "luggage",
+            "mackerel", "mathematics", "mayonnaise", "measles", "meat", "mews", "mumps", "mustard",
+            "news", "news", "pincers", "pliers", "proceedings", "progress", "rabies", "research",
+            "rice", "salmon", "sand", "scissors", "series", "shears", "software", "species",
+            "swine", "swiss", "trout", "tuna", "understanding", "water", "whiting", "wildebeest"};
+
+    private static final Map<String, String> irregular = new HashMap<String, String>();
+    static {
+        irregular.put("atlantes", "atlas");
+        irregular.put("atlases", "atlas");
+        irregular.put("axes", "axe");
+        irregular.put("beeves", "beef");
+        irregular.put("brethren", "brother");
+        irregular.put("children", "child");
+        irregular.put("corpora", "corpus");
+        irregular.put("corpuses", "corpus");
+        irregular.put("ephemerides", "ephemeris");
+        irregular.put("feet", "foot");
+        irregular.put("ganglia", "ganglion");
+        irregular.put("geese", "goose");
+        irregular.put("genera", "genus");
+        irregular.put("genii", "genie");
+        irregular.put("graffiti", "graffito");
+        irregular.put("helves", "helve");
+        irregular.put("kine", "cow");
+        irregular.put("leaves", "leaf");
+        irregular.put("loaves", "loaf");
+        irregular.put("men", "man");
+        irregular.put("mongooses", "mongoose");
+        irregular.put("monies", "money");
+        irregular.put("moves", "move");
+        irregular.put("mythoi", "mythos");
+        irregular.put("numena", "numen");
+        irregular.put("occipita", "occiput");
+        irregular.put("octopodes", "octopus");
+        irregular.put("opera", "opus");
+        irregular.put("opuses", "opus");
+        irregular.put("our", "my");
+        irregular.put("oxen", "ox");
+        irregular.put("penes", "penis");
+        irregular.put("penises", "penis");
+        irregular.put("people", "person");
+        irregular.put("sexes", "sex");
+        irregular.put("soliloquies", "soliloquy");
+        irregular.put("teeth", "tooth");
+        irregular.put("testes", "testis");
+        irregular.put("trilbys", "trilby");
+        irregular.put("turves", "turf");
+        irregular.put("zoa", "zoon");
+    }
+
+    private static final List<String> rules = Arrays.asList(
+            // regexp1, replacement1, regexp2, replacement2, ...
+            "(quiz)zes$", "$1", "(matr)ices$", "$1ix", "(vert|ind)ices$", "$1ex", "^(ox)en", "$1",
+            "(alias|status)$", "$1", "(alias|status)es$", "$1", "(octop|vir)us$", "$1us",
+            "(octop|vir)i$", "$1us", "(cris|ax|test)es$", "$1is", "(cris|ax|test)is$", "$1is",
+            "(shoe)s$", "$1", "(o)es$", "$1", "(bus)es$", "$1", "([m|l])ice$", "$1ouse",
+            "(x|ch|ss|sh)es$", "$1", "(m)ovies$", "$1ovie", "(s)eries$", "$1eries",
+            "([^aeiouy]|qu)ies$", "$1y", "([lr])ves$", "$1f", "(tive)s$", "$1", "(hive)s$", "$1",
+            "([^f])ves$", "$1fe", "(^analy)sis$", "$1sis", "(^analy)ses$", "$1sis",
+            "((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis", "([ti])a$",
+            "$1um", "(n)ews$", "$1ews", "(s|si|u)s$", "$1s", "s$", "");
+
+    @Nullable
+    public String evaluate(@Nullable String word) {
+        return singularize(word);
+    }
+
+    @Nullable
+    private static String singularize(@Nullable final String word) {
+        if (word == null) {
+            return null;
+        }
+
+        if (word.isEmpty()) {
+            return word;
+        }
+
+        if (Arrays.binarySearch(unchanged, word) >= 0) {
+            return word;
+        }
+
+        if (word.contains("-")) { // compound words (e.g., mothers-in-law)
+            final List<String> chunks = new ArrayList<String>();
+            Collections.addAll(chunks, word.split("-"));
+            if ((chunks.size() > 1) && (Arrays.binarySearch(prepositions, chunks.get(1)) >= 0)) {
+                String head = chunks.remove(0);
+                return singularize(head) + "-" + StringUtils.join(chunks, "-");
+            }
+        }
+
+        if (word.endsWith("'")) { // dogs' => dog's
+            return singularize(word.substring(0, word.length() - 1)) + "'s";
+        }
+
+        if (irregular.containsKey(word)) {
+            return irregular.get(word);
+        }
+
+        for (int i = 0, n = rules.size(); i < n; i += 2) {
+            Pattern pattern = Pattern.compile(rules.get(i), Pattern.CASE_INSENSITIVE);
+            Matcher matcher = pattern.matcher(word);
+            if (matcher.find()) {
+                return matcher.replaceAll(rules.get(i + 1));
+            }
+        }
+
+        return word;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/main/java/hivemall/utils/lang/StringUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/lang/StringUtils.java b/core/src/main/java/hivemall/utils/lang/StringUtils.java
index c2d17ca..87ad4fb 100644
--- a/core/src/main/java/hivemall/utils/lang/StringUtils.java
+++ b/core/src/main/java/hivemall/utils/lang/StringUtils.java
@@ -182,6 +182,22 @@ public final class StringUtils {
         return buf.toString();
     }
 
+    @Nonnull
+    public static String join(@Nonnull final List<String> list, @Nonnull final String sep) {
+        final StringBuilder buf = new StringBuilder(128);
+        for (int i = 0, size = list.size(); i < size; i++) {
+            if (i > 0) { // append separator before each element, except for the head element
+                buf.append(sep);
+            }
+
+            final String s = list.get(i);
+            if (s != null) {
+                buf.append(s);
+            }
+        }
+        return buf.toString();
+    }
+
     public static String[] split(final String str, final char separatorChar) {
         return split(str, separatorChar, false);
     }

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
new file mode 100644
index 0000000..6ea9cc3
--- /dev/null
+++ b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class SingularizeUDFTest {
+
+    private SingularizeUDF udf;
+
+    @Before
+    public void setUp() {
+        this.udf = new SingularizeUDF();
+    }
+
+    @Test
+    public void testNull() {
+        Assert.assertEquals(null, udf.evaluate(null));
+    }
+
+    @Test
+    public void testEmpty() {
+        Assert.assertEquals("", udf.evaluate(""));
+    }
+
+    @Test
+    public void testUnchanged() {
+        Assert.assertEquals("christmas", udf.evaluate("christmas"));
+    }
+
+    @Test
+    public void testCompound() {
+        Assert.assertEquals("mother-in-law", udf.evaluate("mothers-in-law"));
+    }
+
+    @Test
+    public void testTailSingleQuote() {
+        Assert.assertEquals("dog's", udf.evaluate("dogs'"));
+    }
+
+    @Test
+    public void testIrregular() {
+        Assert.assertEquals("child", udf.evaluate("children"));
+    }
+
+    @Test
+    public void testRule() {
+        Assert.assertEquals("apple", udf.evaluate("apples"));
+        Assert.assertEquals("bus", udf.evaluate("buses"));
+        Assert.assertEquals("candy", udf.evaluate("candies"));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 5f4a57c..9b2f67a 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -481,6 +481,9 @@ CREATE FUNCTION tokenize as 'hivemall.tools.text.TokenizeUDF' USING JAR '${hivem
 DROP FUNCTION IF EXISTS is_stopword;
 CREATE FUNCTION is_stopword as 'hivemall.tools.text.StopwordUDF' USING JAR '${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS singularize;
+CREATE FUNCTION singularize as 'hivemall.tools.text.SingularizeUDF' USING JAR '${hivemall_jar}';
+
 DROP FUNCTION IF EXISTS split_words;
 CREATE FUNCTION split_words as 'hivemall.tools.text.SplitWordsUDF' USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 2dd61c7..ae91aa2 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -477,6 +477,9 @@ create temporary function tokenize as 'hivemall.tools.text.TokenizeUDF';
 drop temporary function is_stopword;
 create temporary function is_stopword as 'hivemall.tools.text.StopwordUDF';
 
+drop temporary function if exists singularize;
+create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
+
 drop temporary function split_words;
 create temporary function split_words as 'hivemall.tools.text.SplitWordsUDF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 7d6e0b2..39a0480 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -388,6 +388,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize AS 'hivemall.tools.text.Token
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS is_stopword")
 sqlContext.sql("CREATE TEMPORARY FUNCTION is_stopword AS 'hivemall.tools.text.StopwordUDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS singularize")
+sqlContext.sql("CREATE TEMPORARY FUNCTION singularize AS 'hivemall.tools.text.SingularizeUDF'")
+
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS split_words")
 sqlContext.sql("CREATE TEMPORARY FUNCTION split_words AS 'hivemall.tools.text.SplitWordsUDF'")
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8df2e365/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 00ecd30..f11298c 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -144,6 +144,7 @@ create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF';
 create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF';
 create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUDF';
 create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedList';
+create temporary function singularize as 'hivemall.tools.text.SingularizeUDF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';