You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/07/01 13:17:25 UTC
[systemds] branch master updated: [SYSTEMDS-3036] Stemming function
PorterStemmer Functionality added in map() call. Syntax output = map(input,
"x -> x.PorterStemmer.stem(x)")
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 18f4611 [SYSTEMDS-3036] Stemming function PorterStemmer Functionality added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")
18f4611 is described below
commit 18f46113e0eb04720e390a1df9aead6ab8b373bc
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Thu Jul 1 15:00:35 2021 +0200
[SYSTEMDS-3036] Stemming function PorterStemmer
Functionality added in map() call.
Syntax
output = map(input, "x -> x.PorterStemmer.stem(x)")
Closes #1319.
---
.../sysds/runtime/matrix/data/FrameBlock.java | 1 +
.../apache/sysds/runtime/util/PorterStemmer.java | 340 +++++++++++++++++++++
.../builtin/BuiltinPorterStemmerTest.java | 101 ++++++
.../resources/datasets/stemming/dictionary.csv | 110 +++++++
.../functions/builtin/porterStemmerTest.dml | 29 ++
5 files changed, 581 insertions(+)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 0f77d53..322cfad 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -2212,6 +2212,7 @@ public class FrameBlock implements CacheBlock, Externalizable {
// construct class code
sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n");
+ sb.append("import org.apache.sysds.runtime.util.PorterStemmer;\n");
sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n");
sb.append("public class " + cname + " extends FrameMapFunction {\n");
if(varname.length == 1) {
diff --git a/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
new file mode 100644
index 0000000..bc084fa
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysds.runtime.util;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Stemmer, implementing the Porter Stemming Algorithm
+ *
+ * The Stemmer class transforms a word into its root form. The input
+ * word can be provided a character at time (by calling add()), or at once
+ * by calling one of the various stem(something) methods.
+ */
+
+public class PorterStemmer
+{
+ /* m() measures the number of consonant sequences between 0 and j. if c is
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ */
+
+ private static int calcM(String word)
+ {
+ int l = word.length() ;
+ int count = 0;
+ boolean currentConst = false;
+ for(int c = 0; c < l; c++) {
+ if(cons(word, c))
+ {
+ if(!currentConst && c != 0) {
+ count += 1;
+ }
+ currentConst = true;
+ }
+ else
+ currentConst = false;
+
+ }
+ return count;
+ }
+
+ /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+ private static boolean doublec(String word)
+ { int len = word.length() - 1;
+ if (len < 1) return false;
+ if (word.charAt(len) != word.charAt(len - 1)) return false;
+ return cons(word, len);
+ }
+
+ /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short word. e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+* */
+ private static boolean cvc(String word)
+ {
+ int len = word.length();
+ int l = len - 1;
+ if (len < 3)
+ return false;
+ if(!cons(word, l) | cons(word, l-1) | !cons(word, (l-2)))
+ return false;
+ String ch = String.valueOf(word.charAt(l));
+ String exceptions = "wxy";
+ return !exceptions.contains(ch);
+ }
+
+ /* vowelinstem() is true <=> 0,...j contains a vowel */
+ private static boolean vowelinStem(String word, String suffix) {
+ int length = word.length() - suffix.length();
+ for(int i=0; i<length; i++)
+ if(!cons(word, i))
+ return true;
+
+ return false;
+ }
+
+ /* cons(i) is true <=> b[i] is a consonant. */
+
+ private static boolean cons(String stem, int i)
+ {
+ String vowels = "aeiou";
+ char ch = stem.charAt(i);
+ if(vowels.contains(String.valueOf(stem.charAt(i))))
+ return false;
+ if(ch == 'y')
+ {
+ if(i == 0)
+ return true;
+ else
+ return (!cons(stem, i - 1));
+ }
+ return true;
+ }
+ // process the collection of tuples to find which prefix matches the case.
+ private static String processMatched(String word, HashMap suffixAndfix, int mCount)
+ {
+ String stemmed = null;
+ Iterator it = suffixAndfix.entrySet().iterator();
+ while (it.hasNext() && (stemmed == null)) {
+ Map.Entry pair = (Map.Entry)it.next();
+ stemmed = replacer(word, pair.getKey().toString(), pair.getValue().toString(), mCount);
+ it.remove();
+ }
+ return stemmed;
+ }
+
+ // replace the suffix with suggeston
+ private static String replacer(String word, String orig, String replace, int mCount)
+ {
+ int l = word.length();
+ int suffixLength = orig.length();
+
+ if (word.endsWith(orig))
+ {
+ String stem = word.substring( 0, l - suffixLength);
+ int m = calcM( stem );
+ if (m > mCount)
+ return stem.concat(replace);
+ else
+ return word;
+
+ }
+
+ return null;
+ }
+
+ /* step1() gets rid of plurals and -ed or -ing. e.g.
+ i.e., condition & suffix -> replacement
+ SSES -> SS
+ IES -> I
+ SS -> SS
+ S -> ""
+ (m > 0) EED -> EE
+ vowelSequence(ED) -> ""
+ vowelsequence(ING) -> ""
+ any("at, bl, iz") -> add(e)
+ doubleconsonant and not("l", "s", "z") -> remove single letter from end
+ (m == 1 and cvc) -> add(e)
+ turns terminal y to i when there is another vowel in the stem.
+ */
+
+ private static String step1(String word)
+ {
+ boolean flag = false;
+ if (word.endsWith("s"))
+ {
+ if (word.endsWith("sses"))
+ word = StringUtils.removeEnd(word, "es");
+ else if (word.endsWith("ies")) {
+ word = StringUtils.removeEnd(word, "ies").concat("i");
+ }
+ else if (!word.endsWith("ss") && word.endsWith("s"))
+ word = StringUtils.removeEnd(word, "s");
+ }
+ if (word.endsWith("eed"))
+ {
+ if (calcM(word) > 1)
+ word = StringUtils.removeEnd(word, "d");
+ }
+ else if(word.endsWith("ed") && vowelinStem(word, "ed")) {
+ word = StringUtils.removeEnd(word, "ed");
+ flag = true;
+ }
+ else if(word.endsWith("ing") && vowelinStem(word, "ing"))
+ {
+ word = StringUtils.removeEnd(word, "ing");
+
+ flag = true;
+ }
+
+ if (flag)
+ {
+ if(word.endsWith("at") || word.endsWith("bl") || word.endsWith("iz"))
+ word = word.concat("e");
+ int m = calcM(word);
+ String last = String.valueOf(word.charAt(word.length() - 1));
+ if (doublec(word) && !"lsz".contains(last))
+ word = word.substring(0, word.length() - 1);
+ else if (m == 1 && cvc(word))
+ word = word.concat("e");
+ }
+ if (word.endsWith("y") && vowelinStem(word, "y"))
+ word = StringUtils.removeEnd(word, "y").concat("i");
+
+ return word;
+ }
+
+ // step2() maps double suffices to single ones
+
+ private static String step2(String word) {
+ int len = word .length();
+ if (len == 0) return word;
+ HashMap<String, String> suffixAndfix = new HashMap<String, String>()
+ {{
+ put("ational", "ate");
+ put("tional","tion");
+ put("enci","ence");
+ put("anci","ance");
+ put("izer","ize");
+ put("bli","ble");
+ put("alli", "al");
+ put("entli","ent");
+ put("eli","e");
+ put("ousli","ous");
+ put("ization","ize");
+ put("ation","ate");
+ put("ator","ate");
+ put("alism","al");
+ put("iveness", "ive");
+ put("fulness","ful");
+ put("ousness", "ous");
+ put("aliti", "al");
+ put("iviti","ive");
+ put("biliti", "ble");
+ put("log", "logi");
+ put("icate", "ic");
+ put("ative","");
+ put("alize","al");
+ put("iciti","ic");
+ put("ical","ic");
+ }};
+
+ String stemmed = processMatched(word, suffixAndfix, 0);
+ return (stemmed != null)? stemmed: word;
+ }
+ // handles -ic-, -full, -ness etc.
+ private static String step3(String word) {
+ int len = word .length();
+ if (len == 0) return word;
+ HashMap<String, String> suffixAndfix = new HashMap<String, String>()
+ {{
+ put("icate", "ic");
+ put("ative","");
+ put("alize","al");
+ put("iciti","ic");
+ put("ical","ic");
+ put("ful","");
+ put("ness","");
+ }};
+
+ String stemmed = processMatched(word, suffixAndfix, 0);
+ return (stemmed != null)? stemmed: word;
+
+ }
+
+ // takes off -ant, -ence etc., in context <c>vcvc<v>
+ private static String step4(String word)
+ {
+ // first part.
+ String[] suffix = new String[] {"al", "ance", "ence", "er", "ic", "able", "ible", "ant",
+ "ement", "ment", "ent"};
+ String stemmed = null;
+ int i = 0;
+ while(stemmed == null && i < suffix.length)
+ {
+ stemmed = replacer(word, suffix[i], "", 1);
+ i++;
+ }
+ // exceptions
+ if(stemmed == null)
+ {
+ if(word.length() > 4)
+ {
+ char ch = word.charAt(word.length() - 4);
+ if(ch == 's' || ch == 't')
+ {
+ stemmed = replacer(word, "ion", "", 1);
+ }
+ }
+ }
+ // exceptions
+ if (stemmed == null)
+ {
+ suffix = new String[] {"ou", "ism", "ate", "iti", "ous", "ive", "ize"};
+ i = 0;
+ while(stemmed == null && i < suffix.length)
+ {
+ stemmed = replacer(word, suffix[i], "", 1);
+ i++;
+ }
+ }
+
+ return (stemmed != null)? stemmed: word;
+ }
+ // handle the last e and l
+ private static String step5(String word)
+ {
+ String stem = StringUtils.removeEnd(word, "e");
+ if(word.endsWith("e") && calcM(word) > 1)
+ word = stem;
+ if(word.endsWith("e") && calcM(word) == 1 && !cvc(stem))
+ word = stem;
+ if(word.endsWith("l") && doublec(word) && calcM(word) > 1)
+ word = word.substring(0, word.length() - 1);
+
+ return word;
+ }
+ public static String stem (String word)
+ {
+ if(word.length() >= 3) {
+ word = step1(word);
+ word = step2(word);
+ word = step3(word);
+ word = step4(word);
+ if(word.length() > 0)
+ word = step5(word);
+ }
+ return word;
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
new file mode 100644
index 0000000..ac13190
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class BuiltinPorterStemmerTest extends AutomatedTestBase {
+
+ private final static String TEST_NAME = "porterStemmerTest";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = BuiltinPorterStemmerTest.class.getSimpleName() + "/";
+ private static final String INPUT = DATASET_DIR +"stemming/dictionary.csv";
+
+ @BeforeClass
+ public static void init() {
+ TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+ }
+
+ @AfterClass
+ public static void cleanUp() {
+ if (TEST_CACHE_ENABLED) {
+ TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+ }
+ }
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"D"}));
+ if (TEST_CACHE_ENABLED) {
+ setOutAndExpectedDeletionDisabled(true);
+ }
+ }
+ @Test
+ public void testStemmerCP() {
+ runStemmerTest(Types.ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testStemmerSpark() {
+ runStemmerTest(Types.ExecMode.SPARK);
+ }
+
+ private void runStemmerTest(Types.ExecMode et)
+ {
+ Types.ExecMode modeOld = setExecMode(et);
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[] {"-args", INPUT, output("S"), output("E")};
+
+ runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+ FrameBlock outputFrame = readDMLFrameFromHDFS("S", Types.FileFormat.CSV);
+ FrameBlock inputFrame = readDMLFrameFromHDFS("E", Types.FileFormat.CSV);
+ String[] output = (String[])outputFrame.getColumnData(0);
+ String[] input = (String[])inputFrame.getColumnData(0);
+ //expected vs stemmer output
+ int count = 0;
+ for(int i = 0; i<input.length; i++) {
+ if(input[i].equals(output[i]))
+ count++;
+ }
+ Assert.assertEquals(110, count, 10);
+ }
+ catch(IOException e) {
+ e.printStackTrace();
+ }
+ finally {
+ resetExecMode(modeOld);
+ }
+ }
+}
diff --git a/src/test/resources/datasets/stemming/dictionary.csv b/src/test/resources/datasets/stemming/dictionary.csv
new file mode 100644
index 0000000..ec79e4e
--- /dev/null
+++ b/src/test/resources/datasets/stemming/dictionary.csv
@@ -0,0 +1,110 @@
+Words,Stemmed Equivalents
+caresses,caress
+ponies,poni
+ties,ti
+caress,caress
+cats,cat
+feed,feed
+agreed,agre
+disabled,disabl
+matting,mat
+mating,mate
+meeting,meet
+milling,mill
+messing,mess
+meetings,meet
+abbess,abbess
+abbey,abbei
+abbeys,abbei
+abbominable,abbomin
+abbot,abbot
+abbots,abbot
+abbreviated,abbrevi
+abdication,abdic
+abduction,abduct
+abed,ab
+abel,abel
+aberga,aberga
+abergavenny,abergavenni
+abet,abet
+abetting,abet
+abhominable,abhomin
+abhor,abhor
+abhorr,abhorr
+abhorred,abhor
+abhorrence,abhorr
+abhorring,abhor
+abhors,abhor
+abhorson,abhorson
+abide,abid
+abides,abid
+abilities,abil
+ability,abil
+abingdon,abingdon
+abject,abject
+abjectly,abjectli
+abjects,abject
+abjur,abjur
+abjure,abjur
+abler,abler
+ablest,ablest
+aboard,aboard
+accomplishment,accomplish
+boiling,boil
+boils,boil
+booths,booth
+booties,booti
+boundary,boundari
+bounded,bound
+cuts,cut
+cytherea,cytherea
+dawdling,dawdl
+deception,decept
+matting,mat
+management,manag
+manager,manag
+managing,manag
+manakin,manakin
+manasseh,manasseh
+manchen,manchen
+manchester,manchest
+manchus,manchu
+mandate,mandat
+mandragora,mandragora
+mandrake,mandrak
+mandrakes,mandrak
+mane,mane
+manent,manent
+manes,mane
+manet,manet
+manfully,manfulli
+mangelwurzel,mangelwurzel
+mangle,mangl
+mangled,mangl
+mangles,mangl
+mangling,mangl
+mangnall,mangnal
+mango,mango
+mangoes,mango
+mangy,mangi
+manhood,manhood
+manhoods,manhood
+mania,mania
+manifest,manifest
+manifested,manifest
+wived,wive
+wonderfully,wonderfulli
+wondering,wonder
+wonders,wonder
+worms,worm
+wormwood,wormwood
+wormy,wormi
+worn,worn
+worret,worret
+worried,worri
+worries,worri
+worry,worri
+worrying,worri
+worser,worser
+worship,worship
+worshipful,worship
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/porterStemmerTest.dml b/src/test/scripts/functions/builtin/porterStemmerTest.dml
new file mode 100644
index 0000000..70fb877
--- /dev/null
+++ b/src/test/scripts/functions/builtin/porterStemmerTest.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# input frame contain two columns of actual words and their stemmed equivalents
+words = read($1, data_type = "frame", header = TRUE, format = "csv")
+stemmed = map(words[, 1], "x -> PorterStemmer.stem(x)")
+# write the results from the stemmer
+write(stemmed, $2, format = "csv")
+# write the dictionary equivalents for matching in java file
+equ = words[, 2]
+write(equ, $3, format = "csv")
\ No newline at end of file