You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/07/01 13:17:25 UTC
[systemds] branch master updated: [SYSTEMDS-3036] Stemming function PorterStemmer Functionality added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 18f4611  [SYSTEMDS-3036] Stemming function PorterStemmer Functionality added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")
18f4611 is described below

commit 18f46113e0eb04720e390a1df9aead6ab8b373bc
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Thu Jul 1 15:00:35 2021 +0200

    [SYSTEMDS-3036] Stemming function PorterStemmer
    Functionality added in map() call.
    Syntax
    output = map(input, "x -> x.PorterStemmer.stem(x)")
    
    Closes #1319.
---
 .../sysds/runtime/matrix/data/FrameBlock.java      |   1 +
 .../apache/sysds/runtime/util/PorterStemmer.java   | 340 +++++++++++++++++++++
 .../builtin/BuiltinPorterStemmerTest.java          | 101 ++++++
 .../resources/datasets/stemming/dictionary.csv     | 110 +++++++
 .../functions/builtin/porterStemmerTest.dml        |  29 ++
 5 files changed, 581 insertions(+)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 0f77d53..322cfad 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -2212,6 +2212,7 @@ public class FrameBlock implements CacheBlock, Externalizable  {
 
 		// construct class code
 		sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n");
+		sb.append("import org.apache.sysds.runtime.util.PorterStemmer;\n");
 		sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n");
 		sb.append("public class " + cname + " extends FrameMapFunction {\n");
 		if(varname.length == 1) {
diff --git a/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
new file mode 100644
index 0000000..bc084fa
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysds.runtime.util;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Stemmer, implementing the Porter Stemming Algorithm
+ *
+ * The Stemmer class transforms a word into its root form.  The input
+ * word can be provided a character at time (by calling add()), or at once
+ * by calling one of the various stem(something) methods.
+ */
+
+public class PorterStemmer
+{
+   /* m() measures the number of consonant sequences between 0 and j. if c is
+      a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+      presence,
+
+         <c><v>       gives 0
+         <c>vc<v>     gives 1
+         <c>vcvc<v>   gives 2
+         <c>vcvcvc<v> gives 3
+         ....
+   */
+
+	private static int calcM(String word)
+	{
+		int l = word.length() ;
+		int count = 0;
+		boolean currentConst = false;
+		for(int c = 0; c < l; c++) {
+			if(cons(word, c))
+			{
+				if(!currentConst && c != 0) {
+					count += 1;
+				}
+				currentConst = true;
+			}
+			else
+				currentConst = false;
+
+		}
+		return  count;
+	}
+
+	/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+	private static boolean doublec(String word)
+	{  int len = word.length() - 1;
+		if (len < 1) return false;
+		if (word.charAt(len) != word.charAt(len - 1)) return false;
+		return cons(word, len);
+	}
+
+   /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+      and also if the second c is not w,x or y. this is used when trying to
+      restore an e at the end of a short word. e.g.
+
+         cav(e), lov(e), hop(e), crim(e), but
+         snow, box, tray.
+*	*/
+	private static boolean cvc(String word)
+	{
+		int len = word.length();
+		int l = len - 1;
+		if (len < 3)
+			return false;
+		if(!cons(word, l) | cons(word, l-1) | !cons(word, (l-2)))
+			return false;
+		String ch = String.valueOf(word.charAt(l));
+		String exceptions = "wxy";
+		return !exceptions.contains(ch);
+	}
+
+	/* vowelinstem() is true <=> 0,...j contains a vowel */
+	private static boolean  vowelinStem(String word, String suffix) {
+		int length = word.length() - suffix.length();
+		for(int i=0; i<length; i++)
+			if(!cons(word, i))
+				return true;
+
+		return false;
+	}
+
+	/* cons(i) is true <=> b[i] is a consonant. */
+
+	private static boolean cons(String stem, int i)
+	{
+		String vowels = "aeiou";
+		char ch = stem.charAt(i);
+		if(vowels.contains(String.valueOf(stem.charAt(i))))
+			return false;
+		if(ch == 'y')
+		{
+			if(i == 0)
+				return true;
+			else
+				return (!cons(stem, i - 1));
+		}
+		return true;
+	}
+	// process the collection of tuples to find which prefix matches the case.
+	private static String processMatched(String word, HashMap suffixAndfix, int mCount)
+	{
+		String stemmed = null;
+		Iterator it = suffixAndfix.entrySet().iterator();
+		while (it.hasNext() && (stemmed == null)) {
+			Map.Entry pair = (Map.Entry)it.next();
+			stemmed = replacer(word, pair.getKey().toString(), pair.getValue().toString(), mCount);
+			it.remove();
+		}
+		return stemmed;
+	}
+
+	//	replace the suffix with suggeston
+	private static String replacer(String word, String orig, String replace, int mCount)
+	{
+		int l = word.length();
+		int suffixLength = orig.length();
+
+		if (word.endsWith(orig))
+		{
+			String stem = word.substring( 0, l - suffixLength);
+			int  m = calcM( stem );
+			if (m > mCount)
+				return stem.concat(replace);
+			else
+				return word;
+
+		}
+
+		return null;
+	}
+
+	/* step1() gets rid of plurals and -ed or -ing. e.g.
+	i.e., condition & suffix -> replacement
+		SSES -> SS
+		IES  -> I
+		SS -> SS
+		S -> ""
+		(m > 0) EED -> EE
+		vowelSequence(ED) -> ""
+		vowelsequence(ING) -> ""
+		any("at, bl, iz")  -> add(e)
+		doubleconsonant and not("l", "s", "z") -> remove single letter from end
+		(m == 1 and cvc) -> add(e)
+		turns terminal y to i when there is another vowel in the stem.
+   */
+
+	private static String step1(String word)
+	{
+		boolean flag = false;
+		if (word.endsWith("s"))
+		{
+			if (word.endsWith("sses"))
+				word = StringUtils.removeEnd(word, "es");
+			else if (word.endsWith("ies")) {
+				word = StringUtils.removeEnd(word, "ies").concat("i");
+			}
+			else if (!word.endsWith("ss") && word.endsWith("s"))
+				word = StringUtils.removeEnd(word, "s");
+		}
+		if (word.endsWith("eed"))
+		{
+			if (calcM(word) > 1)
+				word = StringUtils.removeEnd(word, "d");
+		}
+		else if(word.endsWith("ed") && vowelinStem(word, "ed")) {
+			word = StringUtils.removeEnd(word, "ed");
+			flag = true;
+		}
+		else if(word.endsWith("ing") && vowelinStem(word, "ing"))
+		{
+			word = StringUtils.removeEnd(word, "ing");
+
+			flag = true;
+		}
+
+		if (flag)
+		{
+			if(word.endsWith("at") || word.endsWith("bl") || word.endsWith("iz"))
+				word = word.concat("e");
+			int m = calcM(word);
+			String last = String.valueOf(word.charAt(word.length() - 1));
+			if (doublec(word) && !"lsz".contains(last))
+				word = word.substring(0, word.length() - 1);
+			else if (m == 1 && cvc(word))
+				word = word.concat("e");
+		}
+		if (word.endsWith("y") && vowelinStem(word, "y"))
+			word = StringUtils.removeEnd(word, "y").concat("i");
+
+		return word;
+	}
+
+	// step2() maps double suffices to single ones
+
+	private static String step2(String word) {
+		int len = word .length();
+		if (len == 0) return word;
+		HashMap<String, String> suffixAndfix = new HashMap<String, String>()
+		{{
+			put("ational", "ate");
+			put("tional","tion");
+			put("enci","ence");
+			put("anci","ance");
+			put("izer","ize");
+			put("bli","ble");
+			put("alli", "al");
+			put("entli","ent");
+			put("eli","e");
+			put("ousli","ous");
+			put("ization","ize");
+			put("ation","ate");
+			put("ator","ate");
+			put("alism","al");
+			put("iveness", "ive");
+			put("fulness","ful");
+			put("ousness", "ous");
+			put("aliti", "al");
+			put("iviti","ive");
+			put("biliti", "ble");
+			put("log", "logi");
+			put("icate", "ic");
+			put("ative","");
+			put("alize","al");
+			put("iciti","ic");
+			put("ical","ic");
+		}};
+
+		String stemmed = processMatched(word, suffixAndfix, 0);
+		return (stemmed != null)? stemmed: word;
+	}
+	// handles -ic-, -full, -ness etc.
+	private static String step3(String word) {
+		int len = word .length();
+		if (len == 0) return word;
+		HashMap<String, String> suffixAndfix = new HashMap<String, String>()
+		{{
+			put("icate", "ic");
+			put("ative","");
+			put("alize","al");
+			put("iciti","ic");
+			put("ical","ic");
+			put("ful","");
+			put("ness","");
+		}};
+
+		String stemmed = processMatched(word, suffixAndfix, 0);
+		return (stemmed != null)? stemmed: word;
+
+	}
+
+	// takes off -ant, -ence etc., in context <c>vcvc<v>
+	private static String step4(String word)
+	{
+		// first part.
+		String[] suffix = new String[] {"al", "ance", "ence", "er", "ic", "able", "ible", "ant",
+			"ement", "ment", "ent"};
+		String stemmed = null;
+		int i = 0;
+		while(stemmed == null && i < suffix.length)
+		{
+			stemmed = replacer(word, suffix[i], "", 1);
+			i++;
+		}
+		// exceptions
+		if(stemmed == null)
+		{
+			if(word.length() > 4)
+			{
+				char ch = word.charAt(word.length() - 4);
+				if(ch == 's' || ch == 't')
+				{
+					stemmed = replacer(word, "ion", "", 1);
+				}
+			}
+		}
+		// exceptions
+		if (stemmed == null)
+		{
+			suffix = new String[] {"ou",  "ism", "ate", "iti", "ous", "ive", "ize"};
+			i = 0;
+			while(stemmed == null && i < suffix.length)
+			{
+				stemmed = replacer(word, suffix[i], "", 1);
+				i++;
+			}
+		}
+
+		return (stemmed != null)? stemmed: word;
+	}
+	//	handle the last e and l
+	private static String step5(String word)
+	{
+		String stem = StringUtils.removeEnd(word, "e");
+		if(word.endsWith("e") && calcM(word) > 1)
+			word =  stem;
+		if(word.endsWith("e") && calcM(word)  == 1 && !cvc(stem))
+			word = stem;
+		if(word.endsWith("l") && doublec(word) && calcM(word) > 1)
+			word = word.substring(0, word.length() - 1);
+
+		return word;
+	}
+	public static String stem (String word)
+	{
+		if(word.length() >= 3) {
+			word = step1(word);
+			word = step2(word);
+			word = step3(word);
+			word = step4(word);
+			if(word.length() > 0)
+				word = step5(word);
+		}
+		return word;
+	}
+}
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
new file mode 100644
index 0000000..ac13190
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class BuiltinPorterStemmerTest extends AutomatedTestBase {
+
+	private final static String TEST_NAME = "porterStemmerTest";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = BuiltinPorterStemmerTest.class.getSimpleName() + "/";
+	private static final String INPUT = DATASET_DIR +"stemming/dictionary.csv";
+
+	@BeforeClass
+	public static void init() {
+		TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+	}
+
+	@AfterClass
+	public static void cleanUp() {
+		if (TEST_CACHE_ENABLED) {
+			TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+		}
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"D"}));
+		if (TEST_CACHE_ENABLED) {
+			setOutAndExpectedDeletionDisabled(true);
+		}
+	}
+	@Test
+	public void testStemmerCP() {
+		runStemmerTest(Types.ExecMode.SINGLE_NODE);
+	}
+
+	@Test
+	public void testStemmerSpark() {
+		runStemmerTest(Types.ExecMode.SPARK);
+	}
+
+	private void runStemmerTest(Types.ExecMode et)
+	{
+		Types.ExecMode modeOld = setExecMode(et);
+		try {
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[] {"-args", INPUT, output("S"), output("E")};
+
+			runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+			FrameBlock outputFrame = readDMLFrameFromHDFS("S", Types.FileFormat.CSV);
+			FrameBlock inputFrame = readDMLFrameFromHDFS("E", Types.FileFormat.CSV);
+			String[] output = (String[])outputFrame.getColumnData(0);
+			String[] input = (String[])inputFrame.getColumnData(0);
+			//expected vs stemmer output
+			int count = 0;
+			for(int i = 0; i<input.length; i++) {
+				if(input[i].equals(output[i]))
+					count++;
+			}
+			Assert.assertEquals(110, count, 10);
+		}
+		catch(IOException e) {
+			e.printStackTrace();
+		}
+		finally {
+			resetExecMode(modeOld);
+		}
+	}
+}
diff --git a/src/test/resources/datasets/stemming/dictionary.csv b/src/test/resources/datasets/stemming/dictionary.csv
new file mode 100644
index 0000000..ec79e4e
--- /dev/null
+++ b/src/test/resources/datasets/stemming/dictionary.csv
@@ -0,0 +1,110 @@
+Words,Stemmed Equivalents
+caresses,caress
+ponies,poni
+ties,ti
+caress,caress
+cats,cat
+feed,feed
+agreed,agre
+disabled,disabl
+matting,mat
+mating,mate
+meeting,meet
+milling,mill
+messing,mess
+meetings,meet
+abbess,abbess
+abbey,abbei
+abbeys,abbei
+abbominable,abbomin
+abbot,abbot
+abbots,abbot
+abbreviated,abbrevi
+abdication,abdic
+abduction,abduct
+abed,ab
+abel,abel
+aberga,aberga
+abergavenny,abergavenni
+abet,abet
+abetting,abet
+abhominable,abhomin
+abhor,abhor
+abhorr,abhorr
+abhorred,abhor
+abhorrence,abhorr
+abhorring,abhor
+abhors,abhor
+abhorson,abhorson
+abide,abid
+abides,abid
+abilities,abil
+ability,abil
+abingdon,abingdon
+abject,abject
+abjectly,abjectli
+abjects,abject
+abjur,abjur
+abjure,abjur
+abler,abler
+ablest,ablest
+aboard,aboard
+accomplishment,accomplish
+boiling,boil
+boils,boil
+booths,booth
+booties,booti
+boundary,boundari
+bounded,bound
+cuts,cut
+cytherea,cytherea
+dawdling,dawdl
+deception,decept
+matting,mat
+management,manag
+manager,manag
+managing,manag
+manakin,manakin
+manasseh,manasseh
+manchen,manchen
+manchester,manchest
+manchus,manchu
+mandate,mandat
+mandragora,mandragora
+mandrake,mandrak
+mandrakes,mandrak
+mane,mane
+manent,manent
+manes,mane
+manet,manet
+manfully,manfulli
+mangelwurzel,mangelwurzel
+mangle,mangl
+mangled,mangl
+mangles,mangl
+mangling,mangl
+mangnall,mangnal
+mango,mango
+mangoes,mango
+mangy,mangi
+manhood,manhood
+manhoods,manhood
+mania,mania
+manifest,manifest
+manifested,manifest
+wived,wive
+wonderfully,wonderfulli
+wondering,wonder
+wonders,wonder
+worms,worm
+wormwood,wormwood
+wormy,wormi
+worn,worn
+worret,worret
+worried,worri
+worries,worri
+worry,worri
+worrying,worri
+worser,worser
+worship,worship
+worshipful,worship
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/porterStemmerTest.dml b/src/test/scripts/functions/builtin/porterStemmerTest.dml
new file mode 100644
index 0000000..70fb877
--- /dev/null
+++ b/src/test/scripts/functions/builtin/porterStemmerTest.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# input frame contain two columns of actual words and their stemmed equivalents 
+words = read($1, data_type = "frame", header = TRUE, format = "csv")
+stemmed = map(words[, 1], "x -> PorterStemmer.stem(x)")
+# write the results from the stemmer
+write(stemmed, $2, format = "csv")
+# write the dictionary equivalents for matching in java file
+equ = words[, 2]
+write(equ, $3, format = "csv")
\ No newline at end of file