Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/04/22 18:42:14 UTC

[opennlp-sandbox] 01/01: removes `PStemmer` from similarity component, now relying on OpenNLP tools' default PorterStemmer (DRY); improves formatting along the path

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch Remove_copy_of_PorterStemmer_from_similarity_component_(DRY)_
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git

commit c63443bae054d877b5bec296e126fdb80e6ab1f6
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sat Apr 22 20:42:05 2023 +0200

    removes `PStemmer` from similarity component, now relying on OpenNLP tools' default PorterStemmer (DRY)
    improves formatting along the path
---
 .../parse_thicket/matching/LemmaFormManager.java   |   5 +-
 .../parse_thicket/matching/LemmaGeneralizer.java   |   5 +-
 .../parse_thicket/matching/PhraseGeneralizer.java  |   7 +-
 .../similarity/apps/StoryDiscourseNavigator.java   |  21 +-
 .../apps/taxo_builder/DomainTaxonomyExtender.java  |  20 +-
 .../taxo_builder/TaxonomyExtenderViaMebMining.java |  15 +-
 .../apps/utils/StringDistanceMeasurer.java         |  14 +-
 .../main/java/opennlp/tools/stemmer/PStemmer.java  | 511 ---------------------
 .../tools/textsimilarity/LemmaFormManager.java     |   7 +-
 .../ParseTreeMatcherDeterministic.java             |  20 +-
 .../tools/textsimilarity/TextProcessor.java        |  25 +-
 .../src/test/resources/sentence_parseObject.csv    |   2 +-
 12 files changed, 60 insertions(+), 592 deletions(-)
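
The hunks below repeatedly apply one call-site pattern: fields of the copied PStemmer type become fields typed against the opennlp-tools Stemmer interface, instantiated as PorterStemmer, and stem(...) results gain a toString() because Stemmer#stem(CharSequence) returns a CharSequence rather than a String. A minimal sketch of that pattern follows; the class and method names here are illustrative only and are not part of the commit:

    import opennlp.tools.stemmer.PorterStemmer;
    import opennlp.tools.stemmer.Stemmer;

    class StemmingExample {                          // hypothetical name, for illustration
        // Program against the interface; PorterStemmer is the opennlp-tools
        // implementation that replaces the local PStemmer copy.
        private final Stemmer ps = new PorterStemmer();

        String stemOf(String word) {                 // hypothetical helper
            // Stemmer#stem(CharSequence) returns a CharSequence,
            // hence the explicit toString() at the call site.
            return ps.stem(word.toLowerCase()).toString();
        }
    }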

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
index bdb4052..fa9f951 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
@@ -17,12 +17,11 @@
 
 package opennlp.tools.parse_thicket.matching;
 
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.Stemmer;
 
 public class LemmaFormManager {
 
-  public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
-      String POS) {
+  public String matchLemmas(Stemmer ps, String lemma1, String lemma2, String POS) {
     if (POS == null) {
       return null;
     }
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
index 42590a2..bd03c52 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
@@ -21,12 +21,13 @@ import java.util.ArrayList;
 import java.util.List;
 
 import opennlp.tools.parse_thicket.IGeneralizer;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 import opennlp.tools.word2vec.W2VDistanceMeasurer;
 
 public class LemmaGeneralizer implements IGeneralizer<String> {
 	public static final String W2V_PREFIX = "w2v_";
-	private final PStemmer ps = new PStemmer();
+	private final Stemmer ps = new PorterStemmer();
 	private String pos = null;
 	private final W2VDistanceMeasurer w2v;
 	public LemmaGeneralizer() {
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
index 02455a5..976a93e 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
@@ -22,7 +22,8 @@ import java.util.List;
 
 import opennlp.tools.parse_thicket.IGeneralizer;
 import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 import opennlp.tools.textsimilarity.GeneralizationListReducer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 
@@ -33,7 +34,7 @@ public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> {
 
 	protected final PartOfSpeechGeneralizer posManager = new PartOfSpeechGeneralizer();
 	
-	protected final PStemmer ps = new PStemmer();
+	private final Stemmer ps = new PorterStemmer();
 	protected final ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer();
 
 	/**
@@ -248,10 +249,8 @@ public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> {
 			ParseTreeChunk currResult = new ParseTreeChunk(results);
 			//currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
 
-
 			resultChunks.add(currResult);
 		}
-
 		return resultChunks;
 	}
 
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
index 77777d8..574d9ab 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
@@ -27,18 +27,18 @@ import org.apache.commons.lang.StringUtils;
 
 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.TextProcessor;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
 public class StoryDiscourseNavigator {
-	protected final BingQueryRunner yrunner = new BingQueryRunner();
-	final ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
-			.getInstance();
-	private final PStemmer ps = new PStemmer();
-	final PageFetcher pFetcher = new PageFetcher();
+	private final BingQueryRunner yrunner = new BingQueryRunner();
+	private final ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
+	private final Stemmer ps = new PorterStemmer();
+	private final PageFetcher pFetcher = new PageFetcher();
 
 	public static final String[] FREQUENT_PERFORMING_VERBS = {
 		" born raised meet learn ", " graduated enter discover",
@@ -100,8 +100,7 @@ public class StoryDiscourseNavigator {
 		return res;
 	}
 
-	private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
-			String domain, String lang, int numbOfHits) {
+	private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
 		List<List<ParseTreeChunk>> genResult = new ArrayList<>();
 		try {
 			List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);
@@ -129,8 +128,7 @@ public class StoryDiscourseNavigator {
 
 		return genResult;
 	}
-	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
-			List<List<ParseTreeChunk>> matchList) {
+	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList) {
 		List<List<String>> res = new ArrayList<>();
 		for (List<ParseTreeChunk> chunks : matchList) {
 			List<String> wordRes = new ArrayList<>();
@@ -141,7 +139,7 @@ public class StoryDiscourseNavigator {
 							&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
 									.startsWith("VB"))) && lemmas.get(w).length() > 2) {
 						String formedWord = lemmas.get(w);
-						String stemmedFormedWord = ps.stem(formedWord);
+						String stemmedFormedWord = ps.stem(formedWord).toString();
 						if (!stemmedFormedWord.startsWith("invalid"))
 							wordRes.add(formedWord);
 					}
@@ -154,6 +152,7 @@ public class StoryDiscourseNavigator {
 		res = new ArrayList<>(new HashSet<>(res));
 		return res;
 	}
+	
 	public static void main(String[] args){
 		String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
 		System.out.println(Arrays.asList(res));
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
index f7cb34b..6402e2d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
@@ -27,7 +27,8 @@ import org.apache.commons.lang.StringUtils;
 import opennlp.tools.similarity.apps.BingQueryRunner;
 import opennlp.tools.similarity.apps.HitBase;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.TextProcessor;
@@ -39,13 +40,12 @@ public class DomainTaxonomyExtender {
 	private final ParserChunker2MatcherProcessor matcher = ParserChunker2MatcherProcessor.getInstance();
 
 	protected static final String BING_KEY = "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA";
-	
 
 	private final static String TAXO_FILENAME = "taxo_data.dat";
 
 	private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>();
 	private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>();
-	private final PStemmer ps;
+	private final Stemmer ps = new PorterStemmer();
 
 	final CsvAdapter adapter = new CsvAdapter();
 
@@ -63,14 +63,13 @@ public class DomainTaxonomyExtender {
 	}
 
 	public DomainTaxonomyExtender() {
-		ps = new PStemmer();
 		adapter.importCSV();
 		brunner.setKey(BING_KEY);
 	}
 
 	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
-			List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
-			List<String> toAddAtEnd) {
+			List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, List<String> toAddAtEnd) {
+
 		List<List<String>> res = new ArrayList<>();
 		for (List<ParseTreeChunk> chunks : matchList) {
 			List<String> wordRes = new ArrayList<>();
@@ -81,7 +80,7 @@ public class DomainTaxonomyExtender {
 							&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("JJ") || ch.getPOSs().get(w)
 									.startsWith("VB"))) && lemmas.get(w).length() > 2) {
 						String formedWord = lemmas.get(w);
-						String stemmedFormedWord = ps.stem(formedWord);
+						String stemmedFormedWord = ps.stem(formedWord).toString();
 						if (!stemmedFormedWord.startsWith("invalid"))
 							wordRes.add(formedWord);
 					}
@@ -104,7 +103,6 @@ public class DomainTaxonomyExtender {
 
 	public void extendTaxonomy(String fileNameWithTaxonomyRoot, String domain, String lang) {
 
-
 		List<String> entries = new ArrayList<>((adapter.lemma_AssocWords.keySet()));
 		try {
 			for (String entity : entries) { // .
@@ -166,8 +164,7 @@ public class DomainTaxonomyExtender {
 		return genResult;
 	}
 
-	public List<String> runSearchForTaxonomyPathFlatten(String query,
-			String domain, String lang, int numbOfHits) {
+	public List<String> runSearchForTaxonomyPathFlatten(String query, String domain, String lang, int numbOfHits) {
 		List<String> genResult = new ArrayList<>();
 		try {
 			List<HitBase> resultList = brunner.runSearch(query, numbOfHits);
@@ -213,8 +210,7 @@ public class DomainTaxonomyExtender {
 
 	public static void main(String[] args) {
 		DomainTaxonomyExtender self = new DomainTaxonomyExtender();
-		self.extendTaxonomy("", "music",
-				"en");
+		self.extendTaxonomy("", "music", "en");
 
 	}
 
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
index e780330..cbd1229 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
@@ -25,7 +25,8 @@ import java.util.Map;
 import opennlp.tools.similarity.apps.BingQueryRunner;
 import opennlp.tools.similarity.apps.HitBase;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
@@ -44,7 +45,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
 
   private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>();
   private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>();
-  private final PStemmer ps;
+  private final Stemmer ps = new PorterStemmer();
 
   public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
     return assocWords_ExtendedAssocWords;
@@ -66,8 +67,6 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
       System.err.println("Problem loading synt matcher");
 
     }
-    ps = new PStemmer();
-
   }
 
   private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
@@ -83,7 +82,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
               && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
                   .startsWith("VB"))) && lemmas.get(w).length() > 2) {
             String formedWord = lemmas.get(w);
-            String stemmedFormedWord = ps.stem(formedWord);
+            String stemmedFormedWord = ps.stem(formedWord).toString();
             if (!stemmedFormedWord.startsWith("invalid"))
               wordRes.add(formedWord);
           }
@@ -114,8 +113,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
                                                                             // here
           query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
               .replace('_', ' ');
-          List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
-              query, "", lang, 30);
+          List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
           List<String> toRemoveFromExtension = new ArrayList<>(taxoPath);
           toRemoveFromExtension.add(entity);
           toRemoveFromExtension.add(domain);
@@ -135,8 +133,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
     ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
   }
 
-  public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
-      String domain, String lang, int numbOfHits) {
+  public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
     List<List<ParseTreeChunk>> genResult = new ArrayList<>();
     try {
       List<HitBase> resultList = runSearch(query, numbOfHits);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
index 8424dab..88a3bac 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
@@ -17,14 +17,14 @@
 
 package opennlp.tools.similarity.apps.utils;
 
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
+
 import java.util.ArrayList;
 import java.util.List;
 
-import opennlp.tools.stemmer.PStemmer;
-
 public class StringDistanceMeasurer {
-  // external tools
-  private final PStemmer ps; // stemmer
+  private final Stemmer ps = new PorterStemmer();
 
   private static final int MIN_STRING_LENGTH_FOR_WORD = 4;
 
@@ -35,11 +35,7 @@ public class StringDistanceMeasurer {
   private static final double MIN_SCORE_FOR_LING = 100; // 0.7;
 
   public StringDistanceMeasurer() {
-    // first get stemmer
-    ps = new PStemmer();
-    if (MIN_SCORE_FOR_LING > 1.0)
-      return;
-
+    
   }
 
   // gets string array and process numbers, applies stemming and forms a list
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java
deleted file mode 100644
index 464d064..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.stemmer;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.FileInputStream;
-
-import org.apache.lucene.util.ArrayUtil;
-
-/**
- * Stemmer, implementing the Porter Stemming Algorithm
- * <p>
- * The Stemmer class transforms a word into its root form.  The input
- * word can be provided a character at time (by calling add()), or at once
- * by calling one of the various stem(something) methods.
- */
-public class PStemmer {
-	private char[] b;
-	private int i,    /* offset into b */
-		j, k, k0;
-	private boolean dirty = false;
-	private static final int INITIAL_SIZE = 50;
-
-	public PStemmer() {
-		b = new char[INITIAL_SIZE];
-		i = 0;
-	}
-
-	/**
-	 * Resets the stemmer, so it can stem another word. If you invoke
-	 * the stemmer by calling {@link #add(char)} and then {@link #stem()}, you must call
-	 * {@code reset()} before starting another word.
-	 */
-	public void reset() { i = 0; dirty = false; }
-
-	/**
-	 * Add a character to the word being stemmed. When you are finished
-	 * adding characters, you can call {@link #stem()} to process the word.
-	 */
-	public void add(char ch) {
-		if (b.length <= i) {
-			b = ArrayUtil.grow(b, i+1);
-		}
-		b[i++] = ch;
-	}
-
-	/**
-	 * After a word has been stemmed, it can be retrieved by toString(),
-	 * or a reference to the internal buffer can be retrieved by getResultBuffer
-	 * and getResultLength (which is generally more efficient.)
-	 */
-	@Override
-	public String toString() { return new String(b,0,i); }
-
-	/**
-	 * Returns the length of the word resulting from the stemming process.
-	 */
-	public int getResultLength() { return i; }
-
-	/**
-	 * Returns a reference to a character buffer containing the results of
-	 * the stemming process.  You also need to consult getResultLength()
-	 * to determine the length of the result.
-	 */
-	public char[] getResultBuffer() { return b; }
-
-	/* cons(i) is true <=> b[i] is a consonant. */
-
-	private boolean cons(int i) {
-		switch (b[i]) {
-		case 'a': case 'e': case 'i': case 'o': case 'u':
-			return false;
-		case 'y':
-			return (i==k0) ? true : !cons(i-1);
-		default:
-			return true;
-		}
-	}
-
-	/* m() measures the number of consonant sequences between k0 and j. if c is
-		 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
-		 presence,
-
-					<c><v>       gives 0
-					<c>vc<v>     gives 1
-					<c>vcvc<v>   gives 2
-					<c>vcvcvc<v> gives 3
-					....
-	*/
-
-	private int m() {
-		int n = 0;
-		int i = k0;
-		while(true) {
-			if (i > j)
-				return n;
-			if (! cons(i))
-				break;
-			i++;
-		}
-		i++;
-		while(true) {
-			while(true) {
-				if (i > j)
-					return n;
-				if (cons(i))
-					break;
-				i++;
-			}
-			i++;
-			n++;
-			while(true) {
-				if (i > j)
-					return n;
-				if (! cons(i))
-					break;
-				i++;
-			}
-			i++;
-		}
-	}
-
-	/* vowelinstem() is true <=> k0,...j contains a vowel */
-
-	private boolean vowelinstem() {
-		int i;
-		for (i = k0; i <= j; i++)
-			if (! cons(i))
-				return true;
-		return false;
-	}
-
-	/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
-
-	private boolean doublec(int j) {
-		if (j < k0+1)
-			return false;
-		if (b[j] != b[j-1])
-			return false;
-		return cons(j);
-	}
-
-	/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
-		 and also if the second c is not w,x or y. this is used when trying to
-		 restore an e at the end of a short word. e.g.
-
-					cav(e), lov(e), hop(e), crim(e), but
-					snow, box, tray.
-
-	*/
-
-	private boolean cvc(int i) {
-		if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
-			return false;
-		else {
-			int ch = b[i];
-			if (ch == 'w' || ch == 'x' || ch == 'y') return false;
-		}
-		return true;
-	}
-
-	private boolean ends(String s) {
-		int l = s.length();
-		int o = k-l+1;
-		if (o < k0)
-			return false;
-		for (int i = 0; i < l; i++)
-			if (b[o+i] != s.charAt(i))
-				return false;
-		j = k-l;
-		return true;
-	}
-
-	/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
-		 k. */
-
-	void setto(String s) {
-		int l = s.length();
-		int o = j+1;
-		for (int i = 0; i < l; i++)
-			b[o+i] = s.charAt(i);
-		k = j+l;
-		dirty = true;
-	}
-
-	/* r(s) is used further down. */
-
-	void r(String s) { if (m() > 0) setto(s); }
-
-	/* step1() gets rid of plurals and -ed or -ing. e.g.
-
-					 caresses  ->  caress
-					 ponies    ->  poni
-					 ties      ->  ti
-					 caress    ->  caress
-					 cats      ->  cat
-
-					 feed      ->  feed
-					 agreed    ->  agree
-					 disabled  ->  disable
-
-					 matting   ->  mat
-					 mating    ->  mate
-					 meeting   ->  meet
-					 milling   ->  mill
-					 messing   ->  mess
-
-					 meetings  ->  meet
-
-	*/
-
-	private void step1() {
-		if (b[k] == 's') {
-			if (ends("sses")) k -= 2;
-			else if (ends("ies")) setto("i");
-			else if (b[k-1] != 's') k--;
-		}
-		if (ends("eed")) {
-			if (m() > 0)
-				k--;
-		}
-		else if ((ends("ed") || ends("ing")) && vowelinstem()) {
-			k = j;
-			if (ends("at")) setto("ate");
-			else if (ends("bl")) setto("ble");
-			else if (ends("iz")) setto("ize");
-			else if (doublec(k)) {
-				int ch = b[k--];
-				if (ch == 'l' || ch == 's' || ch == 'z')
-					k++;
-			}
-			else if (m() == 1 && cvc(k))
-				setto("e");
-		}
-	}
-
-	/* step2() turns terminal y to i when there is another vowel in the stem. */
-
-	private void step2() {
-		if (ends("y") && vowelinstem()) {
-			b[k] = 'i';
-			dirty = true;
-		}
-	}
-
-	/* step3() maps double suffices to single ones. so -ization ( = -ize plus
-		 -ation) maps to -ize etc. note that the string before the suffix must give
-		 m() > 0. */
-
-	private void step3() {
-		if (k == k0) return; /* For Bug 1 */
-		switch (b[k-1]) {
-		case 'a':
-			if (ends("ational")) { r("ate"); break; }
-			if (ends("tional")) { r("tion"); break; }
-			break;
-		case 'c':
-			if (ends("enci")) { r("ence"); break; }
-			if (ends("anci")) { r("ance"); break; }
-			break;
-		case 'e':
-			if (ends("izer")) { r("ize"); break; }
-			break;
-		case 'l':
-			if (ends("bli")) { r("ble"); break; }
-			if (ends("alli")) { r("al"); break; }
-			if (ends("entli")) { r("ent"); break; }
-			if (ends("eli")) { r("e"); break; }
-			if (ends("ousli")) { r("ous"); break; }
-			break;
-		case 'o':
-			if (ends("ization")) { r("ize"); break; }
-			if (ends("ation")) { r("ate"); break; }
-			if (ends("ator")) { r("ate"); break; }
-			break;
-		case 's':
-			if (ends("alism")) { r("al"); break; }
-			if (ends("iveness")) { r("ive"); break; }
-			if (ends("fulness")) { r("ful"); break; }
-			if (ends("ousness")) { r("ous"); break; }
-			break;
-		case 't':
-			if (ends("aliti")) { r("al"); break; }
-			if (ends("iviti")) { r("ive"); break; }
-			if (ends("biliti")) { r("ble"); break; }
-			break;
-		case 'g':
-			if (ends("logi")) { r("log"); break; }
-		}
-	}
-
-	/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
-
-	private void step4() {
-		switch (b[k]) {
-		case 'e':
-			if (ends("icate")) { r("ic"); break; }
-			if (ends("ative")) { r(""); break; }
-			if (ends("alize")) { r("al"); break; }
-			break;
-		case 'i':
-			if (ends("iciti")) { r("ic"); break; }
-			break;
-		case 'l':
-			if (ends("ical")) { r("ic"); break; }
-			if (ends("ful")) { r(""); break; }
-			break;
-		case 's':
-			if (ends("ness")) { r(""); break; }
-			break;
-		}
-	}
-
-	/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
-	private void step5() {
-		if (k == k0) return; /* for Bug 1 */
-		switch (b[k-1]) {
-		case 'a':
-			if (ends("al")) break;
-			return;
-		case 'c':
-			if (ends("ance")) break;
-			if (ends("ence")) break;
-			return;
-		case 'e':
-			if (ends("er")) break; return;
-		case 'i':
-			if (ends("ic")) break; return;
-		case 'l':
-			if (ends("able")) break;
-			if (ends("ible")) break; return;
-		case 'n':
-			if (ends("ant")) break;
-			if (ends("ement")) break;
-			if (ends("ment")) break;
-			/* element etc. not stripped before the m */
-			if (ends("ent")) break;
-			return;
-		case 'o':
-			if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
-			/* j >= 0 fixes Bug 2 */
-			if (ends("ou")) break;
-			return;
-			/* takes care of -ous */
-		case 's':
-			if (ends("ism")) break;
-			return;
-		case 't':
-			if (ends("ate")) break;
-			if (ends("iti")) break;
-			return;
-		case 'u':
-			if (ends("ous")) break;
-			return;
-		case 'v':
-			if (ends("ive")) break;
-			return;
-		case 'z':
-			if (ends("ize")) break;
-			return;
-		default:
-			return;
-		}
-		if (m() > 1)
-			k = j;
-	}
-
-	/* step6() removes a final -e if m() > 1. */
-
-	private void step6() {
-		j = k;
-		if (b[k] == 'e') {
-			int a = m();
-			if (a > 1 || a == 1 && !cvc(k-1))
-				k--;
-		}
-		if (b[k] == 'l' && doublec(k) && m() > 1)
-			k--;
-	}
-
-
-	/**
-	 * Stem a word provided as a String.  Returns the result as a String.
-	 */
-	public String stem(String s) {
-		if (stem(s.toCharArray(), s.length()))
-			return toString();
-		else
-			return s;
-	}
-
-	/** Stem a word contained in a char[].  Returns true if the stemming process
-	 * resulted in a word different from the input.  You can retrieve the
-	 * result with getResultLength()/getResultBuffer() or toString().
-	 */
-	public boolean stem(char[] word) {
-		return stem(word, word.length);
-	}
-
-	/** Stem a word contained in a portion of a char[] array.  Returns
-	 * true if the stemming process resulted in a word different from
-	 * the input.  You can retrieve the result with
-	 * getResultLength()/getResultBuffer() or toString().
-	 */
-	public boolean stem(char[] wordBuffer, int offset, int wordLen) {
-		reset();
-		if (b.length < wordLen) {
-			b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)];
-		}
-		System.arraycopy(wordBuffer, offset, b, 0, wordLen);
-		i = wordLen;
-		return stem(0);
-	}
-
-	/** Stem a word contained in a leading portion of a char[] array.
-	 * Returns true if the stemming process resulted in a word different
-	 * from the input.  You can retrieve the result with
-	 * getResultLength()/getResultBuffer() or toString().
-	 */
-	public boolean stem(char[] word, int wordLen) {
-		return stem(word, 0, wordLen);
-	}
-
-	/** Stem the word placed into the Stemmer buffer through calls to add().
-	 * Returns true if the stemming process resulted in a word different
-	 * from the input.  You can retrieve the result with
-	 * getResultLength()/getResultBuffer() or toString().
-	 */
-	public boolean stem() {
-		return stem(0);
-	}
-
-	public boolean stem(int i0) {
-		k = i - 1;
-		k0 = i0;
-		if (k > k0+1) {
-			step1(); step2(); step3(); step4(); step5(); step6();
-		}
-		// Also, a word is considered dirty if we lopped off letters
-		// Thanks to Ifigenia Vairelles for pointing this out.
-		if (i != k+1)
-			dirty = true;
-		i = k+1;
-		return dirty;
-	}
-
-	/** Test program for demonstrating the Stemmer.  It reads a file and
-	 * stems each word, writing the result to standard out.
-	 * Usage: Stemmer file-name
-	 */
-	public static void main(String[] args) {
-		PStemmer s = new PStemmer();
-
-		for (String arg : args) {
-			try (InputStream in = new FileInputStream(arg)) {
-				byte[] buffer = new byte[1024];
-				int bufferLen, offset, ch;
-
-				bufferLen = in.read(buffer);
-				offset = 0;
-				s.reset();
-
-				while (true) {
-					if (offset < bufferLen)
-						ch = buffer[offset++];
-					else {
-						bufferLen = in.read(buffer);
-						offset = 0;
-						if (bufferLen < 0)
-							ch = -1;
-						else
-							ch = buffer[offset++];
-					}
-
-					if (Character.isLetter((char) ch)) {
-						s.add(Character.toLowerCase((char) ch));
-					} else {
-						s.stem();
-						System.out.print(s);
-						s.reset();
-						if (ch < 0)
-							break;
-						else {
-							System.out.print((char) ch);
-						}
-					}
-				}
-			} catch (IOException e) {
-				System.out.println("error reading " + arg);
-			}
-		}
-	}
-}
-
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
index a72583e..c40767f 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
@@ -17,14 +17,13 @@
 
 package opennlp.tools.textsimilarity;
 
-import java.util.List;
+import opennlp.tools.stemmer.Stemmer;
 
-import opennlp.tools.stemmer.PStemmer;
+import java.util.List;
 
 public class LemmaFormManager {
 
-  public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
-      String POS) {
+  public String matchLemmas(Stemmer ps, String lemma1, String lemma2, String POS) {
     if (POS == null) {
       return null;
     }
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
index 0f2fd20..9a78670 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
@@ -17,11 +17,12 @@
 
 package opennlp.tools.textsimilarity;
 
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
+
 import java.util.ArrayList;
 import java.util.List;
 
-import opennlp.tools.stemmer.PStemmer;
-
 public class ParseTreeMatcherDeterministic {
 
   private final GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
@@ -30,6 +31,8 @@ public class ParseTreeMatcherDeterministic {
 
   private final POSManager posManager = new POSManager();
 
+  private final Stemmer ps = new PorterStemmer();
+
   /**
    * key matching function which takes two phrases, aligns them and finds a set
    * of maximum common sub-phrase
@@ -38,7 +41,6 @@ public class ParseTreeMatcherDeterministic {
    * @param chunk2
    * @return
    */
-
   public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic(
       ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
     List<String> pos1 = chunk1.getPOSs();
@@ -49,10 +51,9 @@ public class ParseTreeMatcherDeterministic {
     List<String> lem1stem = new ArrayList<>();
     List<String> lem2stem = new ArrayList<>();
 
-    PStemmer ps = new PStemmer();
     for (String word : lem1) {
       try {
-        lem1stem.add(ps.stem(word.toLowerCase()));
+        lem1stem.add(ps.stem(word.toLowerCase()).toString());
       } catch (Exception e) {
         // e.printStackTrace();
 
@@ -62,7 +63,7 @@ public class ParseTreeMatcherDeterministic {
     }
     try {
       for (String word : lem2) {
-        lem2stem.add(ps.stem(word.toLowerCase()));
+        lem2stem.add(ps.stem(word.toLowerCase()).toString());
       }
     } catch (Exception e) {
       System.err.println("problem processing word " + lem2.toString());
@@ -130,8 +131,7 @@ public class ParseTreeMatcherDeterministic {
         String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
             lem2.get(k2), sim);
         if ((sim != null)
-            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
-                .equals("fail")))) {
+            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch.equals("fail")))) {
           commonPOS.add(pos1.get(k1));
           if (lemmaMatch != null) {
             commonLemmas.add(lemmaMatch);
@@ -198,8 +198,7 @@ public class ParseTreeMatcherDeterministic {
           bReachedCommonWord = false;
         }
       }
-      ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS,
-          0, 0);
+      ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
       results.add(currResult);
     }
 
@@ -267,7 +266,6 @@ public class ParseTreeMatcherDeterministic {
       resultComps = generalizationListReducer.applyFilteringBySubsumption(resultComps);
       results.add(resultComps);
     }
-
     return results;
   }
 
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
index 55cbf04..cae4565 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
@@ -34,8 +34,9 @@ import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
 
-import opennlp.tools.stemmer.PStemmer;
 import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
 
 public class TextProcessor {
 
@@ -480,7 +481,7 @@ public class TextProcessor {
       }
     }
 
-    return new PStemmer().stem(token).toString();
+    return new PorterStemmer().stem(token);
   }
 
   public static String cleanToken(String token) {
@@ -525,8 +526,7 @@ public class TextProcessor {
 
   public static String stemTerm(String term) {
     term = stripToken(term);
-    PStemmer st = new PStemmer();
-
+    Stemmer st = new PorterStemmer();
     return st.stem(term).toString();
   }
 
@@ -639,9 +639,7 @@ public class TextProcessor {
     StringBuilder finalSummary;
 
     try {
-
-      String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST",
-          "GMT", "AM", "  " };
+      String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST", "GMT", "AM", "  " };
 
       txt = txt.replace(" | ", " ");
       txt = txt.replace(" |", " ");
@@ -808,8 +806,7 @@ public class TextProcessor {
 
     // scrub the title
     if (title.trim().length() > 0 && txt.contains(title.trim())) {
-      txt = txt
-          .substring(txt.indexOf(title.trim()) + title.trim().length() - 1);
+      txt = txt.substring(txt.indexOf(title.trim()) + title.trim().length() - 1);
     }
 
     // scrub before first -
@@ -863,7 +860,7 @@ public class TextProcessor {
   public static List<String> extractUrlsFromText(String txt) {
     List<String> urls = new ArrayList<>();
     // tokenize and iterate
-    String[] tokens = txt.split(" ");
+    String[] tokens = txt.split("\\s+");
     for (String t : tokens) {
       if (t.startsWith("http://")) {
         if (!urls.contains(t)) {
@@ -881,13 +878,12 @@ public class TextProcessor {
     if (segments.size() > 1) {
       List<String> allTokens = new ArrayList<>();
       for (String s : segments) {
-        String[] tks = s.split(" ");
+        String[] tks = s.split("\\s+");
         List<String> tokens = Arrays.asList(tks);
         HashMap<String, Integer> ut = TextProcessor.getUniqueTokenIndex(tokens);
         allTokens.addAll(ut.keySet());
       }
-      HashMap<String, Integer> uniqueTokens = TextProcessor
-          .getUniqueTokenIndex(allTokens);
+      Map<String, Integer> uniqueTokens = TextProcessor.getUniqueTokenIndex(allTokens);
       for (String t : uniqueTokens.keySet()) {
         Integer freq = uniqueTokens.get(t);
         if (freq == segments.size()) {
@@ -895,14 +891,13 @@ public class TextProcessor {
         }
       }
     }
-
     return commonTokens;
   }
 
   public static int numTokensInString(String txt) {
     int retVal = 0;
     if (txt != null && txt.trim().length() > 0) {
-      retVal = txt.trim().split(" ").length;
+      retVal = txt.trim().split("\\s+").length;
     }
     return retVal;
   }
diff --git a/opennlp-similarity/src/test/resources/sentence_parseObject.csv b/opennlp-similarity/src/test/resources/sentence_parseObject.csv
index 6d2d5dd..c11ec1d 100644
--- a/opennlp-similarity/src/test/resources/sentence_parseObject.csv
+++ b/opennlp-similarity/src/test/resources/sentence_parseObject.csv
@@ -1337,4 +1337,4 @@
 "My rental profits are added to my taxable income.  "
 "B-NP","I-NP","I-NP","B-VP","I-VP","B-PP","B-NP","I-NP","I-NP"
 "PRP$","JJ","NNS","VBP","VBN","TO","PRP$","JJ","NN"
-"My","rental","profits","are","added","to","my","taxable","income"
\ No newline at end of file
+"My","rental","profits","are","added","to","my","taxable","income"