You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/04/22 18:42:14 UTC
[opennlp-sandbox] 01/01: removes `PStemmer` from similarity component, now relying on OpenNLP tools' default PorterStemmer (DRY); improves formatting along the path
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch Remove_copy_of_PorterStemmer_from_similarity_component_(DRY)_
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit c63443bae054d877b5bec296e126fdb80e6ab1f6
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sat Apr 22 20:42:05 2023 +0200
removes `PStemmer` from similarity component, now relying on OpenNLP tools' default PorterStemmer (DRY)
improves formatting along the path
---
.../parse_thicket/matching/LemmaFormManager.java | 5 +-
.../parse_thicket/matching/LemmaGeneralizer.java | 5 +-
.../parse_thicket/matching/PhraseGeneralizer.java | 7 +-
.../similarity/apps/StoryDiscourseNavigator.java | 21 +-
.../apps/taxo_builder/DomainTaxonomyExtender.java | 20 +-
.../taxo_builder/TaxonomyExtenderViaMebMining.java | 15 +-
.../apps/utils/StringDistanceMeasurer.java | 14 +-
.../main/java/opennlp/tools/stemmer/PStemmer.java | 511 ---------------------
.../tools/textsimilarity/LemmaFormManager.java | 7 +-
.../ParseTreeMatcherDeterministic.java | 20 +-
.../tools/textsimilarity/TextProcessor.java | 25 +-
.../src/test/resources/sentence_parseObject.csv | 2 +-
12 files changed, 60 insertions(+), 592 deletions(-)
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
index bdb4052..fa9f951 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
@@ -17,12 +17,11 @@
package opennlp.tools.parse_thicket.matching;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.Stemmer;
public class LemmaFormManager {
- public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
- String POS) {
+ public String matchLemmas(Stemmer ps, String lemma1, String lemma2, String POS) {
if (POS == null) {
return null;
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
index 42590a2..bd03c52 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
@@ -21,12 +21,13 @@ import java.util.ArrayList;
import java.util.List;
import opennlp.tools.parse_thicket.IGeneralizer;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.word2vec.W2VDistanceMeasurer;
public class LemmaGeneralizer implements IGeneralizer<String> {
public static final String W2V_PREFIX = "w2v_";
- private final PStemmer ps = new PStemmer();
+ private final Stemmer ps = new PorterStemmer();
private String pos = null;
private final W2VDistanceMeasurer w2v;
public LemmaGeneralizer() {
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
index 02455a5..976a93e 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java
@@ -22,7 +22,8 @@ import java.util.List;
import opennlp.tools.parse_thicket.IGeneralizer;
import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.textsimilarity.GeneralizationListReducer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
@@ -33,7 +34,7 @@ public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> {
protected final PartOfSpeechGeneralizer posManager = new PartOfSpeechGeneralizer();
- protected final PStemmer ps = new PStemmer();
+ private final Stemmer ps = new PorterStemmer();
protected final ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer();
/**
@@ -248,10 +249,8 @@ public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> {
ParseTreeChunk currResult = new ParseTreeChunk(results);
//currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
-
resultChunks.add(currResult);
}
-
return resultChunks;
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
index 77777d8..574d9ab 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
@@ -27,18 +27,18 @@ import org.apache.commons.lang.StringUtils;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
public class StoryDiscourseNavigator {
- protected final BingQueryRunner yrunner = new BingQueryRunner();
- final ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
- .getInstance();
- private final PStemmer ps = new PStemmer();
- final PageFetcher pFetcher = new PageFetcher();
+ private final BingQueryRunner yrunner = new BingQueryRunner();
+ private final ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
+ private final Stemmer ps = new PorterStemmer();
+ private final PageFetcher pFetcher = new PageFetcher();
public static final String[] FREQUENT_PERFORMING_VERBS = {
" born raised meet learn ", " graduated enter discover",
@@ -100,8 +100,7 @@ public class StoryDiscourseNavigator {
return res;
}
- private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
- String domain, String lang, int numbOfHits) {
+ private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
List<List<ParseTreeChunk>> genResult = new ArrayList<>();
try {
List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);
@@ -129,8 +128,7 @@ public class StoryDiscourseNavigator {
return genResult;
}
- private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
- List<List<ParseTreeChunk>> matchList) {
+ private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList) {
List<List<String>> res = new ArrayList<>();
for (List<ParseTreeChunk> chunks : matchList) {
List<String> wordRes = new ArrayList<>();
@@ -141,7 +139,7 @@ public class StoryDiscourseNavigator {
&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
.startsWith("VB"))) && lemmas.get(w).length() > 2) {
String formedWord = lemmas.get(w);
- String stemmedFormedWord = ps.stem(formedWord);
+ String stemmedFormedWord = ps.stem(formedWord).toString();
if (!stemmedFormedWord.startsWith("invalid"))
wordRes.add(formedWord);
}
@@ -154,6 +152,7 @@ public class StoryDiscourseNavigator {
res = new ArrayList<>(new HashSet<>(res));
return res;
}
+
public static void main(String[] args){
String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
System.out.println(Arrays.asList(res));
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
index f7cb34b..6402e2d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java
@@ -27,7 +27,8 @@ import org.apache.commons.lang.StringUtils;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
@@ -39,13 +40,12 @@ public class DomainTaxonomyExtender {
private final ParserChunker2MatcherProcessor matcher = ParserChunker2MatcherProcessor.getInstance();
protected static final String BING_KEY = "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA";
-
private final static String TAXO_FILENAME = "taxo_data.dat";
private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>();
private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>();
- private final PStemmer ps;
+ private final Stemmer ps = new PorterStemmer();
final CsvAdapter adapter = new CsvAdapter();
@@ -63,14 +63,13 @@ public class DomainTaxonomyExtender {
}
public DomainTaxonomyExtender() {
- ps = new PStemmer();
adapter.importCSV();
brunner.setKey(BING_KEY);
}
private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
- List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
- List<String> toAddAtEnd) {
+ List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, List<String> toAddAtEnd) {
+
List<List<String>> res = new ArrayList<>();
for (List<ParseTreeChunk> chunks : matchList) {
List<String> wordRes = new ArrayList<>();
@@ -81,7 +80,7 @@ public class DomainTaxonomyExtender {
&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("JJ") || ch.getPOSs().get(w)
.startsWith("VB"))) && lemmas.get(w).length() > 2) {
String formedWord = lemmas.get(w);
- String stemmedFormedWord = ps.stem(formedWord);
+ String stemmedFormedWord = ps.stem(formedWord).toString();
if (!stemmedFormedWord.startsWith("invalid"))
wordRes.add(formedWord);
}
@@ -104,7 +103,6 @@ public class DomainTaxonomyExtender {
public void extendTaxonomy(String fileNameWithTaxonomyRoot, String domain, String lang) {
-
List<String> entries = new ArrayList<>((adapter.lemma_AssocWords.keySet()));
try {
for (String entity : entries) { // .
@@ -166,8 +164,7 @@ public class DomainTaxonomyExtender {
return genResult;
}
- public List<String> runSearchForTaxonomyPathFlatten(String query,
- String domain, String lang, int numbOfHits) {
+ public List<String> runSearchForTaxonomyPathFlatten(String query, String domain, String lang, int numbOfHits) {
List<String> genResult = new ArrayList<>();
try {
List<HitBase> resultList = brunner.runSearch(query, numbOfHits);
@@ -213,8 +210,7 @@ public class DomainTaxonomyExtender {
public static void main(String[] args) {
DomainTaxonomyExtender self = new DomainTaxonomyExtender();
- self.extendTaxonomy("", "music",
- "en");
+ self.extendTaxonomy("", "music", "en");
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
index e780330..cbd1229 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
@@ -25,7 +25,8 @@ import java.util.Map;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
@@ -44,7 +45,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>();
private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>();
- private final PStemmer ps;
+ private final Stemmer ps = new PorterStemmer();
public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
return assocWords_ExtendedAssocWords;
@@ -66,8 +67,6 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
System.err.println("Problem loading synt matcher");
}
- ps = new PStemmer();
-
}
private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
@@ -83,7 +82,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
.startsWith("VB"))) && lemmas.get(w).length() > 2) {
String formedWord = lemmas.get(w);
- String stemmedFormedWord = ps.stem(formedWord);
+ String stemmedFormedWord = ps.stem(formedWord).toString();
if (!stemmedFormedWord.startsWith("invalid"))
wordRes.add(formedWord);
}
@@ -114,8 +113,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
// here
query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
.replace('_', ' ');
- List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
- query, "", lang, 30);
+ List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
List<String> toRemoveFromExtension = new ArrayList<>(taxoPath);
toRemoveFromExtension.add(entity);
toRemoveFromExtension.add(domain);
@@ -135,8 +133,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner {
ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
}
- public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
- String domain, String lang, int numbOfHits) {
+ public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
List<List<ParseTreeChunk>> genResult = new ArrayList<>();
try {
List<HitBase> resultList = runSearch(query, numbOfHits);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
index 8424dab..88a3bac 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
@@ -17,14 +17,14 @@
package opennlp.tools.similarity.apps.utils;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
+
import java.util.ArrayList;
import java.util.List;
-import opennlp.tools.stemmer.PStemmer;
-
public class StringDistanceMeasurer {
- // external tools
- private final PStemmer ps; // stemmer
+ private final Stemmer ps = new PorterStemmer();
private static final int MIN_STRING_LENGTH_FOR_WORD = 4;
@@ -35,11 +35,7 @@ public class StringDistanceMeasurer {
private static final double MIN_SCORE_FOR_LING = 100; // 0.7;
public StringDistanceMeasurer() {
- // first get stemmer
- ps = new PStemmer();
- if (MIN_SCORE_FOR_LING > 1.0)
- return;
-
+
}
// gets string array and process numbers, applies stemming and forms a list
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java
deleted file mode 100644
index 464d064..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.stemmer;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.FileInputStream;
-
-import org.apache.lucene.util.ArrayUtil;
-
-/**
- * Stemmer, implementing the Porter Stemming Algorithm
- * <p>
- * The Stemmer class transforms a word into its root form. The input
- * word can be provided a character at time (by calling add()), or at once
- * by calling one of the various stem(something) methods.
- */
-public class PStemmer {
- private char[] b;
- private int i, /* offset into b */
- j, k, k0;
- private boolean dirty = false;
- private static final int INITIAL_SIZE = 50;
-
- public PStemmer() {
- b = new char[INITIAL_SIZE];
- i = 0;
- }
-
- /**
- * Resets the stemmer, so it can stem another word. If you invoke
- * the stemmer by calling {@link #add(char)} and then {@link #stem()}, you must call
- * {@code reset()} before starting another word.
- */
- public void reset() { i = 0; dirty = false; }
-
- /**
- * Add a character to the word being stemmed. When you are finished
- * adding characters, you can call {@link #stem()} to process the word.
- */
- public void add(char ch) {
- if (b.length <= i) {
- b = ArrayUtil.grow(b, i+1);
- }
- b[i++] = ch;
- }
-
- /**
- * After a word has been stemmed, it can be retrieved by toString(),
- * or a reference to the internal buffer can be retrieved by getResultBuffer
- * and getResultLength (which is generally more efficient.)
- */
- @Override
- public String toString() { return new String(b,0,i); }
-
- /**
- * Returns the length of the word resulting from the stemming process.
- */
- public int getResultLength() { return i; }
-
- /**
- * Returns a reference to a character buffer containing the results of
- * the stemming process. You also need to consult getResultLength()
- * to determine the length of the result.
- */
- public char[] getResultBuffer() { return b; }
-
- /* cons(i) is true <=> b[i] is a consonant. */
-
- private boolean cons(int i) {
- switch (b[i]) {
- case 'a': case 'e': case 'i': case 'o': case 'u':
- return false;
- case 'y':
- return (i==k0) ? true : !cons(i-1);
- default:
- return true;
- }
- }
-
- /* m() measures the number of consonant sequences between k0 and j. if c is
- a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
- presence,
-
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
- */
-
- private int m() {
- int n = 0;
- int i = k0;
- while(true) {
- if (i > j)
- return n;
- if (! cons(i))
- break;
- i++;
- }
- i++;
- while(true) {
- while(true) {
- if (i > j)
- return n;
- if (cons(i))
- break;
- i++;
- }
- i++;
- n++;
- while(true) {
- if (i > j)
- return n;
- if (! cons(i))
- break;
- i++;
- }
- i++;
- }
- }
-
- /* vowelinstem() is true <=> k0,...j contains a vowel */
-
- private boolean vowelinstem() {
- int i;
- for (i = k0; i <= j; i++)
- if (! cons(i))
- return true;
- return false;
- }
-
- /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
-
- private boolean doublec(int j) {
- if (j < k0+1)
- return false;
- if (b[j] != b[j-1])
- return false;
- return cons(j);
- }
-
- /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
- and also if the second c is not w,x or y. this is used when trying to
- restore an e at the end of a short word. e.g.
-
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
-
- */
-
- private boolean cvc(int i) {
- if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
- return false;
- else {
- int ch = b[i];
- if (ch == 'w' || ch == 'x' || ch == 'y') return false;
- }
- return true;
- }
-
- private boolean ends(String s) {
- int l = s.length();
- int o = k-l+1;
- if (o < k0)
- return false;
- for (int i = 0; i < l; i++)
- if (b[o+i] != s.charAt(i))
- return false;
- j = k-l;
- return true;
- }
-
- /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
- k. */
-
- void setto(String s) {
- int l = s.length();
- int o = j+1;
- for (int i = 0; i < l; i++)
- b[o+i] = s.charAt(i);
- k = j+l;
- dirty = true;
- }
-
- /* r(s) is used further down. */
-
- void r(String s) { if (m() > 0) setto(s); }
-
- /* step1() gets rid of plurals and -ed or -ing. e.g.
-
- caresses -> caress
- ponies -> poni
- ties -> ti
- caress -> caress
- cats -> cat
-
- feed -> feed
- agreed -> agree
- disabled -> disable
-
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
-
- meetings -> meet
-
- */
-
- private void step1() {
- if (b[k] == 's') {
- if (ends("sses")) k -= 2;
- else if (ends("ies")) setto("i");
- else if (b[k-1] != 's') k--;
- }
- if (ends("eed")) {
- if (m() > 0)
- k--;
- }
- else if ((ends("ed") || ends("ing")) && vowelinstem()) {
- k = j;
- if (ends("at")) setto("ate");
- else if (ends("bl")) setto("ble");
- else if (ends("iz")) setto("ize");
- else if (doublec(k)) {
- int ch = b[k--];
- if (ch == 'l' || ch == 's' || ch == 'z')
- k++;
- }
- else if (m() == 1 && cvc(k))
- setto("e");
- }
- }
-
- /* step2() turns terminal y to i when there is another vowel in the stem. */
-
- private void step2() {
- if (ends("y") && vowelinstem()) {
- b[k] = 'i';
- dirty = true;
- }
- }
-
- /* step3() maps double suffices to single ones. so -ization ( = -ize plus
- -ation) maps to -ize etc. note that the string before the suffix must give
- m() > 0. */
-
- private void step3() {
- if (k == k0) return; /* For Bug 1 */
- switch (b[k-1]) {
- case 'a':
- if (ends("ational")) { r("ate"); break; }
- if (ends("tional")) { r("tion"); break; }
- break;
- case 'c':
- if (ends("enci")) { r("ence"); break; }
- if (ends("anci")) { r("ance"); break; }
- break;
- case 'e':
- if (ends("izer")) { r("ize"); break; }
- break;
- case 'l':
- if (ends("bli")) { r("ble"); break; }
- if (ends("alli")) { r("al"); break; }
- if (ends("entli")) { r("ent"); break; }
- if (ends("eli")) { r("e"); break; }
- if (ends("ousli")) { r("ous"); break; }
- break;
- case 'o':
- if (ends("ization")) { r("ize"); break; }
- if (ends("ation")) { r("ate"); break; }
- if (ends("ator")) { r("ate"); break; }
- break;
- case 's':
- if (ends("alism")) { r("al"); break; }
- if (ends("iveness")) { r("ive"); break; }
- if (ends("fulness")) { r("ful"); break; }
- if (ends("ousness")) { r("ous"); break; }
- break;
- case 't':
- if (ends("aliti")) { r("al"); break; }
- if (ends("iviti")) { r("ive"); break; }
- if (ends("biliti")) { r("ble"); break; }
- break;
- case 'g':
- if (ends("logi")) { r("log"); break; }
- }
- }
-
- /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
-
- private void step4() {
- switch (b[k]) {
- case 'e':
- if (ends("icate")) { r("ic"); break; }
- if (ends("ative")) { r(""); break; }
- if (ends("alize")) { r("al"); break; }
- break;
- case 'i':
- if (ends("iciti")) { r("ic"); break; }
- break;
- case 'l':
- if (ends("ical")) { r("ic"); break; }
- if (ends("ful")) { r(""); break; }
- break;
- case 's':
- if (ends("ness")) { r(""); break; }
- break;
- }
- }
-
- /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
- private void step5() {
- if (k == k0) return; /* for Bug 1 */
- switch (b[k-1]) {
- case 'a':
- if (ends("al")) break;
- return;
- case 'c':
- if (ends("ance")) break;
- if (ends("ence")) break;
- return;
- case 'e':
- if (ends("er")) break; return;
- case 'i':
- if (ends("ic")) break; return;
- case 'l':
- if (ends("able")) break;
- if (ends("ible")) break; return;
- case 'n':
- if (ends("ant")) break;
- if (ends("ement")) break;
- if (ends("ment")) break;
- /* element etc. not stripped before the m */
- if (ends("ent")) break;
- return;
- case 'o':
- if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
- /* j >= 0 fixes Bug 2 */
- if (ends("ou")) break;
- return;
- /* takes care of -ous */
- case 's':
- if (ends("ism")) break;
- return;
- case 't':
- if (ends("ate")) break;
- if (ends("iti")) break;
- return;
- case 'u':
- if (ends("ous")) break;
- return;
- case 'v':
- if (ends("ive")) break;
- return;
- case 'z':
- if (ends("ize")) break;
- return;
- default:
- return;
- }
- if (m() > 1)
- k = j;
- }
-
- /* step6() removes a final -e if m() > 1. */
-
- private void step6() {
- j = k;
- if (b[k] == 'e') {
- int a = m();
- if (a > 1 || a == 1 && !cvc(k-1))
- k--;
- }
- if (b[k] == 'l' && doublec(k) && m() > 1)
- k--;
- }
-
-
- /**
- * Stem a word provided as a String. Returns the result as a String.
- */
- public String stem(String s) {
- if (stem(s.toCharArray(), s.length()))
- return toString();
- else
- return s;
- }
-
- /** Stem a word contained in a char[]. Returns true if the stemming process
- * resulted in a word different from the input. You can retrieve the
- * result with getResultLength()/getResultBuffer() or toString().
- */
- public boolean stem(char[] word) {
- return stem(word, word.length);
- }
-
- /** Stem a word contained in a portion of a char[] array. Returns
- * true if the stemming process resulted in a word different from
- * the input. You can retrieve the result with
- * getResultLength()/getResultBuffer() or toString().
- */
- public boolean stem(char[] wordBuffer, int offset, int wordLen) {
- reset();
- if (b.length < wordLen) {
- b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)];
- }
- System.arraycopy(wordBuffer, offset, b, 0, wordLen);
- i = wordLen;
- return stem(0);
- }
-
- /** Stem a word contained in a leading portion of a char[] array.
- * Returns true if the stemming process resulted in a word different
- * from the input. You can retrieve the result with
- * getResultLength()/getResultBuffer() or toString().
- */
- public boolean stem(char[] word, int wordLen) {
- return stem(word, 0, wordLen);
- }
-
- /** Stem the word placed into the Stemmer buffer through calls to add().
- * Returns true if the stemming process resulted in a word different
- * from the input. You can retrieve the result with
- * getResultLength()/getResultBuffer() or toString().
- */
- public boolean stem() {
- return stem(0);
- }
-
- public boolean stem(int i0) {
- k = i - 1;
- k0 = i0;
- if (k > k0+1) {
- step1(); step2(); step3(); step4(); step5(); step6();
- }
- // Also, a word is considered dirty if we lopped off letters
- // Thanks to Ifigenia Vairelles for pointing this out.
- if (i != k+1)
- dirty = true;
- i = k+1;
- return dirty;
- }
-
- /** Test program for demonstrating the Stemmer. It reads a file and
- * stems each word, writing the result to standard out.
- * Usage: Stemmer file-name
- */
- public static void main(String[] args) {
- PStemmer s = new PStemmer();
-
- for (String arg : args) {
- try (InputStream in = new FileInputStream(arg)) {
- byte[] buffer = new byte[1024];
- int bufferLen, offset, ch;
-
- bufferLen = in.read(buffer);
- offset = 0;
- s.reset();
-
- while (true) {
- if (offset < bufferLen)
- ch = buffer[offset++];
- else {
- bufferLen = in.read(buffer);
- offset = 0;
- if (bufferLen < 0)
- ch = -1;
- else
- ch = buffer[offset++];
- }
-
- if (Character.isLetter((char) ch)) {
- s.add(Character.toLowerCase((char) ch));
- } else {
- s.stem();
- System.out.print(s);
- s.reset();
- if (ch < 0)
- break;
- else {
- System.out.print((char) ch);
- }
- }
- }
- } catch (IOException e) {
- System.out.println("error reading " + arg);
- }
- }
- }
-}
-
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
index a72583e..c40767f 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
@@ -17,14 +17,13 @@
package opennlp.tools.textsimilarity;
-import java.util.List;
+import opennlp.tools.stemmer.Stemmer;
-import opennlp.tools.stemmer.PStemmer;
+import java.util.List;
public class LemmaFormManager {
- public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
- String POS) {
+ public String matchLemmas(Stemmer ps, String lemma1, String lemma2, String POS) {
if (POS == null) {
return null;
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
index 0f2fd20..9a78670 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
@@ -17,11 +17,12 @@
package opennlp.tools.textsimilarity;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
+
import java.util.ArrayList;
import java.util.List;
-import opennlp.tools.stemmer.PStemmer;
-
public class ParseTreeMatcherDeterministic {
private final GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
@@ -30,6 +31,8 @@ public class ParseTreeMatcherDeterministic {
private final POSManager posManager = new POSManager();
+ private final Stemmer ps = new PorterStemmer();
+
/**
* key matching function which takes two phrases, aligns them and finds a set
* of maximum common sub-phrase
@@ -38,7 +41,6 @@ public class ParseTreeMatcherDeterministic {
* @param chunk2
* @return
*/
-
public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic(
ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
List<String> pos1 = chunk1.getPOSs();
@@ -49,10 +51,9 @@ public class ParseTreeMatcherDeterministic {
List<String> lem1stem = new ArrayList<>();
List<String> lem2stem = new ArrayList<>();
- PStemmer ps = new PStemmer();
for (String word : lem1) {
try {
- lem1stem.add(ps.stem(word.toLowerCase()));
+ lem1stem.add(ps.stem(word.toLowerCase()).toString());
} catch (Exception e) {
// e.printStackTrace();
@@ -62,7 +63,7 @@ public class ParseTreeMatcherDeterministic {
}
try {
for (String word : lem2) {
- lem2stem.add(ps.stem(word.toLowerCase()));
+ lem2stem.add(ps.stem(word.toLowerCase()).toString());
}
} catch (Exception e) {
System.err.println("problem processing word " + lem2.toString());
@@ -130,8 +131,7 @@ public class ParseTreeMatcherDeterministic {
String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
lem2.get(k2), sim);
if ((sim != null)
- && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
- .equals("fail")))) {
+ && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch.equals("fail")))) {
commonPOS.add(pos1.get(k1));
if (lemmaMatch != null) {
commonLemmas.add(lemmaMatch);
@@ -198,8 +198,7 @@ public class ParseTreeMatcherDeterministic {
bReachedCommonWord = false;
}
}
- ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS,
- 0, 0);
+ ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
results.add(currResult);
}
@@ -267,7 +266,6 @@ public class ParseTreeMatcherDeterministic {
resultComps = generalizationListReducer.applyFilteringBySubsumption(resultComps);
results.add(resultComps);
}
-
return results;
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
index 55cbf04..cae4565 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
@@ -34,8 +34,9 @@ import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
-import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.Stemmer;
public class TextProcessor {
@@ -480,7 +481,7 @@ public class TextProcessor {
}
}
- return new PStemmer().stem(token).toString();
+ return new PorterStemmer().stem(token);
}
public static String cleanToken(String token) {
@@ -525,8 +526,7 @@ public class TextProcessor {
public static String stemTerm(String term) {
term = stripToken(term);
- PStemmer st = new PStemmer();
-
+ Stemmer st = new PorterStemmer();
return st.stem(term).toString();
}
@@ -639,9 +639,7 @@ public class TextProcessor {
StringBuilder finalSummary;
try {
-
- String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST",
- "GMT", "AM", " " };
+ String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST", "GMT", "AM", " " };
txt = txt.replace(" | ", " ");
txt = txt.replace(" |", " ");
@@ -808,8 +806,7 @@ public class TextProcessor {
// scrub the title
if (title.trim().length() > 0 && txt.contains(title.trim())) {
- txt = txt
- .substring(txt.indexOf(title.trim()) + title.trim().length() - 1);
+ txt = txt.substring(txt.indexOf(title.trim()) + title.trim().length() - 1);
}
// scrub before first -
@@ -863,7 +860,7 @@ public class TextProcessor {
public static List<String> extractUrlsFromText(String txt) {
List<String> urls = new ArrayList<>();
// tokenize and iterate
- String[] tokens = txt.split(" ");
+ String[] tokens = txt.split("\\s+");
for (String t : tokens) {
if (t.startsWith("http://")) {
if (!urls.contains(t)) {
@@ -881,13 +878,12 @@ public class TextProcessor {
if (segments.size() > 1) {
List<String> allTokens = new ArrayList<>();
for (String s : segments) {
- String[] tks = s.split(" ");
+ String[] tks = s.split("\\s+");
List<String> tokens = Arrays.asList(tks);
HashMap<String, Integer> ut = TextProcessor.getUniqueTokenIndex(tokens);
allTokens.addAll(ut.keySet());
}
- HashMap<String, Integer> uniqueTokens = TextProcessor
- .getUniqueTokenIndex(allTokens);
+ Map<String, Integer> uniqueTokens = TextProcessor.getUniqueTokenIndex(allTokens);
for (String t : uniqueTokens.keySet()) {
Integer freq = uniqueTokens.get(t);
if (freq == segments.size()) {
@@ -895,14 +891,13 @@ public class TextProcessor {
}
}
}
-
return commonTokens;
}
public static int numTokensInString(String txt) {
int retVal = 0;
if (txt != null && txt.trim().length() > 0) {
- retVal = txt.trim().split(" ").length;
+ retVal = txt.trim().split("\\s+").length;
}
return retVal;
}
diff --git a/opennlp-similarity/src/test/resources/sentence_parseObject.csv b/opennlp-similarity/src/test/resources/sentence_parseObject.csv
index 6d2d5dd..c11ec1d 100644
--- a/opennlp-similarity/src/test/resources/sentence_parseObject.csv
+++ b/opennlp-similarity/src/test/resources/sentence_parseObject.csv
@@ -1337,4 +1337,4 @@
"My rental profits are added to my taxable income. "
"B-NP","I-NP","I-NP","B-VP","I-VP","B-PP","B-NP","I-NP","I-NP"
"PRP$","JJ","NNS","VBP","VBN","TO","PRP$","JJ","NN"
-"My","rental","profits","are","added","to","my","taxable","income"
\ No newline at end of file
+"My","rental","profits","are","added","to","my","taxable","income"