You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/04/16 15:05:21 UTC

[opennlp-sandbox] branch remove_copy_of_porterstemmer_from_summarizer_component created (now eb14b33)

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a change to branch remove_copy_of_porterstemmer_from_summarizer_component
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


      at eb14b33  removes copy of PorterStemmer from summarizer component, now relying on OpenNLP tools' default `PorterStemmer` (DRY); improves `DefaultDocProcessor` to better re-use the stemmer instance and make use of pre-compiled Pattern; improves formatting along the path

This branch includes the following new commits:

     new eb14b33  removes copy of PorterStemmer from summarizer component, now relying on OpenNLP tools' default `PorterStemmer` (DRY); improves `DefaultDocProcessor` to better re-use the stemmer instance and make use of pre-compiled Pattern; improves formatting along the path

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.

[opennlp-sandbox] 01/01: removes copy of PorterStemmer from summarizer component, now relying on OpenNLP tools' default `PorterStemmer` (DRY); improves `DefaultDocProcessor` to better re-use the stemmer instance and make use of pre-compiled Pattern; improves formatting along the path

Posted by ma...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch remove_copy_of_porterstemmer_from_summarizer_component
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git

commit eb14b3379a3af101b965a9b6dccaef1093ecea26
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sun Apr 16 17:05:14 2023 +0200

    removes copy of PorterStemmer from summarizer component now relying on OpenNLP tools' default `PorterStemmer` (DRY)
    improves `DefaultDocProcessor` to better re-use the stemmer instance and make use of pre-compiled Pattern
    improves formatting along the path
---
 .../main/java/opennlp/summarization/Sentence.java  |  11 +-
 .../opennlp/summarization/meta/MetaSummarizer.java |  14 +-
 .../preprocess/DefaultDocProcessor.java            |  23 +-
 .../summarization/preprocess/PorterStemmer.java    | 388 ---------------------
 4 files changed, 24 insertions(+), 412 deletions(-)

diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index fad8cf1..07079b2 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -23,8 +23,8 @@ import java.util.ArrayList;
 import java.util.Locale;
 import java.util.Objects;
 
-import opennlp.summarization.preprocess.PorterStemmer;
 import opennlp.summarization.preprocess.StopWords;
+import opennlp.tools.stemmer.PorterStemmer;
 
 /**
  * A representation of a sentence geared toward pagerank and summarization.
@@ -159,16 +159,15 @@ public class Sentence {
 		StringBuilder b = new StringBuilder();
 		wrdItr.setText(stringVal);	
 		for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE; 
-				wrdStrt = wrdEnd, wrdEnd = wrdItr.next())
-		{
+				wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) {
 			String word = this.getStringVal().substring(wrdStrt, wrdEnd);//words[i].trim();
 			word = word.replace("\"|'","");
 
-			//Skip stop words and stem the word.
+			// Skip stop words and stem the word.
 			if(sw.isStopWord(word)) continue;
-			
+
 			stemmer.stem(word);
-			b.append(stemmer);
+			b.append(stemmer.toString());
 			b.append(SPACE);
 		}
 		return b.toString();
diff --git a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
index e6eca05..d4743ce 100644
--- a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
+++ b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
@@ -33,7 +33,8 @@ import opennlp.summarization.preprocess.DefaultDocProcessor;
 import opennlp.summarization.textrank.TextRankSummarizer;
 
 import opennlp.summarization.DocProcessor;
-/*
+
+/**
  * A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm.
  * It runs both algorithm and uses the lexical chains to identify the main topics and relative importance
  * and the text rank to pick sentences from lexical chains.
@@ -60,9 +61,8 @@ public class MetaSummarizer {
   // Rank sentences by merging the scores from lexical chaining and text rank.
   // maxWords -1 indicates rank all sentences.
   public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores) {
-    double bestScore = 0; int bestStr=-1;
-    for(Sentence s : l.getSentences())
-    {
+    double bestScore = 0; int bestStr = -1;
+    for(Sentence s : l.getSentences()) {
       Score sc = pageRankScores.get(s.getSentId());
       if(sc!=null && sc.getScore() > bestScore)
       {
@@ -115,12 +115,10 @@ public class MetaSummarizer {
   }
 
   //Default Summarization using only lexical chains..
-  public String summarize(String article, int maxWords)
-  {
+  public String summarize(String article, int maxWords) {
     //Build lexical Chains..
     List<Sentence> sent = dp.getSentencesFromStr(article);
-
-    List<Score>finalSc = rankSentences(article, sent, maxWords);
+    List<Score> finalSc = rankSentences(article, sent, maxWords);
 
     StringBuilder sb = new StringBuilder();
     for (Score score : finalSc) {
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index c54f76e..f4e1a0e 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -29,11 +29,13 @@ import java.util.Locale;
 import java.util.Hashtable;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Pattern;
 
 import opennlp.summarization.Sentence;
 import opennlp.summarization.DocProcessor;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.stemmer.PorterStemmer;
 import opennlp.tools.stemmer.Stemmer;
 
 /**
@@ -41,15 +43,19 @@ import opennlp.tools.stemmer.Stemmer;
  */
 public class DefaultDocProcessor implements DocProcessor {
   private SentenceModel sentModel;
-  private Stemmer stemmer;
-  private StopWords sw;
+  private final Stemmer stemmer;
+
+  private final static Pattern REPLACEMENT_PATTERN =
+          Pattern.compile("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;");
 
   // Sentence fragmentation to use..
   private static final int OPEN_NLP = 1;
   private static final int SIMPLE = 2;
-  private static final int SENTENCE_FRAG= OPEN_NLP;
+  private static final int SENTENCE_FRAG = OPEN_NLP;
 
   public DefaultDocProcessor(InputStream fragModelFile) {
+    stemmer = new PorterStemmer();
+
     try (InputStream modelIn = new BufferedInputStream(fragModelFile)){
       sentModel = new SentenceModel(modelIn);
     } catch(Exception ex){
@@ -65,7 +71,6 @@ public class DefaultDocProcessor implements DocProcessor {
                             Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
     int oldSentEndIdx = 0;
     int sentEndIdx = 0;
-    Stemmer stemmer = new PorterStemmer();
     StopWords sw = StopWords.getInstance();
     BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
     BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
@@ -73,8 +78,7 @@ public class DefaultDocProcessor implements DocProcessor {
     int start = iterator.first();
     int sentCnt = 0;
 
-    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next())
-    {
+    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
       String sentence = str.substring(start,end);//str.substring(oldSentEndIdx, sentEndIdx).trim();
 
       //Add the sentence as-is; do any processing at the word level
@@ -100,7 +104,7 @@ public class DefaultDocProcessor implements DocProcessor {
         {
           if(stemedWrd.length()>1)
           {
-            List<Integer> sentList= iidx.get(stemedWrd);
+            List<Integer> sentList = iidx.get(stemedWrd);
             if(sentList==null)
             {
               sentList = new ArrayList<>();
@@ -130,7 +134,7 @@ public class DefaultDocProcessor implements DocProcessor {
       while ((nextLine = lnr.readLine()) != null) {
         String trimmedLine = nextLine.trim();
         if (!trimmedLine.isEmpty() ) {
-          docBuffer.append(trimmedLine.replaceAll("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;", "")).append(" ");
+          docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" ");
         }
       }
     } catch (Exception ex) {
@@ -154,8 +158,7 @@ public class DefaultDocProcessor implements DocProcessor {
           List<String> cleanedSents = new ArrayList<>();
           this.getSentences(trimmedLine, sents, null, cleanedSents);
           int paraPos = 1;
-          for(String sen:sents)
-          {
+          for(String sen:sents) {
             Sentence s = new Sentence();
             s.setSentId(sentNo++);
             s.setParagraph(paraNo);
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java b/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
deleted file mode 100755
index 3b787f6..0000000
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- 	* Licensed to the Apache Software Foundation (ASF) under one or more
- 	* contributor license agreements. See the NOTICE file distributed with
- 	* this work for additional information regarding copyright ownership.
- 	* The ASF licenses this file to You under the Apache License, Version 2.0
- 	* (the "License"); you may not use this file except in compliance with
- 	* the License. You may obtain a copy of the License at
- 	*
- 	* http://www.apache.org/licenses/LICENSE-2.0
- 	*
- 	* Unless required by applicable law or agreed to in writing, software
- 	* distributed under the License is distributed on an "AS IS" BASIS,
- 	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- 	* See the License for the specific language governing permissions and
- 	* limitations under the License.
-*/
-
-package opennlp.summarization.preprocess;
-
-/*
-
-   Porter stemmer in Java. The original paper is in
-
-       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
-       no. 3, pp 130-137,
-
-   See also http://www.tartarus.org/~martin/PorterStemmer
-
-   History:
-
-   Release 1
-
-   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
-   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
-   is then out outside the bounds of b.
-
-   Release 2
-
-   Similarly,
-
-   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
-   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
-   b[j] is then outside the bounds of b.
-
-   Release 3
-
-   Considerably revised 4/9/00 in the light of many helpful suggestions
-   from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
-
-   Release 4
-
-*/
-
-import opennlp.tools.stemmer.Stemmer;
-
-/**
-  * Stemmer, implementing the Porter Stemming Algorithm
-  * <p>
-  * The Stemmer class transforms a word into its root form.  The input
-  * word can be provided a character at time (by calling add()), or at once
-  * by calling one of the various stem(something) methods.
-  */
-public class PorterStemmer implements Stemmer {
-
-  private char[] b;
-  private int i,     /* offset into b */
-               i_end, /* offset to end of stemmed word */
-               j, k;
-  private static final int INC = 50;
-                     /* unit of size whereby b is increased */
-  public PorterStemmer() {
-    b = new char[INC];
-      i = 0;
-      i_end = 0;
-  }
-
-  /**
-  * Add a character to the word being stemmed.  When you are finished
-  * adding characters, you can call stem(void) to stem the word.
-  */
-
-  public void add(char ch) {
-    if (i == b.length)
-    {  char[] new_b = new char[i+INC];
-       for (int c = 0; c < i; c++) new_b[c] = b[c];
-       b = new_b;
-    }
-    b[i++] = ch;
-  }
-
-
-  /** Adds wLen characters to the word being stemmed contained in a portion
-  * of a char[] array. This is like repeated calls of add(char ch), but
-  * faster.
-  */
-
-  public void add(char[] w, int wLen) {
-    if (i+wLen >= b.length)
-    {  char[] new_b = new char[i+wLen+INC];
-       for (int c = 0; c < i; c++) new_b[c] = b[c];
-       b = new_b;
-    }
-    for (int c = 0; c < wLen; c++) b[i++] = w[c];
-  }
-
-  /**
-  * After a word has been stemmed, it can be retrieved by toString(),
-  * or a reference to the internal buffer can be retrieved by getResultBuffer
-  * and getResultLength (which is generally more efficient.)
-  */
-  @Override
-  public String toString() { return new String(b,0,i_end); }
-
-  /**
-  * Returns the length of the word resulting from the stemming process.
-  */
-  public int getResultLength() { return i_end; }
-
-  /**
-  * Returns a reference to a character buffer containing the results of
-  * the stemming process.  You also need to consult getResultLength()
-  * to determine the length of the result.
-  */
-  public char[] getResultBuffer() { return b; }
-
-  /* cons(i) is true <=> b[i] is a consonant. */
-
-  private boolean cons(int i) {
-    switch (b[i])
-    {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;
-       case 'y': return (i==0) ? true : !cons(i-1);
-       default: return true;
-    }
-  }
-
-  /* m() measures the number of consonant sequences between 0 and j. if c is
-    a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
-    presence,
-
-       <c><v>       gives 0
-       <c>vc<v>     gives 1
-       <c>vcvc<v>   gives 2
-       <c>vcvcvc<v> gives 3
-       ....
-  */
-
-  private int m() {
-    int n = 0;
-    int i = 0;
-    while(true)
-    {  if (i > j) return n;
-       if (! cons(i)) break; i++;
-    }
-    i++;
-    while(true)
-    {  while(true)
-       {  if (i > j) return n;
-             if (cons(i)) break;
-             i++;
-       }
-       i++;
-       n++;
-       while(true)
-       {  if (i > j) return n;
-          if (! cons(i)) break;
-          i++;
-       }
-       i++;
-     }
-  }
-
-  /* vowelinstem() is true <=> 0,...j contains a vowel */
-
-  private boolean vowelinstem() {
-    int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
-    return false;
-  }
-
-  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
-
-  private boolean doublec(int j) {
-    if (j < 1) return false;
-    if (b[j] != b[j-1]) return false;
-    return cons(j);
-  }
-
-  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
-    and also if the second c is not w,x or y. this is used when trying to
-    restore an e at the end of a short word. e.g.
-
-       cav(e), lov(e), hop(e), crim(e), but
-       snow, box, tray.
-
-  */
-
-  private boolean cvc(int i) {
-    if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
-    {  int ch = b[i];
-       if (ch == 'w' || ch == 'x' || ch == 'y') return false;
-    }
-    return true;
-  }
-
-  private boolean ends(String s) {
-    int l = s.length();
-    int o = k-l+1;
-    if (o < 0) return false;
-    for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
-    j = k-l;
-    return true;
-  }
-
-  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
-    k. */
-
-  private void setto(String s) {
-    int l = s.length();
-    int o = j+1;
-    for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
-    k = j+l;
-  }
-
-  /* r(s) is used further down. */
-
-  private void r(String s) { if (m() > 0) setto(s); }
-
-  /* step1() gets rid of plurals and -ed or -ing. e.g.
-
-        caresses  ->  caress
-        ponies    ->  poni
-        ties      ->  ti
-        caress    ->  caress
-        cats      ->  cat
-
-        feed      ->  feed
-        agreed    ->  agree
-        disabled  ->  disable
-
-        matting   ->  mat
-        mating    ->  mate
-        meeting   ->  meet
-        milling   ->  mill
-        messing   ->  mess
-
-        meetings  ->  meet
-
-  */
-
-  private void step1() {
-    if (b[k] == 's')
-    {
-       if (ends("sses")) k -= 2; else
-       if (ends("ies")) setto("i"); else
-       if (b[k-1] != 's') k--;
-    }
-    if (ends("eed")) { if (m() > 0) k--; } else
-    if ((ends("ed") || ends("ing")) && vowelinstem())
-    {  k = j;
-       if (ends("at")) setto("ate"); else
-       if (ends("bl")) setto("ble"); else
-       if (ends("iz")) setto("ize"); else
-       if (doublec(k))
-       {  k--;
-          {  int ch = b[k];
-             if (ch == 'l' || ch == 's' || ch == 'z') k++;
-          }
-       }
-       else if (m() == 1 && cvc(k)) setto("e");
-   }
-  }
-
-  /* step2() turns terminal y to i when there is another vowel in the stem. */
-
-  private void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
-
-  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
-    -ation) maps to -ize etc. note that the string before the suffix must give
-    m() > 0. */
-
-  private void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) {
-     case 'a': if (ends("ational")) { r("ate"); break; }
-               if (ends("tional")) { r("tion"); break; }
-               break;
-     case 'c': if (ends("enci")) { r("ence"); break; }
-               if (ends("anci")) { r("ance"); break; }
-               break;
-     case 'e': if (ends("izer")) { r("ize"); break; }
-               break;
-     case 'l': if (ends("bli")) { r("ble"); break; }
-               if (ends("alli")) { r("al"); break; }
-               if (ends("entli")) { r("ent"); break; }
-               if (ends("eli")) { r("e"); break; }
-               if (ends("ousli")) { r("ous"); break; }
-               break;
-     case 'o': if (ends("ization")) { r("ize"); break; }
-               if (ends("ation")) { r("ate"); break; }
-               if (ends("ator")) { r("ate"); break; }
-               break;
-     case 's': if (ends("alism")) { r("al"); break; }
-               if (ends("iveness")) { r("ive"); break; }
-               if (ends("fulness")) { r("ful"); break; }
-               if (ends("ousness")) { r("ous"); break; }
-               break;
-     case 't': if (ends("aliti")) { r("al"); break; }
-               if (ends("iviti")) { r("ive"); break; }
-               if (ends("biliti")) { r("ble"); break; }
-               break;
-     case 'g': if (ends("logi")) { r("log"); break; }
-  } }
-
-  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
-
-  private void step4() { switch (b[k]) {
-     case 'e': if (ends("icate")) { r("ic"); break; }
-               if (ends("ative")) { r(""); break; }
-               if (ends("alize")) { r("al"); break; }
-               break;
-     case 'i': if (ends("iciti")) { r("ic"); break; }
-               break;
-     case 'l': if (ends("ical")) { r("ic"); break; }
-               if (ends("ful")) { r(""); break; }
-               break;
-     case 's': if (ends("ness")) { r(""); break; }
-               break;
-  } }
-
-  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
-  private void step5() {
-    if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
-     {  case 'a': if (ends("al")) break; return;
-        case 'c': if (ends("ance")) break;
-                  if (ends("ence")) break; return;
-        case 'e': if (ends("er")) break; return;
-        case 'i': if (ends("ic")) break; return;
-        case 'l': if (ends("able")) break;
-                  if (ends("ible")) break; return;
-        case 'n': if (ends("ant")) break;
-                  if (ends("ement")) break;
-                  if (ends("ment")) break;
-                  /* element etc. not stripped before the m */
-                  if (ends("ent")) break; return;
-        case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
-                                  /* j >= 0 fixes Bug 2 */
-                  if (ends("ou")) break; return;
-                  /* takes care of -ous */
-        case 's': if (ends("ism")) break; return;
-        case 't': if (ends("ate")) break;
-                  if (ends("iti")) break; return;
-        case 'u': if (ends("ous")) break; return;
-        case 'v': if (ends("ive")) break; return;
-        case 'z': if (ends("ize")) break; return;
-        default: return;
-     }
-     if (m() > 1) k = j;
-  }
-
-  /* step6() removes a final -e if m() > 1. */
-
-  private void step6() {
-    j = k;
-    if (b[k] == 'e')
-    {  int a = m();
-       if (a > 1 || a == 1 && !cvc(k-1)) k--;
-    }
-    if (b[k] == 'l' && doublec(k) && m() > 1) k--;
-  }
-
-  /** Stem the word placed into the Stemmer buffer through calls to add().
-  * Returns true if the stemming process resulted in a word different
-  * from the input.  You can retrieve the result with
-  * getResultLength()/getResultBuffer() or toString().
-  */
-  public void stem() {
-    k = i - 1;
-    if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
-    i_end = k+1; i = 0;
-  }
-
-  public CharSequence stem(CharSequence word) {
-     b = new char[word.length()];
-     char[] arr = word.toString().toCharArray();
-     for(k=0;k<arr.length;k++) this.add(arr[k]);
-     stem();
-     return this.toString();
-  }
-}
-