Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/04/17 08:28:16 UTC

[opennlp-sandbox] branch enhance_split_for_space_operations_to_cover_more_cases created (now 54f3ae1)

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a change to branch enhance_split_for_space_operations_to_cover_more_cases
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


      at 54f3ae1  enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" "); improves JavaDoc along the path; improves formatting along the path

This branch includes the following new commits:

     new 54f3ae1  enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" "); improves JavaDoc along the path; improves formatting along the path

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[opennlp-sandbox] 01/01: enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" "); improves JavaDoc along the path; improves formatting along the path

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch enhance_split_for_space_operations_to_cover_more_cases
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git

commit 54f3ae1a54d6dfd7f9232b20583bd5787fa367b6
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Mon Apr 17 10:28:09 2023 +0200

    enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" ").
    improves JavaDoc along the path
    improves formatting along the path
---
 .../src/main/java/opennlp/summarization/Score.java |  6 +--
 .../main/java/opennlp/summarization/Sentence.java  |  4 +-
 .../WordRelationshipDetermination.java             | 44 ++++++++++------------
 .../preprocess/DefaultDocProcessor.java            |  2 +-
 .../opennlp/summarization/textrank/TextRank.java   | 12 +++---
 5 files changed, 31 insertions(+), 37 deletions(-)
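
For context, the practical difference between the two split patterns can be seen in a minimal, self-contained sketch; the class name and sample sentence below are illustrative only and not part of this patch:

    public class SplitDemo {
      public static void main(String[] args) {
        String sent = "The  quick\tbrown   fox"; // consecutive spaces plus a tab

        // Old behaviour: splitting on a single literal space keeps empty tokens
        // for consecutive spaces and does not split on tabs at all.
        String[] bySpace = sent.split(" ");
        System.out.println(bySpace.length); // 6 -> [The, , quick\tbrown, , , fox]

        // New behaviour: "\\s+" treats any run of whitespace as one delimiter;
        // trim() avoids a leading empty token when the input starts with whitespace.
        String[] byWhitespace = sent.trim().split("\\s+");
        System.out.println(byWhitespace.length); // 4 -> [The, quick, brown, fox]
      }
    }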

diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java
index 2fc2977..eeda3e7 100755
--- a/summarizer/src/main/java/opennlp/summarization/Score.java
+++ b/summarizer/src/main/java/opennlp/summarization/Score.java
@@ -18,11 +18,11 @@
 package opennlp.summarization;
 
 /**
- * A utility class to store the score of a sentence for ranking sentences within a document.
+ * Stores the score of a sentence for ranking sentences within a document.
  */
 public class Score implements Comparable<Score> {
-  int sentId;
-  public double score;
+  private int sentId;
+  private double score;
 
   public Score()
   {
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index 07079b2..a0a96c8 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -93,7 +93,7 @@ public class Sentence {
 	private int calcWrdCnt(String stringVal2) {
 		int ret = 0;
 		StopWords sw = StopWords.getInstance();
-		String[] wrds = stringVal.split(" ");
+		String[] wrds = stringVal.split("\\s+");
 		for(String wrd: wrds){
 			if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?"))
 				ret++;
@@ -131,7 +131,7 @@ public class Sentence {
 	
 	public int getWordCnt()
 	{
-		return wordCnt==0? this.getStringVal().split(" ").length: wordCnt;
+		return wordCnt==0? this.getStringVal().split("\\s+").length: wordCnt;
 	}
 
 	// Should add an article id to the sentence class. For now returns true if the ids are the same.
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
index 524b420..eb960d0 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -35,11 +35,11 @@ import edu.mit.jwi.RAMDictionary;
 
 /**
  * Uses wordnet to determine the relation of two words.
- * Words have -
+ * Words have:
  * <ul>
- * <li>strong relationship: same word</li>
+ * <li>Strong relationship: same word</li>
  * <li>Med relationship: synonym, hyponym</li>
- * <li>weak relationship: antonym, hypernym</li>
+ * <li>Weak relationship: antonym, hypernym</li>
  * <li>No relationship: otherwise</li>
  * </ul>
  */
@@ -54,7 +54,7 @@ public class WordRelationshipDetermination {
   private final Hashtable<ISynsetID, ISynset> cache = new Hashtable<>();
   private final Hashtable<ISynset, List<IWord>> synsetWordCache = new Hashtable<>();
 
-  public WordRelationshipDetermination() throws Exception {
+  public WordRelationshipDetermination() {
     dictionary = new RAMDictionary(WordRelationshipDetermination.class.getResource(DICTIONARY_FILE), ILoadPolicy.IMMEDIATE_LOAD);
     ((RAMDictionary)dictionary).load();
     openDict();
@@ -76,9 +76,9 @@ public class WordRelationshipDetermination {
 
       //Get the synset in which word is present.
       ISynset wordSynset;
-      if(ww.synonyms!=null)
+      if (ww.synonyms!=null)
         wordSynset = ww.synonyms;
-      else{
+      else {
         IWord word = dictionary.getWord((IWordID)w.getID());
         wordSynset = word.getSynset();
         ww.synonyms = wordSynset;
@@ -89,11 +89,11 @@ public class WordRelationshipDetermination {
     }
     return ret;
   }
+
   /*
    * Returns true if the word represented by idxNoun is present in a synset.
    */
-  private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun)
-  {
+  private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun) {
     IWord ret = null;
     List<IWord> wrds;
 
@@ -105,12 +105,9 @@ public class WordRelationshipDetermination {
 //		}
 
     //Returns all the words present in the synset wordSynset
-    for(IWord synonym : wrds)
-    {
-      for(IWordID nounID : idxNoun.getWordIDs())
-      {
-        if(synonym.equals(dictionary.getWord(nounID)))
-        {
+    for(IWord synonym : wrds) {
+      for(IWordID nounID : idxNoun.getWordIDs()) {
+        if(synonym.equals(dictionary.getWord(nounID))) {
           ret = synonym;
           break;
         }
@@ -140,16 +137,15 @@ public class WordRelationshipDetermination {
     ISynset wordSynset = word.getSynset();
 
     for(Pointer p : rels) {
-
       List<ISynsetID> rels;
-      if(ww.rels.get(p)!=null)
+      if (ww.rels.get(p)!=null)
         rels = ww.rels.get(p);
       else {
         rels = wordSynset.getRelatedSynsets(p);
         ww.rels.put(p, rels);
       }
 
-      for(ISynsetID id: rels) {
+      for (ISynsetID id: rels) {
         ISynset s = this.dictionary.getSynset(id);
         IWord mat = inSynset(s, idxNoun);
         if(mat!=null)
@@ -174,7 +170,7 @@ public class WordRelationshipDetermination {
   public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) {
     WordRelation ret = new WordRelation();
     ret.relation = WordRelation.NO_RELATION;
-    for(Word w : l.word) {
+    for (Word w : l.word) {
       //Exact match is a string relation.
       if(w.getLexicon().equalsIgnoreCase(noun)) {
         ret.relation = WordRelation.STRONG_RELATION;
@@ -185,8 +181,7 @@ public class WordRelationshipDetermination {
       //  else it is a Wordnet word and is it a synonym or hyponym of LCs (medium relation)
       else if(w.getID()!=null && checkMed){
         Word wrel = isMediumRel(noun, w) ;
-        if(wrel!=null)
-        {
+        if(wrel!=null) {
           ret.relation = WordRelation.MED_RELATION;
           ret.src = w;
           ret.dest = wrel;
@@ -205,20 +200,19 @@ public class WordRelationshipDetermination {
         e.printStackTrace();
       }
   }
+
   public List<Word> getWordSenses(String noun) {
     List<Word> ret = new ArrayList<>();
-    try{
+    try {
       //		openDict();
       List<IWordID> wordIDs = this.dictionary.getIndexWord(noun, POS.NOUN).getWordIDs();
-      for(IWordID wid: wordIDs)
-      {
+      for(IWordID wid: wordIDs) {
         Word w = new WordnetWord();
         w.setLexicon(noun);
         w.setID(wid);
         ret.add(w);
       }
-    }catch(Exception ex){
-      // ex.printStackTrace();
+    } catch(Exception ex){
       //Not in dictionary
       Word w = new WordnetWord();
       w.setLexicon(noun);
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index f4e1a0e..e491aec 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -208,7 +208,7 @@ public class DefaultDocProcessor implements DocProcessor {
   @Override
   public String[] getWords(String sent)
   {
-    return sent.split(" ");
+    return sent.trim().split("\\s+");
   }
 
   @Override
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
index b6072eb..f4b5470 100755
--- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
@@ -75,8 +75,8 @@ public class TextRank {
   // words..
   public double getWeightedSimilarity(String sent1, String sent2,
                                       Hashtable<String, Double> wrdWts) {
-    String[] words1 = sent1.split(" ");
-    String[] words2 = sent2.split(" ");
+    String[] words1 = sent1.trim().split("\\s+");
+    String[] words2 = sent2.trim().split("\\s+");
     double wordsInCommon = 0;
     Hashtable<String, Boolean> dups = new Hashtable<>();
     for (String s : words1) {
@@ -173,8 +173,7 @@ public class TextRank {
 
     for (int i = 0; i < sentences.size(); i++) {
       String nextSent = sentences.get(i);
-      String[] words = nextSent.split(" ");
-      List<Integer> processed = new ArrayList<>();
+      String[] words = nextSent.trim().split("\\s+");
       Score s = new Score();
       s.setSentId(i);
 
@@ -185,6 +184,7 @@ public class TextRank {
         if (otherSents == null)
           continue;
 
+        List<Integer> processed = new ArrayList<>();
         for (int idx : otherSents) {
           if (idx != i && !processed.contains(idx)) {
             double currS = getWeightedSimilarity(sentences.get(i),
@@ -233,7 +233,7 @@ public class TextRank {
 
     if (HIGHER_TITLE_WEIGHT && getSentences().size()>0) {
       String sent = getSentences().get(0);
-      String[] wrds = sent.split(" ");
+      String[] wrds = sent.trim().split("\\s+");
       for (String wrd : wrds)
         wrdWts.put(wrd, TITLE_WRD_WT);
     }
@@ -278,7 +278,7 @@ public class TextRank {
 
 /*
  * public double getScore(String sent1, String sent2, boolean toPrint) {
- * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" ");
+ * String[] words1 = sent1.split("\\s+"); String[] words2 = sent2.split("\\s+");
  * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
  * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
  * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+=