You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by rz...@apache.org on 2023/04/17 12:03:43 UTC
[opennlp-sandbox] branch master updated: enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" "); improves JavaDoc along the path; improves formatting along the path
This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new 27a5176 enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" "); improves JavaDoc along the path; improves formatting along the path
27a5176 is described below
commit 27a517603cab076466b20fa8d754ff4ab870d3bf
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Mon Apr 17 10:28:09 2023 +0200
enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a regular whitespace (" ").
improves JavaDoc along the path
improves formatting along the path
---
.../src/main/java/opennlp/summarization/Score.java | 6 +--
.../main/java/opennlp/summarization/Sentence.java | 4 +-
.../WordRelationshipDetermination.java | 44 ++++++++++------------
.../preprocess/DefaultDocProcessor.java | 2 +-
.../opennlp/summarization/textrank/TextRank.java | 12 +++---
5 files changed, 31 insertions(+), 37 deletions(-)
diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java
index 2fc2977..eeda3e7 100755
--- a/summarizer/src/main/java/opennlp/summarization/Score.java
+++ b/summarizer/src/main/java/opennlp/summarization/Score.java
@@ -18,11 +18,11 @@
package opennlp.summarization;
/**
- * A utility class to store the score of a sentence for ranking sentences within a document.
+ * Stores the score of a sentence for ranking sentences within a document.
*/
public class Score implements Comparable<Score> {
- int sentId;
- public double score;
+ private int sentId;
+ private double score;
public Score()
{
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index 07079b2..a0a96c8 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -93,7 +93,7 @@ public class Sentence {
private int calcWrdCnt(String stringVal2) {
int ret = 0;
StopWords sw = StopWords.getInstance();
- String[] wrds = stringVal.split(" ");
+ String[] wrds = stringVal.split("\\s+");
for(String wrd: wrds){
if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?"))
ret++;
@@ -131,7 +131,7 @@ public class Sentence {
public int getWordCnt()
{
- return wordCnt==0? this.getStringVal().split(" ").length: wordCnt;
+ return wordCnt==0? this.getStringVal().split("\\s+").length: wordCnt;
}
// Should add an article id to the sentence class. For now returns true if the ids are the same.
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
index 524b420..eb960d0 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -35,11 +35,11 @@ import edu.mit.jwi.RAMDictionary;
/**
* Uses wordnet to determine the relation of two words.
- * Words have -
+ * Words have:
* <ul>
- * <li>strong relationship: same word</li>
+ * <li>Strong relationship: same word</li>
* <li>Med relationship: synonym, hyponym</li>
- * <li>weak relationship: antonym, hypernym</li>
+ * <li>Weak relationship: antonym, hypernym</li>
* <li>No relationship: otherwise</li>
* </ul>
*/
@@ -54,7 +54,7 @@ public class WordRelationshipDetermination {
private final Hashtable<ISynsetID, ISynset> cache = new Hashtable<>();
private final Hashtable<ISynset, List<IWord>> synsetWordCache = new Hashtable<>();
- public WordRelationshipDetermination() throws Exception {
+ public WordRelationshipDetermination() {
dictionary = new RAMDictionary(WordRelationshipDetermination.class.getResource(DICTIONARY_FILE), ILoadPolicy.IMMEDIATE_LOAD);
((RAMDictionary)dictionary).load();
openDict();
@@ -76,9 +76,9 @@ public class WordRelationshipDetermination {
//Get the synset in which word is present.
ISynset wordSynset;
- if(ww.synonyms!=null)
+ if (ww.synonyms!=null)
wordSynset = ww.synonyms;
- else{
+ else {
IWord word = dictionary.getWord((IWordID)w.getID());
wordSynset = word.getSynset();
ww.synonyms = wordSynset;
@@ -89,11 +89,11 @@ public class WordRelationshipDetermination {
}
return ret;
}
+
/*
* Returns true if the word represented by idxNoun is present in a synset.
*/
- private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun)
- {
+ private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun) {
IWord ret = null;
List<IWord> wrds;
@@ -105,12 +105,9 @@ public class WordRelationshipDetermination {
// }
//Returns all the words present in the synset wordSynset
- for(IWord synonym : wrds)
- {
- for(IWordID nounID : idxNoun.getWordIDs())
- {
- if(synonym.equals(dictionary.getWord(nounID)))
- {
+ for(IWord synonym : wrds) {
+ for(IWordID nounID : idxNoun.getWordIDs()) {
+ if(synonym.equals(dictionary.getWord(nounID))) {
ret = synonym;
break;
}
@@ -140,16 +137,15 @@ public class WordRelationshipDetermination {
ISynset wordSynset = word.getSynset();
for(Pointer p : rels) {
-
List<ISynsetID> rels;
- if(ww.rels.get(p)!=null)
+ if (ww.rels.get(p)!=null)
rels = ww.rels.get(p);
else {
rels = wordSynset.getRelatedSynsets(p);
ww.rels.put(p, rels);
}
- for(ISynsetID id: rels) {
+ for (ISynsetID id: rels) {
ISynset s = this.dictionary.getSynset(id);
IWord mat = inSynset(s, idxNoun);
if(mat!=null)
@@ -174,7 +170,7 @@ public class WordRelationshipDetermination {
public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) {
WordRelation ret = new WordRelation();
ret.relation = WordRelation.NO_RELATION;
- for(Word w : l.word) {
+ for (Word w : l.word) {
//Exact match is a string relation.
if(w.getLexicon().equalsIgnoreCase(noun)) {
ret.relation = WordRelation.STRONG_RELATION;
@@ -185,8 +181,7 @@ public class WordRelationshipDetermination {
// else it is a Wordnet word and is it a synonym or hyponym of LCs (medium relation)
else if(w.getID()!=null && checkMed){
Word wrel = isMediumRel(noun, w) ;
- if(wrel!=null)
- {
+ if(wrel!=null) {
ret.relation = WordRelation.MED_RELATION;
ret.src = w;
ret.dest = wrel;
@@ -205,20 +200,19 @@ public class WordRelationshipDetermination {
e.printStackTrace();
}
}
+
public List<Word> getWordSenses(String noun) {
List<Word> ret = new ArrayList<>();
- try{
+ try {
// openDict();
List<IWordID> wordIDs = this.dictionary.getIndexWord(noun, POS.NOUN).getWordIDs();
- for(IWordID wid: wordIDs)
- {
+ for(IWordID wid: wordIDs) {
Word w = new WordnetWord();
w.setLexicon(noun);
w.setID(wid);
ret.add(w);
}
- }catch(Exception ex){
- // ex.printStackTrace();
+ } catch(Exception ex){
//Not in dictionary
Word w = new WordnetWord();
w.setLexicon(noun);
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index f4e1a0e..e491aec 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -208,7 +208,7 @@ public class DefaultDocProcessor implements DocProcessor {
@Override
public String[] getWords(String sent)
{
- return sent.split(" ");
+ return sent.trim().split("\\s+");
}
@Override
diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
index b6072eb..f4b5470 100755
--- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
@@ -75,8 +75,8 @@ public class TextRank {
// words..
public double getWeightedSimilarity(String sent1, String sent2,
Hashtable<String, Double> wrdWts) {
- String[] words1 = sent1.split(" ");
- String[] words2 = sent2.split(" ");
+ String[] words1 = sent1.trim().split("\\s+");
+ String[] words2 = sent2.trim().split("\\s+");
double wordsInCommon = 0;
Hashtable<String, Boolean> dups = new Hashtable<>();
for (String s : words1) {
@@ -173,8 +173,7 @@ public class TextRank {
for (int i = 0; i < sentences.size(); i++) {
String nextSent = sentences.get(i);
- String[] words = nextSent.split(" ");
- List<Integer> processed = new ArrayList<>();
+ String[] words = nextSent.trim().split("\\s+");
Score s = new Score();
s.setSentId(i);
@@ -185,6 +184,7 @@ public class TextRank {
if (otherSents == null)
continue;
+ List<Integer> processed = new ArrayList<>();
for (int idx : otherSents) {
if (idx != i && !processed.contains(idx)) {
double currS = getWeightedSimilarity(sentences.get(i),
@@ -233,7 +233,7 @@ public class TextRank {
if (HIGHER_TITLE_WEIGHT && getSentences().size()>0) {
String sent = getSentences().get(0);
- String[] wrds = sent.split(" ");
+ String[] wrds = sent.trim().split("\\s+");
for (String wrd : wrds)
wrdWts.put(wrd, TITLE_WRD_WT);
}
@@ -278,7 +278,7 @@ public class TextRank {
/*
* public double getScore(String sent1, String sent2, boolean toPrint) {
- * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" ");
+ * String[] words1 = sent1.split("\\s+"); String[] words2 = sent2.split("\\s+");
* double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
* j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
* !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+=