You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2015/02/11 09:53:14 UTC
svn commit: r1658901 [1/2] - in /opennlp/sandbox/summarizer: ./ src/
src/main/ src/main/java/ src/main/java/opennlp/
src/main/java/opennlp/summarization/
src/main/java/opennlp/summarization/lexicalchaining/
src/main/java/opennlp/summarization/meta/ src...
Author: joern
Date: Wed Feb 11 08:53:14 2015
New Revision: 1658901
URL: http://svn.apache.org/r1658901
Log:
OPENNLP-752 Added the summarizer contribution. Thanks to Ram Soma for contributing it.
Added:
opennlp/sandbox/summarizer/
opennlp/sandbox/summarizer/src/
opennlp/sandbox/summarizer/src/main/
opennlp/sandbox/summarizer/src/main/java/
opennlp/sandbox/summarizer/src/main/java/opennlp/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Summarizer.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java (with props)
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java (with props)
opennlp/sandbox/summarizer/src/test/
opennlp/sandbox/summarizer/src/test/java/
opennlp/sandbox/summarizer/src/test/java/unittests/
opennlp/sandbox/summarizer/src/test/java/unittests/DocProcessorTest.java
opennlp/sandbox/summarizer/src/test/java/unittests/LexChainTest.java
opennlp/sandbox/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/DocProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/DocProcessor.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/DocProcessor.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/DocProcessor.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+import java.util.List;
+
+import opennlp.tools.stemmer.Stemmer;
+
+/*
+ * A document processor abstracts a lot of the underlying complexities of parsing the document and
+ * preparing it (e.g. stemming, stop word removal) from the summarization algorithm. The current package
+ * supports sentence extraction based algorithms. Thus extracting Sentences from the text is the
+ * first step and the basis for the algorithms.
+ */
+public interface DocProcessor {
+  /** Breaks the text of an article into its sentences. */
+  List<Sentence> getSentencesFromStr(String text);
+  /** Utility method that parses the words out of a string. */
+  String[] getWords(String sent);
+  /** Supplies the stemmer to use for stemming words. */
+  Stemmer getStemmer();
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+/*
+ * Utility class to store the score of a sentence for ranking sentences within a document.
+ */
+public class Score implements Comparable<Score>
+{
+  /** Id of the sentence this score belongs to. */
+  int sentId;
+  /** Score value; a new instance starts at 0. */
+  public double score;
+
+  public Score() { score = 0; }
+
+  public int getSentId() { return sentId; }
+
+  public double getScore() { return score; }
+
+  public void setScore(double score) { this.score = score; }
+
+  public void setSentId(int sentId) { this.sentId = sentId; }
+
+  /**
+   * Orders scores from highest to lowest, so that sorting a collection of
+   * scores puts the highest score first.
+   * Double.compare is used instead of the relational operators because it
+   * defines a total order even when a score is NaN; with the raw operators
+   * NaN compared equal to everything, which breaks the compareTo contract
+   * and can make sorting fail.
+   *
+   * @param o the score to compare against
+   * @return a negative value if this score is higher than o, a positive
+   *         value if it is lower, and 0 if both are the same
+   */
+  @Override
+  public int compareTo(Score o)
+  {
+    return Double.compare(o.score, score);
+  }
+
+  /** Renders the score as the sentence id and the value separated by a space. */
+  @Override
+  public String toString()
+  {
+    return sentId +" "+score;
+  }
+}
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Score.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Locale;
+
+import opennlp.summarization.preprocess.PorterStemmer;
+import opennlp.summarization.preprocess.StopWords;
+
+/*
+ * A representation of a sentence geared toward pagerank and summarization.
+ */
+public class Sentence {
+  //sentId is always position of sentence in doc..
+  private int sentId;
+  private String stringVal, procStringVal;
+  private Score pageRankScore;
+  private int paragraph;
+  private int paraPos;
+  private boolean hasQuote;
+  private double wordWt = 0;
+  private int wordCnt;
+
+  private List<Sentence> links;
+  private PorterStemmer stemmer;
+
+  /** Creates a sentence without text and with an empty list of links. */
+  public Sentence(){
+    links = new ArrayList<Sentence>();
+  }
+
+  /** Creates a sentence without text that carries the given id. */
+  public Sentence(int id){
+    this();
+    this.sentId = id;
+  }
+
+  public void setSentId(int sentId) { this.sentId = sentId; }
+
+  public int getSentId() { return sentId; }
+
+  public void setPageRankScore(Score pageRankScore) { this.pageRankScore = pageRankScore; }
+
+  public Score getPageRankScore() { return pageRankScore; }
+
+  public void setParagraph(int paragraph) { this.paragraph = paragraph; }
+
+  public int getParagraph() { return paragraph; }
+
+  public void setParaPos(int paraPos) { this.paraPos = paraPos; }
+
+  public int getParaPos() { return paraPos; }
+
+  /**
+   * Sets the text of the sentence and refreshes the values derived from it
+   * (the quote flag and the word count).
+   *
+   * @param stringVal the sentence text; must not be null
+   */
+  public void setStringVal(String stringVal) {
+    this.stringVal = stringVal;
+    // Recomputed on every call so that the flag does not survive a change of text.
+    this.hasQuote = stringVal.contains("\"");
+    this.wordCnt = calcWrdCnt(stringVal);
+  }
+
+  /**
+   * Counts the space separated tokens of the given text that are not stop
+   * words, do not start with an apostrophe and are not a lone "." or "?".
+   */
+  private int calcWrdCnt(String stringVal2) {
+    int ret = 0;
+    StopWords sw = StopWords.getInstance();
+    // Split the argument rather than the field, so the result does not depend on assignment order.
+    for(String wrd: stringVal2.split(" ")){
+      if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?"))
+        ret++;
+    }
+    return ret;
+  }
+
+  public String getStringVal() { return stringVal; }
+
+  /** Appends s to the sentences this sentence links to. */
+  public void addLink(Sentence s) { this.links.add(s); }
+
+  /** Returns the internal list of linked sentences (not a copy). */
+  public List<Sentence> getLinks() { return this.links; }
+
+  /** Returns the raw text of the sentence. */
+  @Override
+  public String toString()
+  {
+    return this.stringVal ;//+ "("+ this.paragraph +", "+this.paraPos+")";
+  }
+
+  public void setWordWt(double wordWt) { this.wordWt = wordWt; }
+
+  public double getWordWt() { return wordWt; }
+
+  /**
+   * Returns the word count computed by setStringVal; if that count is 0, the
+   * number of space separated tokens of the text is returned instead.
+   */
+  public int getWordCnt()
+  {
+    return wordCnt==0? this.getStringVal().split(" ").length: wordCnt;
+  }
+
+  //Should add an article id to the sentence class.. For now returns true if the ids are the same..
+  @Override
+  public boolean equals(Object o){
+    if(! (o instanceof Sentence)) return false;
+    return ((Sentence)o).sentId == this.sentId;
+  }
+
+  /** Hashes on the id only, in line with equals, so that equal sentences share a hash code. */
+  @Override
+  public int hashCode() { return sentId; }
+
+  static final String space=" ";
+  /**
+   * Produces the stemmed form of the sentence. The text is cut at the word
+   * boundaries reported by a US-locale BreakIterator; quote characters are
+   * stripped from every piece, stop words are dropped and every remaining
+   * piece is passed to the stemmer, whose string form is appended.
+   *
+   * @return the appended stemmer results, each followed by a single space
+   */
+  public String stem() {
+    PorterStemmer stemmer = new PorterStemmer();
+    StopWords sw = StopWords.getInstance();
+    BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
+    StringBuilder b = new StringBuilder();
+    wrdItr.setText(stringVal);
+    int wrdStrt = 0;
+    for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
+        wrdStrt = wrdEnd, wrdEnd = wrdItr.next())
+    {
+      // Strings are immutable: the stripped word has to be kept, otherwise the quotes stay in.
+      String word = this.getStringVal().substring(wrdStrt, wrdEnd).replaceAll("\"|'","");
+      //Skip stop words and stem the word..
+      if(sw.isStopWord(word)) continue;
+      stemmer.stem(word);
+      b.append(stemmer.toString());
+      b.append(space);
+    }
+    return b.toString();
+  }
+}
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Sentence.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Summarizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Summarizer.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Summarizer.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/Summarizer.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization;
+
+public interface Summarizer { // Contract implemented by the summarizers of this package, e.g. LexicalChainingSummarizer.
+  String summarize(String article, DocProcessor dp, int maxWords); // article: text to summarize; dp: document processor used to split it; maxWords: word budget for the summary (how strictly it is enforced is up to the implementation).
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/*
+ * Use the lexical chaining algorithm to extract keywords.
+ */
+public class LexChainingKeywordExtractor {
+  /**
+   * Returns up to noOfKeywrds distinct keywords, taking the first word of each chain from the strongest chain down; the given list is not modified.
+   */
+  public List<String> getKeywords(List<LexicalChain> lexicalChains, int noOfKeywrds){
+    List<LexicalChain> ranked = new ArrayList<LexicalChain>(lexicalChains);
+    Collections.sort(ranked, Collections.<LexicalChain>reverseOrder()); // natural order is ascending by score, so reverse it
+    List<String> ret = new ArrayList<String>();
+    for(int i=0; i<ranked.size() && ret.size()<noOfKeywrds; i++) { // skipped chains do not use up the keyword budget
+      List<Word> words = ranked.get(i).getWord();
+      if(!words.isEmpty() && !ret.contains(words.get(0).getLexicon()))
+        ret.add(words.get(0).getLexicon());
+    }
+    return ret;
+  }
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+
+
+public class LexicalChain implements Comparable<LexicalChain>{
+  // Words collected in this chain, in insertion order.
+  List<Word> word = new ArrayList<Word>();
+  // Sentences attached to this chain; addSentence stores each one once.
+  List<Sentence> sentences = new ArrayList<Sentence>();
+
+  int start, last; // sentence ids, maintained by LexicalChainingSummarizer
+  int score;
+  int occurences=1;
+
+  /** Creates a chain without words and without sentences. */
+  public LexicalChain() { }
+
+  /** Strength of the chain; currently just its length. */
+  public double score()
+  {
+    return length() ;//* homogeneity();
+  }
+
+  /** Number of words in the chain. */
+  public int length(){
+    return word.size();
+  }
+
+  /** One minus the ratio of occurences to the chain length. */
+  public float homogeneity()
+  {
+    float ratio = (float)occurences/(float)length();
+    return 1.0f - ratio;
+  }
+
+  public void addWord(Word w) { word.add(w); }
+
+  /** Attaches the sentence unless the chain already holds it. */
+  public void addSentence(Sentence sent)
+  {
+    if(sentences.contains(sent)) return;
+    sentences.add(sent);
+  }
+
+  public List<Word> getWord() { return word; }
+
+  public List<Sentence>getSentences()
+  {
+    return this.sentences;
+  }
+
+  /** Orders chains by ascending score. */
+  @Override
+  public int compareTo(LexicalChain o) {
+    double diff = score() - o.score();
+    if(diff == 0) return 0;
+    return diff > 0 ? 1 : -1;
+  }
+
+  /** A chain is equal only to itself. */
+  @Override
+  public boolean equals(Object o){
+    return this == o;
+  }
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.*;
+import java.util.logging.Logger;
+
+import opennlp.summarization.DocProcessor;
+import opennlp.summarization.Score;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.Summarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+/*
+ * Implements the algorithm outlined in "Using Lexical Chains for Text Summarization" by R. Barzilay et al.
+ * The algorithm is based on extracting so-called lexical chains - sets of sentences in the article
+ * that share closely related words. Thus the longest chain represents the most important
+ * topic and so forth. A summary can then be formed by identifying the most important lexical chains
+ * and "pulling" out sentences from them.
+ */
+public class LexicalChainingSummarizer implements Summarizer{
+
+  private POSTagger tagger;
+  private DocProcessor dp;
+  private WordRelationshipDetermination wordRel;
+  private Logger log;
+
+  /**
+   * Creates a summarizer whose POS tagger is backed by the model in posModelFile.
+   *
+   * @param dp document processor handed to the POS tagger
+   * @param posModelFile file name of the POS model
+   * @throws Exception if the POS tagger or the word relation lookup cannot be created
+   */
+  public LexicalChainingSummarizer(DocProcessor dp, String posModelFile) throws Exception
+  {
+    this.dp = dp;
+    wordRel = new WordRelationshipDetermination();
+    tagger = new OpenNLPPOSTagger(dp, posModelFile);
+    log = Logger.getLogger("LexicalChainingSummarizer");
+  }
+
+  /**
+   * Builds the lexical chains of an article. Every noun of every sentence is
+   * compared with the chains found so far and joins each chain it is related
+   * to; a weakly related noun only joins when the chain started at most 3
+   * sentences earlier. A noun that joins no chain starts one new chain per word sense.
+   *
+   * @param article the article text (not read by this method)
+   * @param sent the sentences of the article
+   * @return the chains that were built and survived purging
+   */
+  public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent)
+  {
+    Hashtable<String, List<LexicalChain>> chains = new Hashtable<String, List<LexicalChain>>();
+    List<LexicalChain> lc = new ArrayList<LexicalChain>();
+    for(Sentence currSent : sent) {
+      log.info(currSent.getStringVal());
+      String taggedSent = tagger.getTaggedString(currSent.getStringVal());
+      List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
+      int sentId = currSent.getSentId();
+      for(String noun : nouns) {
+        int chainsAddCnt = 0;
+        for(LexicalChain l: lc) {
+          try{
+            WordRelation rel = wordRel.getRelation(l, noun, (sentId - l.start)>7);
+            if(rel == null) continue;
+            // addToChain() moves l.last to this sentence, so remember where the chain ended before.
+            int prevLast = l.last;
+            if(rel.relation == WordRelation.STRONG_RELATION) {
+              addToChain(rel.dest, l, chains, currSent);
+              // A gap of more than 10 sentences counts as a new occurence of the chain.
+              if(sentId - prevLast > 10) {
+                l.occurences++;
+                l.start = sentId;
+              }
+              chainsAddCnt++;
+            }
+            else if(rel.relation == WordRelation.MED_RELATION) {
+              addToChain(rel.dest, l, chains, currSent);
+              chainsAddCnt++;
+              //If greater than 7 we will add it but call it a new occurence of the lexical chain...
+              if(sentId - l.start > 7) {
+                l.occurences++;
+                l.start = sentId;
+              }
+            }
+            else if(rel.relation == WordRelation.WEAK_RELATION) {
+              // Weakly related words only join within 3 sentences of the chain start.
+              if(sentId - l.start <= 3) {
+                addToChain(rel.dest, l, chains, currSent);
+                chainsAddCnt++;
+              }
+            }
+          }catch(Exception ex){
+            // Best effort: a failed lookup only skips this noun/chain pair, but leaves a trace.
+            log.fine("Could not relate '" + noun + "' to a chain: " + ex);
+          }
+        }
+        //Could not add the word to any existing list.. Start a new lexical chain with the word..
+        if(chainsAddCnt==0) {
+          List<Word> senses = wordRel.getWordSenses(noun);
+          for(Word w : senses) {
+            LexicalChain newLc = new LexicalChain();
+            newLc.start = sentId;
+            addToChain(w, newLc, chains, currSent);
+            lc.add(newLc);
+          }
+        }
+        if(lc.size()> 20)
+          purge(lc, sentId, sent.size());
+      }
+    }
+    return lc;
+  }
+
+  /*
+   * A way to manage the number of lexical chains generated. Expire very small lexical chains ..
+   * Takes care to only remove small chains that were added "long back"
+   */
+  private void purge(List<LexicalChain> lc, int sentId, int totSents) {
+    //Do nothing while there are fewer than 20 chains..
+    if(lc.size()<20 ) return;
+
+    // Sorts ascending by score, so the weakest chain comes first.
+    Collections.sort(lc);
+    double min = lc.get(0).score();
+    int cutOff = Math.max(3, (int)min);
+    Hashtable<String, Boolean> words = new Hashtable<String, Boolean>();
+    List<LexicalChain> toRem = new ArrayList<LexicalChain>();
+    // Walk from the strongest chain down, so the words of stronger chains are recorded first.
+    for(int i=lc.size()-1; i>=0;i--) {
+      LexicalChain l = lc.get(i);
+      if(l.score() < cutOff && (sentId - l.last) > totSents/3)
+        toRem.add(l);
+      //A different sense and added long back..
+      else if(words.containsKey(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents/10)
+        toRem.add(l);
+      else {
+        //Record the words of the chains that are kept..
+        for(Word w: l.word)
+          words.put(w.getLexicon(), Boolean.TRUE);
+      }
+    }
+
+    lc.removeAll(toRem);
+  }
+
+  private boolean containsAllWords(Hashtable<Word, Boolean> words,
+      List<Word> word) {
+    // Test every element; passing the list itself as the key can never match a Word key.
+    for(Word w: word)
+      if(!words.containsKey(w)) return false;
+    return true;
+  }
+
+  private void addToChain(Word noun, LexicalChain l,
+      Hashtable<String, List<LexicalChain>> chains, Sentence sent) {
+    l.addWord(noun);
+    l.addSentence(sent);
+    l.last = sent.getSentId();
+    // Hashtable.contains() searches the values; the table is keyed by lexicon, so test the key.
+    // Creating the list only once keeps the chains registered earlier for the same word.
+    String lexicon = noun.getLexicon();
+    if(!chains.containsKey(lexicon))
+      chains.put(lexicon, new ArrayList<LexicalChain>());
+    chains.get(lexicon).add(l);
+  }
+
+  POSTagger getTagger() {
+    return tagger;
+  }
+
+  void setTagger(POSTagger tagger) {
+    this.tagger = tagger;
+  }
+
+  /**
+   * Summarizes the article by walking its lexical chains from the strongest to
+   * the weakest and taking, from each chain, the first sentence that is not
+   * part of the summary yet. Stops once the summary holds maxWords words or more.
+   *
+   * @return the text of the selected sentences, concatenated in selection order
+   */
+  @Override
+  public String summarize(String article, DocProcessor dp, int maxWords) {
+    List<Sentence> sent = dp.getSentencesFromStr(article);
+    List<LexicalChain> lc = buildLexicalChains(article, sent);
+    // The sort is ascending by score, so the strongest chain sits at the end of the list.
+    Collections.sort(lc);
+    int summSize=0;
+    List<Sentence>summ = new ArrayList<Sentence>();
+    StringBuilder sb = new StringBuilder();
+    for(int i=lc.size()-1; i>=0; i--) {
+      // Bound the scan by the chain's own sentences; the number of chains is unrelated to it.
+      for(Sentence candidate : lc.get(i).getSentences()) {
+        if(!summ.contains(candidate)) {
+          summ.add(candidate);
+          sb.append(candidate.getStringVal());
+          summSize += candidate.getWordCnt();
+          break;
+        }
+      }
+      if(summSize>=maxWords) break;
+    }
+    return sb.toString();
+  }
+
+}
+
+
\ No newline at end of file
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.logging.Logger;
+
+import javax.annotation.processing.Processor;
+
+import opennlp.summarization.DocProcessor;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+
+public class OpenNLPPOSTagger implements POSTagger{
+  private POSTaggerME tagger;
+  private Hashtable<Integer, String[]> tagMap;
+  private DocProcessor dp;
+  private Logger log;
+
+  /**
+   * Loads the POS model from the given file and creates the tagger from it.
+   * @param dp document processor used to split tagged text into tokens
+   * @param posModelFileName file name of the POS model
+   * @throws Exception if the model file cannot be opened or read
+   */
+  public OpenNLPPOSTagger(DocProcessor dp, String posModelFileName) throws Exception{
+    log = Logger.getLogger("OpenNLPPOSTagger");
+    this.dp = dp;
+    initTagMap();
+    InputStream modelIn = null;
+    try {
+      modelIn = new FileInputStream(posModelFileName);
+      tagger = new POSTaggerME(new POSModel(modelIn));
+    }
+    catch (IOException e) {
+      // Report through the logger instead of stderr; the caller still receives the exception.
+      log.severe("Could not load POS model " + posModelFileName + ": " + e);
+      throw e;
+    }
+    finally {
+      if (modelIn != null) {
+        try { modelIn.close(); }
+        catch (IOException ignored) {
+          // Closing is best effort: the model was either read already or loading has failed anyway.
+        }
+      }
+    }
+  }
+
+  private String[] nounTags = {"NN", "NNS","NNP","NNPS"};
+  private void initTagMap()
+  {
+    tagMap = new Hashtable<Integer, String[]>();
+    tagMap.put(POSTagger.NOUN, nounTags);
+  }
+
+  //Returns true if the typestring is, ignoring case, one of the tags for the type; a type without registered tags (only NOUN is mapped) matches nothing..
+  public boolean isType(String typeStr, int type)
+  {
+    String[] tags = tagMap.get(type);
+    if(tags == null) return false;
+    for(String tag: tags)
+      if(tag.equalsIgnoreCase(typeStr)) return true;
+    return false;
+  }
+
+  @Override
+  public String getTaggedString(String article) {
+    return tagger.tag(article); // presumably yields "word/TAG" tokens, the form getWordsOfType parses - confirm against POSTaggerME
+  }
+
+  /** Returns the words of the tagged text whose tag belongs to the given type; tokens without both a word and a tag are skipped. */
+  @Override
+  public List<String> getWordsOfType(String sent, int type)
+  {
+    List<String> ret = new ArrayList<String>();
+    for(String t: dp.getWords(sent)) {
+      // The tag follows the last slash, so a word that contains a slash itself is kept intact.
+      int sep = t.lastIndexOf('/');
+      if(sep > 0 && sep < t.length()-1 && isType(t.substring(sep+1), type))
+        ret.add(t.substring(0, sep));
+    }
+    log.info(ret.toString());
+    return ret;
+  }
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.util.List;
+
+/**
+ * Part-of-speech tagging as needed by the lexical chaining code.
+ */
+public interface POSTagger {
+ // Word types that can be asked for. Interface fields are implicitly
+ // public, static and final.
+ int NOUN = 0;
+ int VERB = 1;
+ int ADJECTIVE = 2;
+ int ADVERB = 3;
+ int PRONOUN = 4;
+
+ /**
+  * Tags a piece of text.
+  *
+  * @param article text to tag
+  * @return the tagged form of the text
+  */
+ String getTaggedString(String article);
+
+ /**
+  * Returns the words of a sentence that belong to a word type.
+  *
+  * @param sent sentence to inspect
+  * @param type one of the type constants declared above
+  * @return the matching words
+  */
+ List<String> getWordsOfType(String sent, int type);
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/Word.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+/**
+ * A word as stored in a lexical chain: its lexicon (text), its sense and an id.
+ * Sense and id are plain objects so that implementations can choose their own types.
+ */
+public interface Word {
+ /** Returns the lexicon of the word. */
+ String getLexicon();
+
+ /** Sets the lexicon of the word. */
+ void setLexicon(String lex);
+
+ /** Returns the sense of the word. */
+ Object getSense();
+
+ /** Sets the sense of the word. */
+ void setSense(Object senseID);
+
+ /** Returns the id of the word. */
+ Object getID();
+
+ /** Sets the id of the word. */
+ void setID(Object id);
+}
\ No newline at end of file
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelation.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+/**
+ * Describes how two words are related: the two words and the strength of
+ * their relation.
+ */
+public class WordRelation {
+ // Match strength constants for lexical chains. They are final so that no
+ // code can reassign them by accident.
+ public static final int STRONG_RELATION = 0;
+ public static final int MED_RELATION = 1;
+ public static final int WEAK_RELATION = 2;
+ public static final int NO_RELATION = 3;
+
+ // Word the relation starts from.
+ public Word src;
+ // Word the relation points to.
+ public Word dest;
+ // One of the relation constants above.
+ public int relation;
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.lexicalchaining;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+
+import edu.mit.jwi.data.ILoadPolicy;
+import edu.mit.jwi.item.IIndexWord;
+import edu.mit.jwi.item.ISynset;
+import edu.mit.jwi.item.ISynsetID;
+import edu.mit.jwi.item.IWord;
+import edu.mit.jwi.item.IWordID;
+import edu.mit.jwi.item.POS;
+import edu.mit.jwi.item.Pointer;
+import edu.mit.jwi.Dictionary;
+import edu.mit.jwi.IDictionary;
+import edu.mit.jwi.RAMDictionary;
+
+/*
+ * Uses wordnet to determine the relation of two words.
+ * Words have -
+ * strong relationship: same word
+ * Med relationship: synonym, hyponym
+ * weak relationship: antonym, hypernym..
+ * No relationship: otherwise
+ */
+public class WordRelationshipDetermination {
+
+ IDictionary dictionary;
+ String dictionaryFile="resources/wordnet/dict";
+ int MAX_DIST_MED_REL = 1000;
+
+ /**
+  * Creates the RAM dictionary for the directory named by dictionaryFile,
+  * asks it to load and opens it.
+  *
+  * @throws Exception if setting up the dictionary fails
+  */
+ public WordRelationshipDetermination() throws Exception
+ {
+   RAMDictionary ramDictionary = new RAMDictionary(new File(dictionaryFile), ILoadPolicy.IMMEDIATE_LOAD);
+   dictionary = ramDictionary;
+   ramDictionary.load();
+   openDict();
+ }
+
+ /*
+  * Checks whether noun is a synonym of w, i.e. whether one of the senses of
+  * noun is in the synset of w. The synset of w is looked up once and then
+  * kept in the WordnetWord as a cache.
+  *
+  * Returns the matching word of the synset, or null when there is no match,
+  * when the dictionary has no index entry for noun, or when w cannot be
+  * resolved in the dictionary.
+  */
+ private IWord isSynonynm(String noun, Word w)
+ {
+   WordnetWord ww = (WordnetWord)w;
+   IIndexWord idxNoun = dictionary.getIndexWord(noun, POS.NOUN);
+   //Without an index entry for the noun nothing can match.
+   if(idxNoun == null) return null;
+
+   //Get the synset in which the word is present, from the cache if possible.
+   ISynset wordSynset = ww.synonyms;
+   if(wordSynset == null){
+     IWord word = dictionary.getWord((IWordID)w.getID());
+     if(word == null) return null;
+     wordSynset = word.getSynset();
+     ww.synonyms = wordSynset;
+   }
+   return inSynset(wordSynset, idxNoun);
+ }
+ //NOTE(review): no code visible here reads or writes this cache (its use was
+ //commented out); the field is kept so that nothing referring to it breaks.
+ Hashtable<ISynset, List<IWord>> synsetWordCache = new Hashtable<ISynset, List<IWord>>();
+
+ /*
+  * Looks for the word represented by idxNoun among the words of a synset.
+  * Returns the first word of the synset that equals one of the senses of
+  * idxNoun, or null when there is no such word or when an argument is null.
+  */
+ private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun)
+ {
+   //A missing synset or index entry means there is nothing to match.
+   if(wordSynset == null || idxNoun == null) return null;
+
+   List<IWordID> nounIDs = idxNoun.getWordIDs();
+   for(IWord synonym : wordSynset.getWords())
+   {
+     for(IWordID nounID : nounIDs)
+     {
+       //Stop at the first hit; scanning on cannot give a better answer.
+       if(synonym.equals(dictionary.getWord(nounID)))
+         return synonym;
+     }
+   }
+   return null;
+ }
+
+ //Pointer types that are followed when looking for a medium strength relation.
+ Pointer[] rels = {Pointer.ANTONYM, Pointer.HYPERNYM, Pointer.HYPONYM, Pointer.MERONYM_PART,
+ Pointer.MERONYM_SUBSTANCE, Pointer.PARTICIPLE, Pointer.HYPERNYM_INSTANCE};
+ //NOTE(review): not used by any code visible here.
+ Hashtable<ISynsetID, ISynset> cache = new Hashtable<ISynsetID, ISynset>();
+
+ /*
+  * Returns a word if w has a medium strength relationship with noun, null otherwise.
+  * The relation holds when noun is a synonym of w, or when noun occurs in a
+  * synset that the synset of w points to through one of the pointers in rels.
+  * A synonym match is returned right away. The related synsets of w are cached
+  * per pointer in the WordnetWord.
+  */
+ private Word isMediumRel(String noun, Word w)
+ {
+   IIndexWord idxNoun = dictionary.getIndexWord(noun, POS.NOUN);
+   //A noun without an index entry in the dictionary cannot be related to anything.
+   if(idxNoun == null) return null;
+
+   WordnetWord ww = (WordnetWord) w;
+   IWord syn = this.isSynonynm(noun, w);
+   if(syn != null)
+     return relatedWord(noun, syn);
+
+   //Construct an IWord object representing the word associated with the id of w.
+   IWord word = dictionary.getWord((IWordID)w.getID());
+   if(word == null) return null;
+   //Get the synset in which the word is present.
+   ISynset wordSynset = word.getSynset();
+
+   for(Pointer p : rels)
+   {
+     List<ISynsetID> related = ww.rels.get(p);
+     if(related == null){
+       related = wordSynset.getRelatedSynsets(p);
+       ww.rels.put(p, related);
+     }
+
+     for(ISynsetID id: related)
+     {
+       ISynset s = this.dictionary.getSynset(id);
+       if(s == null) continue;
+       IWord mat = inSynset(s, idxNoun);
+       if(mat != null)
+         return relatedWord(noun, mat);
+     }
+   }
+
+   return null;
+ }
+
+ //Wraps a dictionary match for noun into a WordnetWord.
+ private WordnetWord relatedWord(String noun, IWord match)
+ {
+   WordnetWord ret = new WordnetWord();
+   ret.lexicon = noun;
+   ret.id = match.getID();
+   ret.wordSense = match.getSenseKey();
+   return ret;
+ }
+
+ /*
+ * Returns the type of relation between a lexical chain and the noun. The return value is one of STRONG_RELATION, MEDIUM, WEAK, or NO
+ * Strong relation means exact match. Medium relation means synonym or hyponym
+ */
+ public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) throws Exception{
+ WordRelation ret = new WordRelation();
+ ret.relation = ret.NO_RELATION;
+ for(Word w : l.word)
+ {
+ //Exact match is a string relation..
+ if(w.getLexicon().equalsIgnoreCase(noun))
+ {
+ ret.relation = WordRelation.STRONG_RELATION;
+ ret.src = w;
+ ret.dest = w;
+ break;
+ }
+ // else it is a Wordnet word and is it a synonym or hyponym of LCs (medium relation)
+ else if(w.getID()!=null && checkMed){
+ Word wrel = isMediumRel(noun, w) ;
+ if(wrel!=null)
+ {
+ ret.relation = WordRelation.MED_RELATION;
+ ret.src = w;
+ ret.dest = wrel;
+ break;
+ }
+ }
+ }
+ return ret;
+ }
+
+ /*
+  * Opens the dictionary unless it is open already. A failure to open is only
+  * printed; it is not passed on to the caller.
+  */
+ private void openDict()
+ {
+   if(dictionary.isOpen()) return;
+
+   try {
+     dictionary.open();
+   } catch (IOException openFailure) {
+     openFailure.printStackTrace();
+   }
+ }
+ /*
+  * Returns one Word per sense that the dictionary lists for the noun; each
+  * carries the noun as lexicon and the id of the sense.
+  * When the noun is not in the dictionary, or the lookup fails, a single Word
+  * with the noun as lexicon and no id is added instead.
+  */
+ public List<Word> getWordSenses(String noun)
+ {
+   List<Word> ret = new ArrayList<Word>();
+   try{
+     IIndexWord idxNoun = this.dictionary.getIndexWord(noun, POS.NOUN);
+     //Test for an unknown noun explicitly rather than relying on a NullPointerException.
+     if(idxNoun != null)
+     {
+       for(IWordID wid: idxNoun.getWordIDs())
+       {
+         Word w = new WordnetWord();
+         w.setLexicon(noun);
+         w.setID(wid);
+         ret.add(w);
+       }
+       return ret;
+     }
+   }catch(Exception ex){
+     //A failed lookup is handled like a noun that is not in the dictionary.
+   }
+
+   //Not in dictionary
+   Word w = new WordnetWord();
+   w.setLexicon(noun);
+   ret.add(w);
+   return ret;
+ }
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package opennlp.summarization.lexicalchaining;
+
+import java.util.Hashtable;
+import java.util.List;
+
+import edu.mit.jwi.item.IPointer;
+import edu.mit.jwi.item.ISenseKey;
+import edu.mit.jwi.item.ISynset;
+import edu.mit.jwi.item.ISynsetID;
+import edu.mit.jwi.item.IWordID;
+
+/**
+ * Word whose sense and id are WordNet objects of the JWI library. It also
+ * carries lookup results that other classes of this package store in it.
+ */
+public class WordnetWord implements Word{
+ String lexicon;
+ ISenseKey wordSense;
+ IWordID id;
+
+ //Cache..
+ //Synset of this word, null until it has been looked up.
+ ISynset synonyms;
+ //Related synsets of this word, keyed by the pointer that leads to them.
+ Hashtable<IPointer, List<ISynsetID>>rels;
+
+ /** Creates a word with no lexicon, sense or id and an empty relation cache. */
+ public WordnetWord()
+ {
+   rels = new Hashtable<IPointer, List<ISynsetID>>();
+ }
+
+ @Override
+ public String getLexicon() {
+   return lexicon;
+ }
+
+ @Override
+ public Object getSense() {
+   return wordSense;
+ }
+
+ @Override
+ public Object getID() {
+   return id;
+ }
+
+ @Override
+ public void setLexicon(String lex) {
+   this.lexicon = lex;
+ }
+
+ /**
+  * @param senseID sense of the word; has to be an ISenseKey, anything else
+  *        fails with a ClassCastException
+  */
+ @Override
+ public void setSense(Object senseID) {
+   this.wordSense = (ISenseKey) senseID;
+ }
+
+ /**
+  * @param id id of the word; has to be an IWordID, anything else fails with
+  *        a ClassCastException
+  */
+ @Override
+ public void setID(Object id) {
+   this.id = (IWordID)id;
+ }
+
+ /** Returns the lexicon, which is null until it has been set. */
+ @Override
+ public String toString()
+ {
+   return this.lexicon;
+ }
+
+ /**
+  * Hash of the string form of the word. A word whose lexicon has not been
+  * set yet hashes to 0 instead of failing with a NullPointerException.
+  */
+ @Override
+ public int hashCode()
+ {
+   String text = toString();
+   return (text == null) ? 0 : text.hashCode();
+ }
+}
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.meta;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Hashtable;
+import java.util.List;
+
+import opennlp.summarization.Score;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.textrank.TextRankSummarizer;
+
+import java.util.logging.*;
+
+import opennlp.summarization.DocProcessor;
+/*
+ * A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm.
+ * It runs both algorithms, using the lexical chains to identify the main topics and their relative importance
+ * and the text rank scores to pick sentences from the lexical chains.
+ */
+public class MetaSummarizer{
+ DocProcessor dp ;
+ TextRankSummarizer textRank;
+ LexicalChainingSummarizer lcs;
+ String sentFragModel = "resources/en-sent.bin";
+
+ /**
+  * Sets up the document processor with the sentence model named by
+  * sentFragModel, the text rank summarizer and the lexical chaining summarizer.
+  *
+  * @param posModelFile POS model file handed to the lexical chaining summarizer
+  * @throws Exception if one of the components cannot be created
+  */
+ public MetaSummarizer(String posModelFile) throws Exception
+ {
+   Logger.getAnonymousLogger().info("Initializing Meta Summarizer");
+   this.dp = new DefaultDocProcessor(this.sentFragModel);
+   this.textRank = new TextRankSummarizer();
+   this.lcs = new LexicalChainingSummarizer(this.dp, posModelFile);
+ }
+
+ //An Utility method to sort the ranked sentences by sentence order.
+ private List<Score> order(List<Score> s)
+ {
+ Collections.sort(s, new Comparator<Score>()
+ {
+
+ @Override
+ public int compare(Score o1, Score o2) {
+ // TODO Auto-generated method stub
+
+ return o1.getSentId() - o2.getSentId();
+ }
+ });
+ return s;
+ }
+
+ /**
+  * Finds the sentence of a lexical chain with the highest score.
+  * Sentences that have no entry in pageRankScores are skipped, and a score
+  * has to be greater than 0 to be picked.
+  *
+  * @param l lexical chain whose sentences are examined
+  * @param pageRankScores scores keyed by sentence id
+  * @return the sentence id of the best score, or -1 if no sentence qualifies
+  */
+ public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores)
+ {
+   int bestId = -1;
+   double topScore = 0;
+   for(Sentence candidate : l.getSentences())
+   {
+     Score sc = pageRankScores.get(Integer.valueOf(candidate.getSentId()));
+     if(sc == null) continue;
+
+     double value = sc.getScore();
+     if(value > topScore)
+     {
+       topScore = value;
+       bestId = sc.getSentId();
+     }
+   }
+   return bestId;
+ }
+
+ /**
+  * Ranks sentences by merging the results of lexical chaining and text rank.
+  * The lexical chains are sorted and walked from the last to the first; from
+  * each chain the best scored sentence that is not in the summary yet is
+  * taken. Sentences that are already in the summary are removed from the
+  * chain's sentence list while searching.
+  *
+  * @param article text of the document
+  * @param sent sentences of the document; a sentence id is used as index into this list
+  * @param maxWords selection stops once the picked sentences have more words
+  *        than this; a value that is not positive means no limit
+  * @return the scores of the picked sentences ordered by sentence id; empty
+  *         when text rank fails, since no sentence has a score then
+  */
+ public List<Score> rankSentences(String article, List<Sentence> sent, int maxWords)
+ {
+   List<LexicalChain> lc = lcs.buildLexicalChains(article, sent);
+   Collections.sort(lc);
+   Hashtable<Integer, Score> sentScores = new Hashtable<Integer, Score>();
+   try{
+     List<Score> scores = textRank.rankSentences(article, sent, dp, article.length());
+     for(Score s: scores) sentScores.put(s.getSentId(), s);
+   }catch(Exception ex){
+     ex.printStackTrace();
+   }
+
+   Hashtable<Sentence, Boolean> summSents = new Hashtable<Sentence,Boolean>();
+   List<Score> finalSc = new ArrayList<Score>();
+   int currWordCnt = 0;
+   for(int i=lc.size()-1;i>=0;i--)
+   {
+     LexicalChain l = lc.get(i);
+     while(l.getSentences().size()>0)
+     {
+       int sentId = getBestSent(l, sentScores);
+       if(sentId == -1) break;
+
+       Sentence s = sent.get(sentId);
+
+       if(summSents.containsKey(s))
+       {
+         //Sentence already added, drop it from the chain and try again..
+         //If nothing was removed the same sentence would be picked forever,
+         //so give up on this chain.
+         if(!l.getSentences().remove(s)) break;
+       }
+       else{
+         finalSc.add(sentScores.get(s.getSentId()));
+         summSents.put(s, true);
+         currWordCnt += s.getWordCnt();
+         break;
+       }
+     }
+     if(maxWords>0 && currWordCnt>maxWords) break;
+   }
+
+   order(finalSc);
+   return finalSc;
+ }
+
+ /**
+  * Summarizes an article: splits it into sentences, ranks them with
+  * rankSentences and joins the picked sentences in document order. Every
+  * sentence is trimmed and followed by ".. ".
+  *
+  * @param article text to summarize
+  * @param maxWords word limit handed to rankSentences
+  * @return the summary text
+  */
+ public String summarize(String article, int maxWords)
+ {
+   List<Sentence> sentences = dp.getSentencesFromStr(article);
+   List<Score> picked = rankSentences(article, sentences, maxWords);
+
+   StringBuilder summary = new StringBuilder();
+   for(Score score : picked)
+   {
+     Sentence chosen = sentences.get(score.getSentId());
+     summary.append(chosen.toString().trim()).append(".. ");
+   }
+   return summary.toString();
+ }
+
+ /**
+  * Demo: summarizes a fixed set of sample files to at most about 50 words
+  * each and prints every summary followed by the milliseconds it took.
+  * The first failure is printed and ends the run.
+  */
+ public static void main(String[] args)
+ {
+   String[] articleFiles = {"test/tax.txt", "test/houston-rep-nopara.txt", "gunman.txt", "satellite.txt"};
+   try{
+     String posModelFileName = "./resources/en-pos-maxent.bin";
+     String sentFragModel = "resources/en-sent.bin";
+     DefaultDocProcessor dp =new DefaultDocProcessor(sentFragModel);
+     MetaSummarizer lcs = new MetaSummarizer(posModelFileName);
+
+     for(String fileName : articleFiles)
+     {
+       String article = dp.docToString(fileName);
+       long strt = System.currentTimeMillis();
+       System.out.println(lcs.summarize(article, 50));
+       System.out.println(System.currentTimeMillis() - strt);
+     }
+   }catch(Exception ex){
+     ex.printStackTrace();
+   }
+ }
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.LineNumberReader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.Hashtable;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.DocProcessor;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.stemmer.Stemmer;
+
+
+/*
+ * Parse document to sentences..
+ */
+public class DefaultDocProcessor implements DocProcessor
+{
+ SentenceModel sentModel;
+ Stemmer stemmer;
+ StopWords sw;
+ //Sentence fragmentation to use..
+ static int OPEN_NLP = 1;
+ static int SIMPLE = 2;
+ static int SENTENCE_FRAG= OPEN_NLP;
+
+ /**
+  * Creates a processor that uses the OpenNLP sentence model in the given file.
+  * If the model cannot be loaded this is only logged and sentModel stays null.
+  *
+  * @param fragModelFile path of the sentence model file
+  */
+ public DefaultDocProcessor(String fragModelFile){
+   InputStream modelIn = null;
+   try {
+     modelIn = new FileInputStream(fragModelFile);
+     sentModel = new SentenceModel(modelIn);
+   }catch(Exception ex){
+     Logger.getAnonymousLogger().info("Error while parsing.. Ignoring the line and marching on.. "+ ex.getMessage());
+   }finally{
+     //The stream is only needed while the model is read; do not leak it.
+     if(modelIn != null){
+       try {
+         modelIn.close();
+       }catch(IOException ignored){
+         //Nothing useful can be done if closing fails.
+       }
+     }
+   }
+ }
+
+ //Splits a text into sentences with java.text.BreakIterator.
+ //str - Document or para
+ //sentences - receives the sentences found in str, added as-is
+ //iidx - if not null, updated to map every stemmed word longer than one character
+ //       to the ids of the sentences it occurs in
+ //processedSent - if not null, receives each sentence after stop word removal and stemming
+ private void getSentences(String str, List<String> sentences, Hashtable<String, List<Integer>> iidx, List<String> processedSent)
+ {
+   //Local helpers; they do not touch the stemmer and sw fields of the class.
+   Stemmer stemmer = new PorterStemmer();
+   StopWords sw = StopWords.getInstance();
+   BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
+   BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US);
+   iterator.setText(str);
+   int start = iterator.first();
+   int sentCnt = 0;
+
+   for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next())
+   {
+     String sentence = str.substring(start,end);
+
+     //Add the sentence as-is; do any processing at the word level..
+     sentences.add(sentence);
+     wrdItr.setText(sentence);
+     StringBuilder procSent = new StringBuilder();
+     int wrdStrt = 0;
+
+     for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE;
+         wrdStrt = wrdEnd, wrdEnd = wrdItr.next())
+     {
+       //Strings are immutable: the result of replaceAll has to be kept,
+       //otherwise the quotes are never removed.
+       String word = sentence.substring(wrdStrt, wrdEnd).replaceAll("\"|'","");
+
+       //Nothing is left of a token that consisted of quotes only.
+       if(word.length()==0) continue;
+
+       //Skip stop words and stem the word..
+       if(sw.isStopWord(word)) continue;
+
+       String stemedWrd = stemmer.stem(word).toString();
+
+       //update iidx by adding the current sentence to the list..
+       if(iidx!=null && stemedWrd.length()>1)
+       {
+         List<Integer> sentList= iidx.get(stemedWrd);
+         if(sentList==null)
+         {
+           sentList = new ArrayList<Integer>();
+           iidx.put(stemedWrd, sentList);
+         }
+         sentList.add(sentCnt);
+       }
+       procSent.append(stemedWrd).append(' ');
+     }
+
+     sentCnt++;
+     if(processedSent!=null )
+       processedSent.add(procSent.toString());
+   }
+ }
+
+
+ /**
+  * Reads a file into a single string. Lines are trimmed, empty lines are
+  * dropped, text matching the entity pattern below is removed and every kept
+  * line is followed by one space. Errors are logged; the text read so far
+  * (possibly empty) is returned in that case.
+  *
+  * @param fileName file to read
+  * @return the content of the file as one line of text
+  */
+ public String docToString(String fileName)
+ {
+   LineNumberReader lnr = null;
+   StringBuilder docBuffer = new StringBuilder();
+
+   try {
+     lnr = new LineNumberReader(new FileReader(fileName));
+     String nextLine;
+
+     while ((nextLine = lnr.readLine()) != null) {
+       String trimmedLine = nextLine.trim();
+       if (!trimmedLine.isEmpty() ) {
+         docBuffer.append(trimmedLine.replaceAll("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;", "")+" ");
+       }
+     }
+   } catch (Exception ex) {
+     Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+   } finally {
+     //The reader is still null when the file could not be opened.
+     if (lnr != null) {
+       try {
+         lnr.close();
+       } catch (IOException ex) {
+         Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+       }
+     }
+   }
+
+   return docBuffer.toString();
+ }
+
+ /**
+  * Reads a file and returns its sentences. Every non-empty line is taken as
+  * one paragraph and split with getSentences. Sentence ids count up from 0
+  * over the whole file, paragraph numbers count up from 0 and the position of
+  * a sentence within its paragraph counts up from 1. Errors are logged; the
+  * sentences read so far are returned in that case.
+  *
+  * @param fileName file to read
+  * @return the sentences of the file in reading order
+  */
+ public List<Sentence> docToSentList(String fileName)
+ {
+   List<Sentence> sentList = new ArrayList<Sentence>();
+   LineNumberReader lnr = null;
+
+   try {
+     lnr = new LineNumberReader(new FileReader(fileName));
+     String nextLine;
+     int paraNo =0;
+     int sentNo = 0;
+     while ((nextLine = lnr.readLine()) != null) {
+       String trimmedLine = nextLine.trim();
+       if (!trimmedLine.isEmpty()) {
+         List<String> sents = new ArrayList<String>();
+         List<String> cleanedSents = new ArrayList<String>();
+         this.getSentences(trimmedLine, sents, null, cleanedSents);
+         int paraPos = 1;
+         for(String sen:sents)
+         {
+           Sentence s = new Sentence();
+           s.setSentId(sentNo++);
+           s.setParagraph(paraNo);
+           s.setStringVal(sen);
+           s.setParaPos(paraPos++);
+           sentList.add(s);
+         }
+         paraNo++;
+       }
+     }
+   } catch (Exception ex) {
+     //The logger is handed the exception, so it is not printed a second time.
+     Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+   } finally {
+     //The reader is still null when the file could not be opened.
+     if (lnr != null) {
+       try {
+         lnr.close();
+       } catch (IOException ex) {
+         Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex);
+       }
+     }
+   }
+
+   return sentList;
+ }
+
+
+ /**
+  * Splits a text into Sentence objects. The OpenNLP sentence detector is used
+  * when a model was loaded and SENTENCE_FRAG is not SIMPLE; each detected
+  * sentence is then logged at info level. Otherwise getSentences is used.
+  * Sentence id and paragraph position both count up from 0 and the paragraph
+  * is always 1.
+  *
+  * @param text text to split
+  * @return the sentences in the order they occur in the text
+  */
+ public List<Sentence> getSentencesFromStr(String text) {
+   List<String> rawSents = new ArrayList<String>();
+
+   //Custom/simple method if specified or open nlp model was not found..
+   boolean useSimple = (sentModel==null || SENTENCE_FRAG==SIMPLE);
+   if(useSimple)
+   {
+     getSentences(text, rawSents, null, new ArrayList<String>());
+   }
+   else
+   {
+     SentenceDetectorME detector = new SentenceDetectorME(sentModel);
+     for(String sentence : detector.sentDetect(text))
+     {
+       Logger.getLogger("DocProcessor").info(sentence);
+       rawSents.add(sentence);
+     }
+   }
+
+   List<Sentence> ret = new ArrayList<Sentence>();
+   for(int sentNo = 0; sentNo < rawSents.size(); sentNo++)
+   {
+     Sentence s = new Sentence();
+     s.setSentId(sentNo);
+     s.setParagraph(1);
+     s.setStringVal(rawSents.get(sentNo));
+     s.setParaPos(sentNo);
+     ret.add(s);
+   }
+   return ret;
+ }
+
+
+ /**
+  * Splits a sentence into words at every single space character.
+  *
+  * @param sent sentence to split
+  * @return the pieces between the spaces
+  */
+ public String[] getWords(String sent)
+ {
+   final String[] words = sent.split(" ");
+   return words;
+ }
+
+ /**
+  * Returns the stemmer of this processor. A PorterStemmer is created the
+  * first time it is asked for.
+  */
+ @Override
+ public Stemmer getStemmer() {
+   //No code in this class assigns the field, so without this check the
+   //method would always return null.
+   if(stemmer == null)
+     stemmer = new PorterStemmer();
+   return stemmer;
+ }
+
+}
\ No newline at end of file
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.util.Hashtable;
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+
+import com.sun.istack.internal.logging.Logger;
+
+/*
+ * Class to load inverse document frequency for words. Resources like google n-gram can be used to populate this.
+ *
+ */
+/**
+ * Word weights read from a file of inverse document frequencies.
+ */
+public class IDFWordWeight implements WordWeight
+{
+ //IDF value per word.
+ Hashtable<String, Double> idf;
+ private static IDFWordWeight instance;
+
+ /**
+  * Creates the weights and loads them from the given file.
+  *
+  * @param fileName file with one "word,IDF" entry per line
+  */
+ public IDFWordWeight(String fileName)
+ {
+   idf = new Hashtable<String,Double>();
+   load(fileName);
+ }
+
+ /**
+  * Returns the shared instance and creates it on the first call. The file
+  * name is only used by that first call; later calls return the existing
+  * instance whatever file name they pass. No synchronization is done here.
+  *
+  * @param fileName file to load if the instance does not exist yet
+  * @return the shared instance
+  */
+ public static IDFWordWeight getInstance(String fileName)
+ {
+   if(instance==null)
+     instance = new IDFWordWeight(fileName);
+   return instance;
+ }
+
+ /**
+  * Returns the IDF of a word.
+  *
+  * @param s word to look up
+  * @return the loaded value, or 1 if there is no value for the word
+  */
+ public double getWordWeight(String s)
+ {
+   if(idf==null) return 1d;
+
+   Double d = idf.get(s);
+   if(d == null)
+   {
+     return 1;
+   }
+   return d.doubleValue();
+ }
+
+ /*
+  * Loads the IDF for words from given file. The file is required to have a simple format -
+  * word, IDF.
+  * Empty lines are skipped. Reading stops at the first line that cannot be
+  * read or parsed; a warning is logged and the entries loaded so far are kept.
+  */
+ public void load(String fileName)
+ {
+   LineNumberReader lnr = null;
+   try{
+     lnr = new LineNumberReader(new FileReader(fileName));
+     String nextLine;
+
+     while ((nextLine = lnr.readLine()) != null)
+     {
+       String trimmedLine = nextLine.trim();
+       if (!trimmedLine.isEmpty())
+       {
+         String[] tokens = trimmedLine.split(",");
+         String word = tokens[0]; double idfVal = Double.parseDouble(tokens[1]);
+         idf.put(word, idfVal);
+       }
+     }
+   }catch(Exception ex){
+     Logger.getLogger(opennlp.summarization.preprocess.IDFWordWeight.class).warning("Could not load the file with IDF");
+   }finally{
+     //Release the file whether or not loading worked.
+     if(lnr != null)
+     {
+       try{
+         lnr.close();
+       }catch(Exception ignored){
+         //Nothing useful can be done if closing fails.
+       }
+     }
+   }
+ }
+}
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+/*
+
+ Porter stemmer in Java. The original paper is in
+
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ no. 3, pp 130-137,
+
+ See also http://www.tartarus.org/~martin/PorterStemmer
+
+ History:
+
+ Release 1
+
+ Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
+ The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
+ is then outside the bounds of b.
+
+ Release 2
+
+ Similarly,
+
+ Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
+ 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
+ b[j] is then outside the bounds of b.
+
+ Release 3
+
+ Considerably revised 4/9/00 in the light of many helpful suggestions
+ from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
+
+ Release 4
+
+*/
+
+import java.io.*;
+
+import opennlp.tools.stemmer.Stemmer;
+
+/**
+ * Stemmer, implementing the Porter Stemming Algorithm
+ *
+ * The Stemmer class transforms a word into its root form. The input
+ * word can be provided a character at a time (by calling add()), or at once
+ * by calling stem(CharSequence).
+ *
+ * The word being stemmed is kept in instance fields, so one instance handles
+ * one word at a time. The vowel and suffix tests compare against lowercase
+ * letters only; input is presumably expected to be lowercased already -
+ * confirm against the callers.
+ */
+
+public class PorterStemmer implements Stemmer
+{
+    /* buffer holding the word being stemmed */
+    private char[] b;
+    private int i,     /* offset into b: number of characters added so far */
+                i_end, /* offset to end of stemmed word */
+                j,     /* end of the stem in front of the suffix last matched by ends() */
+                k;     /* index of the last character of the word as it currently stands */
+    /* unit of size whereby b is increased */
+    private static final int INC = 50;
+
+    public PorterStemmer()
+    {
+        b = new char[INC];
+        i = 0;
+        i_end = 0;
+    }
+
+    /**
+     * Add a character to the word being stemmed. When you are finished
+     * adding characters, you can call stem() to stem the word.
+     */
+    public void add(char ch)
+    {
+        if (i == b.length)
+        {
+            char[] new_b = new char[i + INC];
+            System.arraycopy(b, 0, new_b, 0, i);
+            b = new_b;
+        }
+        b[i++] = ch;
+    }
+
+    /**
+     * Adds wLen characters to the word being stemmed contained in a portion
+     * of a char[] array. This is like repeated calls of add(char ch), but
+     * faster.
+     */
+    public void add(char[] w, int wLen)
+    {
+        if (i + wLen >= b.length)
+        {
+            char[] new_b = new char[i + wLen + INC];
+            System.arraycopy(b, 0, new_b, 0, i);
+            b = new_b;
+        }
+        for (int c = 0; c < wLen; c++) b[i++] = w[c];
+    }
+
+    /**
+     * After a word has been stemmed, it can be retrieved by toString(),
+     * or a reference to the internal buffer can be retrieved by getResultBuffer
+     * and getResultLength (which is generally more efficient.)
+     */
+    @Override
+    public String toString() { return new String(b, 0, i_end); }
+
+    /**
+     * Returns the length of the word resulting from the stemming process.
+     */
+    public int getResultLength() { return i_end; }
+
+    /**
+     * Returns a reference to a character buffer containing the results of
+     * the stemming process. You also need to consult getResultLength()
+     * to determine the length of the result.
+     */
+    public char[] getResultBuffer() { return b; }
+
+    /* cons(i) is true <=> b[i] is a consonant. A 'y' counts as a consonant at
+       the start of the word or directly after a vowel. */
+
+    private final boolean cons(int i)
+    {
+        switch (b[i])
+        {
+            case 'a': case 'e': case 'i': case 'o': case 'u': return false;
+            case 'y': return (i == 0) ? true : !cons(i - 1);
+            default: return true;
+        }
+    }
+
+    /* m() measures the number of consonant sequences between 0 and j. if c is
+       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+       presence,
+
+          <c><v>       gives 0
+          <c>vc<v>     gives 1
+          <c>vcvc<v>   gives 2
+          <c>vcvcvc<v> gives 3
+          ....
+    */
+
+    private final int m()
+    {
+        int n = 0;
+        int i = 0; /* local scan position; deliberately hides the field i */
+        /* skip the optional leading consonant sequence */
+        while (true)
+        {
+            if (i > j) return n;
+            if (!cons(i)) break;
+            i++;
+        }
+        i++;
+        while (true)
+        {
+            /* skip a vowel sequence */
+            while (true)
+            {
+                if (i > j) return n;
+                if (cons(i)) break;
+                i++;
+            }
+            i++;
+            n++;
+            /* skip the consonant sequence that has just been counted */
+            while (true)
+            {
+                if (i > j) return n;
+                if (!cons(i)) break;
+                i++;
+            }
+            i++;
+        }
+    }
+
+    /* vowelinstem() is true <=> 0,...j contains a vowel */
+
+    private final boolean vowelinstem()
+    {
+        for (int i = 0; i <= j; i++) if (!cons(i)) return true;
+        return false;
+    }
+
+    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+    private final boolean doublec(int j)
+    {
+        if (j < 1) return false;
+        if (b[j] != b[j-1]) return false;
+        return cons(j);
+    }
+
+    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+       and also if the second c is not w,x or y. this is used when trying to
+       restore an e at the end of a short word. e.g.
+
+          cav(e), lov(e), hop(e), crim(e), but
+          snow, box, tray.
+
+    */
+
+    private final boolean cvc(int i)
+    {
+        if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
+        int ch = b[i];
+        if (ch == 'w' || ch == 'x' || ch == 'y') return false;
+        return true;
+    }
+
+    /* ends(s) is true <=> 0,...k ends with the string s. On a match j is set
+       to the index of the last character in front of the suffix. */
+
+    private final boolean ends(String s)
+    {
+        int l = s.length();
+        int o = k-l+1;
+        if (o < 0) return false;
+        for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
+        j = k-l;
+        return true;
+    }
+
+    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+       k. */
+
+    private final void setto(String s)
+    {
+        int l = s.length();
+        int o = j+1;
+        for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
+        k = j+l;
+    }
+
+    /* r(s) is used further down: it replaces the matched suffix by s, but
+       only when the stem in front of it has m() > 0. */
+
+    private final void r(String s) { if (m() > 0) setto(s); }
+
+    /* step1() gets rid of plurals and -ed or -ing. e.g.
+
+           caresses  ->  caress
+           ponies    ->  poni
+           ties      ->  ti
+           caress    ->  caress
+           cats      ->  cat
+
+           feed      ->  feed
+           agreed    ->  agree
+           disabled  ->  disable
+
+           matting   ->  mat
+           mating    ->  mate
+           meeting   ->  meet
+           milling   ->  mill
+           messing   ->  mess
+
+           meetings  ->  meet
+
+    */
+
+    private final void step1()
+    {
+        if (b[k] == 's')
+        {
+            if (ends("sses")) k -= 2;
+            else if (ends("ies")) setto("i");
+            else if (b[k-1] != 's') k--;
+        }
+        if (ends("eed")) { if (m() > 0) k--; }
+        else if ((ends("ed") || ends("ing")) && vowelinstem())
+        {
+            k = j;
+            if (ends("at")) setto("ate");
+            else if (ends("bl")) setto("ble");
+            else if (ends("iz")) setto("ize");
+            else if (doublec(k))
+            {
+                k--;
+                int ch = b[k];
+                /* a doubled l, s or z is kept */
+                if (ch == 'l' || ch == 's' || ch == 'z') k++;
+            }
+            else if (m() == 1 && cvc(k)) setto("e");
+        }
+    }
+
+    /* step2() turns terminal y to i when there is another vowel in the stem. */
+
+    private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
+
+    /* step3() maps double suffices to single ones. so -ization ( = -ize plus
+       -ation) maps to -ize etc. note that the string before the suffix must give
+       m() > 0. */
+
+    private final void step3()
+    {
+        if (k == 0) return; /* For Bug 1 */
+        switch (b[k-1])
+        {
+            case 'a': if (ends("ational")) { r("ate"); break; }
+                      if (ends("tional")) { r("tion"); break; }
+                      break;
+            case 'c': if (ends("enci")) { r("ence"); break; }
+                      if (ends("anci")) { r("ance"); break; }
+                      break;
+            case 'e': if (ends("izer")) { r("ize"); break; }
+                      break;
+            case 'l': if (ends("bli")) { r("ble"); break; }
+                      if (ends("alli")) { r("al"); break; }
+                      if (ends("entli")) { r("ent"); break; }
+                      if (ends("eli")) { r("e"); break; }
+                      if (ends("ousli")) { r("ous"); break; }
+                      break;
+            case 'o': if (ends("ization")) { r("ize"); break; }
+                      if (ends("ation")) { r("ate"); break; }
+                      if (ends("ator")) { r("ate"); break; }
+                      break;
+            case 's': if (ends("alism")) { r("al"); break; }
+                      if (ends("iveness")) { r("ive"); break; }
+                      if (ends("fulness")) { r("ful"); break; }
+                      if (ends("ousness")) { r("ous"); break; }
+                      break;
+            case 't': if (ends("aliti")) { r("al"); break; }
+                      if (ends("iviti")) { r("ive"); break; }
+                      if (ends("biliti")) { r("ble"); break; }
+                      break;
+            case 'g': if (ends("logi")) { r("log"); break; }
+        }
+    }
+
+    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
+
+    private final void step4()
+    {
+        switch (b[k])
+        {
+            case 'e': if (ends("icate")) { r("ic"); break; }
+                      if (ends("ative")) { r(""); break; }
+                      if (ends("alize")) { r("al"); break; }
+                      break;
+            case 'i': if (ends("iciti")) { r("ic"); break; }
+                      break;
+            case 'l': if (ends("ical")) { r("ic"); break; }
+                      if (ends("ful")) { r(""); break; }
+                      break;
+            case 's': if (ends("ness")) { r(""); break; }
+                      break;
+        }
+    }
+
+    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+
+    private final void step5()
+    {
+        if (k == 0) return; /* for Bug 1 */
+        /* a matching suffix breaks out of the switch to the m() test below;
+           no match returns straight away */
+        switch (b[k-1])
+        {
+            case 'a': if (ends("al")) break; return;
+            case 'c': if (ends("ance")) break;
+                      if (ends("ence")) break; return;
+            case 'e': if (ends("er")) break; return;
+            case 'i': if (ends("ic")) break; return;
+            case 'l': if (ends("able")) break;
+                      if (ends("ible")) break; return;
+            case 'n': if (ends("ant")) break;
+                      if (ends("ement")) break;
+                      if (ends("ment")) break;
+                      /* element etc. not stripped before the m */
+                      if (ends("ent")) break; return;
+            case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
+                      /* j >= 0 fixes Bug 2 */
+                      if (ends("ou")) break; return;
+                      /* takes care of -ous */
+            case 's': if (ends("ism")) break; return;
+            case 't': if (ends("ate")) break;
+                      if (ends("iti")) break; return;
+            case 'u': if (ends("ous")) break; return;
+            case 'v': if (ends("ive")) break; return;
+            case 'z': if (ends("ize")) break; return;
+            default: return;
+        }
+        if (m() > 1) k = j;
+    }
+
+    /* step6() removes a final -e if m() > 1, or if m() == 1 and the stem does
+       not end consonant - vowel - consonant; it also reduces a final -ll to -l
+       if m() > 1. */
+
+    private final void step6()
+    {
+        j = k;
+        if (b[k] == 'e')
+        {
+            int a = m();
+            if (a > 1 || a == 1 && !cvc(k-1)) k--;
+        }
+        if (b[k] == 'l' && doublec(k) && m() > 1) k--;
+    }
+
+    /**
+     * Stem the word placed into the Stemmer buffer through calls to add().
+     * You can retrieve the result with
+     * getResultLength()/getResultBuffer() or toString().
+     *
+     * Words of one or two characters are left unchanged. The input offset is
+     * reset afterwards, so the next call to add() starts a new word.
+     */
+    public void stem()
+    {
+        k = i - 1;
+        if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
+        i_end = k+1; i = 0;
+    }
+
+    /**
+     * Stems the given word in one call.
+     *
+     * Characters left over from earlier add() calls that were never followed
+     * by stem() are discarded first, so they can neither end up in the result
+     * nor push the write offset past the end of the buffer.
+     *
+     * @param word the word to stem
+     * @return the stemmed word
+     */
+    public CharSequence stem(CharSequence word)
+    {
+        char[] arr = word.toString().toCharArray();
+        i = 0; /* start a fresh word whatever state the buffer was left in */
+        add(arr, arr.length);
+        stem();
+        return this.toString();
+    }
+}
+
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+import java.util.Hashtable;
+
+/**
+ * Fixed list of English stop words: the digits 0-9 and common function
+ * words, pronouns and contractions, all in lowercase.
+ *
+ * @author rtww
+ */
+public class StopWords {
+    /* The stop words. Each entry must be free of surrounding whitespace,
+       otherwise the word can never match a lookup. */
+    private static final String[] WORDS = {
+        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+        "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
+        "any", "are", "aren't", "as", "at",
+        "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
+        "can't", "cannot", "could", "couldn't",
+        "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
+        "each", "few", "for", "from", "further",
+        "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
+        "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
+        "him", "himself", "his", "how", "how's",
+        "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't",
+        "it", "it's", "its", "itself",
+        "let's", "me", "more", "most", "mustn't", "my", "myself",
+        "no", "nor", "not",
+        "of", "off", "on", "once", "only", "or", "other", "ought",
+        "our", "ours", "ourselves", "out", "over", "own",
+        "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't",
+        "so", "some", "say", "said", "such",
+        "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
+        "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
+        "they've", "this", "those", "through", "to", "too",
+        "under", "until", "up", "very",
+        "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't",
+        "what", "what's", "when", "when's", "where", "where's", "which", "while",
+        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't",
+        "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
+        "yourselves"
+    };
+
+    private Hashtable<String, Boolean> h;
+    private static StopWords instance;
+
+    /**
+     * Builds the lookup table from the fixed word list.
+     */
+    public StopWords()
+    {
+        h = new Hashtable<String, Boolean>();
+        for (int n = 0; n < WORDS.length; n++)
+        {
+            h.put(WORDS[n], true);
+        }
+    }
+
+    /**
+     * Tells whether s is a stop word. The comparison is exact, so s has to be
+     * lowercase to match the list. Every string of length one counts as a
+     * stop word, whether it is in the list or not.
+     *
+     * @param s the word to test; must not be null
+     * @return true if s is in the list or is a single character
+     */
+    public boolean isStopWord(String s)
+    {
+        boolean ret = h.containsKey(s);
+        if(s.length()==1) ret = true;
+        return ret;
+    }
+
+    /**
+     * Returns the shared instance, creating it on the first call.
+     */
+    public static StopWords getInstance()
+    {
+        if(instance == null)
+            instance = new StopWords();
+        return instance;
+    }
+}
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java
------------------------------------------------------------------------------
svn:executable = *
Added: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java (added)
+++ opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java Wed Feb 11 08:53:14 2015
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package opennlp.summarization.preprocess;
+
+/**
+ * Assigns a numeric weight to a word.
+ */
+public interface WordWeight
+{
+    /**
+     * Returns the weight of the given word.
+     *
+     * @param s the word to weigh
+     * @return the weight of s
+     */
+    double getWordWeight(String s);
+}
Propchange: opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java
------------------------------------------------------------------------------
svn:executable = *