You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC
svn commit: r1555944 [8/11] - in /opennlp/sandbox/opennlp-similarity/src:
main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/
main/java/opennlp/tools/apps/contentgen/multithreaded/
main/java/opennlp/tools/apps/relevanceVocabs/ main/j...
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java Mon Jan 6 17:48:30 2014
@@ -19,15 +19,24 @@ package opennlp.tools.similarity.apps;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import java.util.logging.Logger;
+import opennlp.tools.parse_thicket.Triple;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunk;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunkComparable;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.StringUtils;
@@ -43,575 +52,952 @@ import org.apache.commons.lang.StringUti
*/
public class RelatedSentenceFinder {
- private static Logger LOG = Logger
- .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
- PageFetcher pFetcher = new PageFetcher();
-
- private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
- private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
-
- static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
-
- // used to indicate that a sentence is an opinion, so more appropriate
- static List<String> MENTAL_VERBS = new ArrayList<String>(
- Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
- "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
- "check", "confirm", "convince", "deny", "disagree", "explain",
- "ignore", "inform", "remind", "request", "suggest", "suppose",
- "think", "threaten", "try", "understand" }));
-
- private static final int MAX_FRAGMENT_SENTS = 10;
-
- public RelatedSentenceFinder() {
-
- }
-
- public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
- List<String> sents) throws Exception {
- BingWebQueryRunner yrunner = new BingWebQueryRunner();
- List<HitBase> searchResult = yrunner.runSearch(word, 100);
- return searchResult;
- }
-
- public List<HitBase> findRelatedOpinionsForSentence(String sentence,
- List<String> sents) throws Exception {
- List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
- System.out.println(" \n\n=== Sentence = " + sentence);
- List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
-
- BingWebQueryRunner yrunner = new BingWebQueryRunner();
- for (String query : nounPhraseQueries) {
- System.out.println("\nquery = " + query);
- // query += " "+join(MENTAL_VERBS, " OR ") ;
- List<HitBase> searchResult = yrunner.runSearch(query, 100);
- if (searchResult != null) {
- for (HitBase item : searchResult) { // got some text from .html
- if (item.getAbstractText() != null
- && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
- // pdf
- opinionSentencesToAdd
- .add(augmentWithMinedSentencesAndVerifyRelevance(item,
- sentence, sents));
- }
- }
- }
- }
-
- opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
- return opinionSentencesToAdd;
- }
-
- /**
- * Main content generation function which takes a seed as a person, rock
- * group, or other entity name and produce a list of text fragments by web
- * mining for <br>
- *
- * @param String
- * entity name
- * @return List<HitBase> of text fragment structures which contain approved
- * (in terms of relevance) mined sentences, as well as original search
- * results objects such as doc titles, abstracts, and urls.
- */
-
- public List<HitBase> generateContentAbout(String sentence) throws Exception {
- List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
- System.out.println(" \n=== Entity to write about = " + sentence);
- List<String> nounPhraseQueries = new ArrayList<String>();
-
- // nounPhraseQueries.add(sentence + frequentPerformingVerbs);
-
- BingWebQueryRunner yrunner = new BingWebQueryRunner();
- for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
- List<HitBase> searchResult = yrunner.runSearch(sentence + " "
- + verbAddition, 100);
- if (searchResult != null) {
- for (HitBase item : searchResult) { // got some text from .html
- if (item.getAbstractText() != null
- && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
- opinionSentencesToAdd
- .add(augmentWithMinedSentencesAndVerifyRelevance(item,
- sentence, null));
- }
- }
- }
- }
-
- opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
- return opinionSentencesToAdd;
- }
-
- /**
- * Takes a sentence and extracts noun phrases and entity names to from search
- * queries for finding relevant sentences on the web, which are then subject
- * to relevance assessment by Similarity. Search queries should not be too
- * general (irrelevant search results) or too specific (too few search
- * results)
- *
- * @param String
- * input sentence to form queries
- * @return List<String> of search expressions
- */
- public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
- ParseTreeChunk matcher = new ParseTreeChunk();
- ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
- .getInstance();
- List<List<ParseTreeChunk>> sent1GrpLst = null;
-
- List<ParseTreeChunk> nPhrases = pos
- .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
- List<String> queryArrayStr = new ArrayList<String>();
- for (ParseTreeChunk ch : nPhrases) {
- String query = "";
- int size = ch.getLemmas().size();
-
- for (int i = 0; i < size; i++) {
- if (ch.getPOSs().get(i).startsWith("N")
- || ch.getPOSs().get(i).startsWith("J")) {
- query += ch.getLemmas().get(i) + " ";
- }
- }
- query = query.trim();
- int len = query.split(" ").length;
- if (len < 2 || len > 5)
- continue;
- if (len < 4) { // every word should start with capital
- String[] qs = query.split(" ");
- boolean bAccept = true;
- for (String w : qs) {
- if (w.toLowerCase().equals(w)) // idf only two words then
- // has to be person name,
- // title or geo location
- bAccept = false;
- }
- if (!bAccept)
- continue;
- }
-
- query = query.trim().replace(" ", " +");
- query = " +" + query;
-
- queryArrayStr.add(query);
-
- }
- if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
- // keywords
- for (ParseTreeChunk ch : nPhrases) {
- String query = "";
- int size = ch.getLemmas().size();
-
- for (int i = 0; i < size; i++) {
- if (ch.getPOSs().get(i).startsWith("N")
- || ch.getPOSs().get(i).startsWith("J")) {
- query += ch.getLemmas().get(i) + " ";
- }
- }
- query = query.trim();
- int len = query.split(" ").length;
- if (len < 2)
- continue;
-
- query = query.trim().replace(" ", " +");
- query = " +" + query;
-
- queryArrayStr.add(query);
-
- }
- }
-
- queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
- queryArrayStr.add(sentence);
-
- return queryArrayStr;
-
- }
-
- /**
- * remove dupes from queries to easy cleaning dupes and repetitive search
- * afterwards
- *
- * @param List
- * <String> of sentences (search queries, or search results
- * abstracts, or titles
- * @return List<String> of sentences where dupes are removed
- */
- public static List<String> removeDuplicatesFromQueries(List<String> hits) {
- StringDistanceMeasurer meas = new StringDistanceMeasurer();
- double dupeThresh = 0.8; // if more similar, then considered dupes was
- // 0.7
- List<Integer> idsToRemove = new ArrayList<Integer>();
- List<String> hitsDedup = new ArrayList<String>();
- try {
- for (int i = 0; i < hits.size(); i++)
- for (int j = i + 1; j < hits.size(); j++) {
- String title1 = hits.get(i);
- String title2 = hits.get(j);
- if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
- continue;
- if (meas.measureStringDistance(title1, title2) > dupeThresh) {
- idsToRemove.add(j); // dupes found, later list member to
- // be deleted
-
- }
- }
-
- for (int i = 0; i < hits.size(); i++)
- if (!idsToRemove.contains(i))
- hitsDedup.add(hits.get(i));
-
- if (hitsDedup.size() < hits.size()) {
- LOG.info("Removed duplicates from formed query, including "
- + hits.get(idsToRemove.get(0)));
- }
-
- } catch (Exception e) {
- LOG.severe("Problem removing duplicates from query list");
- }
-
- return hitsDedup;
-
- }
-
- /**
- * remove dupes from search results
- *
- * @param List
- * <HitBase> of search results objects
- * @return List<String> of search results objects where dupes are removed
- */
- public static List<HitBase> removeDuplicatesFromResultantHits(
- List<HitBase> hits) {
- StringDistanceMeasurer meas = new StringDistanceMeasurer();
- double dupeThresh = // 0.8; // if more similar, then considered dupes was
- 0.7;
- List<Integer> idsToRemove = new ArrayList<Integer>();
- List<HitBase> hitsDedup = new ArrayList<HitBase>();
- try {
- for (int i = 0; i < hits.size(); i++)
- for (int j = i + 1; j < hits.size(); j++) {
- HitBase hit2 = hits.get(j);
- List<Fragment> fragmList1 = hits.get(i).getFragments();
- List<Fragment> fragmList2 = hits.get(j).getFragments();
- List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
- for (Fragment f1 : fragmList1)
- for (Fragment f2 : fragmList2) {
- String sf1 = f1.getResultText();
- String sf2 = f2.getResultText();
- if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))
- continue;
- if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
- fragmList2Results.remove(f2);
- LOG.info("Removed duplicates from formed fragments list: "
- + sf2);
- }
- }
-
- hit2.setFragments(fragmList2Results);
- hits.set(j, hit2);
- }
- } catch (Exception e) {
- LOG.severe("Problem removing duplicates from list of fragment");
- }
- return hits;
- }
-
- /**
- * Takes single search result for an entity which is the subject of the essay
- * to be written and forms essey sentences from the title, abstract, and
- * possibly original page
- *
- * @param HitBase
- * item : search result
- * @param originalSentence
- * : seed for the essay to be written
- * @param sentsAll
- * : list<String> of other sentences in the seed if it is
- * multi-sentence
- * @return search result
- */
-
- public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
- String originalSentence, List<String> sentsAll) {
- if (sentsAll == null)
- sentsAll = new ArrayList<String>();
- // put orig sentence in structure
- List<String> origs = new ArrayList<String>();
- origs.add(originalSentence);
- item.setOriginalSentences(origs);
- String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
- .replace(" ", " ").replace(" ", " ");
- // generation results for this sentence
- List<Fragment> result = new ArrayList<Fragment>();
- // form plain text from snippet
- String snapshot = item.getAbstractText().replace("<b>", " ")
- .replace("</b>", " ").replace(" ", " ").replace(" ", " ");
-
- ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
- .getInstance();
- // fix a template expression which can be substituted by original if
- // relevant
- String snapshotMarked = snapshot.replace("...",
- " _should_find_orig_ . _should_find_orig_");
- String[] fragments = sm.splitSentences(snapshotMarked);
- List<String> allFragms = new ArrayList<String>();
- allFragms.addAll(Arrays.asList(fragments));
-
- String[] sents = null;
- String downloadedPage;
- try {
- if (snapshotMarked.length() != snapshot.length()) {
- downloadedPage = pFetcher.fetchPage(item.getUrl());
- if (downloadedPage != null && downloadedPage.length() > 100) {
- item.setPageContent(downloadedPage);
- String pageContent = Utils.fullStripHTML(item.getPageContent());
- pageContent = GeneratedSentenceProcessor
- .normalizeForSentenceSplitting(pageContent);
- pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
- // ". ")
- .replace("..", ".").replace(". . .", " ").trim(); // sometimes
- // html breaks
- // are converted
- // into ' ' (two
- // spaces), so
- // we need to
- // put '.'
- sents = sm.splitSentences(snapshotMarked);
- ;
- sents = cleanListOfSents(sents);
- }
- }
- } catch (Exception e) {
- // TODO Auto-generated catch block
- // e.printStackTrace();
- System.err
- .println("Problem downloading the page and splitting into sentences");
- return item;
- }
-
- for (String fragment : allFragms) {
- String followSent = null;
- if (fragment.length() < 50)
- continue;
- String pageSentence = "";
- // try to find original sentence from webpage
- if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
- && sents.length > 0)
- try {
- String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
- fragment.replace("_should_find_orig_", ""), sents);
- pageSentence = mainAndFollowSent[0];
- followSent = mainAndFollowSent[1];
-
- } catch (Exception e) {
-
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- else
- // or get original snippet
- pageSentence = fragment;
- if (pageSentence != null)
- pageSentence.replace("_should_find_orig_", "");
-
- // resultant sentence SHOULD NOT be longer than twice the size of
- // snippet fragment
- if (pageSentence != null
- && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
- // 2.0,
- // but
- // since
- // snippet
- // sentences
- // are
- // rather
- // short
- // now...
- try { // get score from syntactic match between sentence in
- // original text and mined sentence
- double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
-
- SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
- + " " + title, originalSentence);
- List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
- if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
- System.out
- .println("Rejected Sentence : No verb OR Yes imperative verb :"
- + pageSentence);
- continue;
- }
-
- syntScore = parseTreeChunkListScorer
- .getParseTreeChunkListScore(match);
- System.out.println(parseTreeChunk.listToString(match) + " "
- + syntScore + "\n pre-processed sent = '" + pageSentence);
-
- if (syntScore < 1.5) { // trying other sents
- for (String currSent : sentsAll) {
- if (currSent.startsWith(originalSentence))
- continue;
- match = sm.assessRelevance(currSent, pageSentence)
- .getMatchResult();
- double syntScoreCurr = parseTreeChunkListScorer
- .getParseTreeChunkListScore(match);
- if (syntScoreCurr > syntScore) {
- syntScore = syntScoreCurr;
- }
- }
- if (syntScore > 1.5) {
- System.out.println("Got match with other sent: "
- + parseTreeChunk.listToString(match) + " " + syntScore);
- }
- }
-
- measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
- originalSentence, pageSentence);
-
- // now possibly increase score by finding mental verbs
- // indicating opinions
- for (String s : MENTAL_VERBS) {
- if (pageSentence.indexOf(s) > -1) {
- mentalScore += 0.3;
- break;
- }
- }
-
- if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
- && measScore < 0.8 && pageSentence.length() > 40) // >70
- {
- String pageSentenceProc = GeneratedSentenceProcessor
- .acceptableMinedSentence(pageSentence);
- if (pageSentenceProc != null) {
- pageSentenceProc = GeneratedSentenceProcessor
- .processSentence(pageSentenceProc);
- if (followSent != null) {
- pageSentenceProc += " "
- + GeneratedSentenceProcessor.processSentence(followSent);
- }
-
- pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
- Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
- + mentalScore + (double) pageSentenceProc.length()
- / (double) 50);
- f.setSourceURL(item.getUrl());
- f.fragment = fragment;
- result.add(f);
- System.out.println("Accepted sentence: " + pageSentenceProc
- + "| with title= " + title);
- System.out.println("For fragment = " + fragment);
- } else
- System.out
- .println("Rejected sentence due to wrong area at webpage: "
- + pageSentence);
- } else
- System.out.println("Rejected sentence due to low score: "
- + pageSentence);
- // }
- } catch (Throwable t) {
- t.printStackTrace();
- }
- }
- }
- item.setFragments(result);
- return item;
- }
-
- public static String[] cleanListOfSents(String[] sents) {
- List<String> sentsClean = new ArrayList<String>();
- for (String s : sents) {
- if (s == null || s.trim().length() < 30 || s.length() < 20)
- continue;
- sentsClean.add(s);
- }
- return (String[]) sentsClean.toArray(new String[0]);
- }
-
- // given a fragment from snippet, finds an original sentence at a webpage by
- // optimizing alignmemt score
- public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
- String fragment, String[] sents) {
- if (fragment.trim().length() < 15)
- return null;
-
- StringDistanceMeasurer meas = new StringDistanceMeasurer();
- Double dist = 0.0;
- String result = null, followSent = null;
- for (int i = 0; i < sents.length; i++) {
- String s = sents[i];
- if (s == null || s.length() < 30)
- continue;
- Double distCurr = meas.measureStringDistance(s, fragment);
- if (distCurr > dist && distCurr > 0.4) {
- result = s;
- dist = distCurr;
- if (i < sents.length - 1 && sents[i + 1].length() > 60) {
- followSent = sents[i + 1];
- }
-
- }
- }
- return new String[] { result, followSent };
- }
-
- // given a fragment from snippet, finds an original sentence at a webpage by
- // optimizing alignmemt score
- public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
- String fragment, String[] sents) {
- if (fragment.trim().length() < 15)
- return null;
- int bestSentIndex = -1;
- StringDistanceMeasurer meas = new StringDistanceMeasurer();
- Double distBest = 10.0; // + sup
- String result = null, followSent = null;
- for (int i = 0; i < sents.length; i++) {
- String s = sents[i];
- if (s == null || s.length() < 30)
- continue;
- Double distCurr = meas.measureStringDistance(s, fragment);
- if (distCurr > distBest) {
- distBest = distCurr;
- bestSentIndex = i;
- }
-
- }
- if (distBest > 0.4) {
- result = sents[bestSentIndex];
-
- if (bestSentIndex < sents.length - 1
- && sents[bestSentIndex + 1].length() > 60) {
- followSent = sents[bestSentIndex + 1];
- }
-
- }
-
- return new String[] { result, followSent };
- }
-
- public static void main(String[] args) {
- RelatedSentenceFinder f = new RelatedSentenceFinder();
-
- List<HitBase> hits = null;
- try {
- // uncomment the sentence you would like to serve as a seed sentence for
- // content generation for an event description
-
- // uncomment the sentence you would like to serve as a seed sentence for
- // content generation for an event description
- hits = f.generateContentAbout("Albert Einstein"
- // "Britney Spears - The Femme Fatale Tour"
- // "Rush Time Machine",
- // "Blue Man Group" ,
- // "Belly Dance With Zaharah",
- // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
- // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
- );
- System.out.println(HitBase.toString(hits));
- System.out.println(HitBase.toResultantString(hits));
- // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
- // hits.get(0).getTitle(), hits);
-
- } catch (Exception e) {
- e.printStackTrace();
- }
+ private static Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
+ PageFetcher pFetcher = new PageFetcher();
+ ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+ .getInstance();
+ protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+ protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+ protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
+ protected BingQueryRunner yrunner = new BingQueryRunner();
+ protected int MAX_STEPS = 1;
+ protected int MAX_SEARCH_RESULTS = 1;
+ protected float RELEVANCE_THRESHOLD = 1.1f;
+ protected Set<String> visitedURLs = new HashSet();
+
+ // used to indicate that a sentence is an opinion, so more appropriate
+ static List<String> MENTAL_VERBS = new ArrayList<String>(
+ Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
+ "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
+ "check", "confirm", "convince", "deny", "disagree", "explain",
+ "ignore", "inform", "remind", "request", "suggest", "suppose",
+ "think", "threaten", "try", "understand" }));
+
+ private static final int MAX_FRAGMENT_SENTS = 10;
+
+ public RelatedSentenceFinder(int ms, int msr, float thresh, String key) {
+ this.MAX_STEPS = ms;
+ this.MAX_SEARCH_RESULTS = msr;
+ this.RELEVANCE_THRESHOLD=thresh;
+ yrunner.setKey(key);
+ }
+
+ public RelatedSentenceFinder() {
+ // TODO Auto-generated constructor stub
+ }
+ public void setLang(String lang) {
+ yrunner.setLang(lang);
+
+ }
+ public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
+ List<String> sents) throws Exception {
+
+ List<HitBase> searchResult = yrunner.runSearch(word, 100);
+ return searchResult;
+ }
+
+ public List<HitBase> findRelatedOpinionsForSentence(String sentence,
+ List<String> sents) throws Exception {
+ List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+ System.out.println(" \n\n=== Sentence = " + sentence);
+ List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
+
+ BingQueryRunner yrunner = new BingQueryRunner();
+ for (String query : nounPhraseQueries) {
+ System.out.println("\nquery = " + query);
+ // query += " "+join(MENTAL_VERBS, " OR ") ;
+ List<HitBase> searchResult = yrunner.runSearch(query, 100);
+ if (searchResult != null) {
+ for (HitBase item : searchResult) { // got some text from .html
+ if (item.getAbstractText() != null
+ && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
+ // pdf
+ opinionSentencesToAdd
+ .add(augmentWithMinedSentencesAndVerifyRelevance(item,
+ sentence, sents));
+
+ }
+ }
+ }
+ }
+
+ opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+ return opinionSentencesToAdd;
+ }
+
+ /**
+ * Main content generation function which takes a seed as a person, rock
+ * group, or other entity name and produce a list of text fragments by web
+ * mining for <br>
+ *
+ * @param String
+ * entity name
+ * @return List<HitBase> of text fragment structures which contain approved
+ * (in terms of relevance) mined sentences, as well as original search
+ * results objects such as doc titles, abstracts, and urls.
+ */
+
+ public List<HitBase> generateContentAbout(String sentence) throws Exception {
+ List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+ System.out.println(" \n=== Entity to write about = " + sentence);
+ List<String> nounPhraseQueries = new ArrayList<String>();
+
+ String[] extraKeywords = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity(sentence);
+ System.out.println("Found extraKeywords "+ Arrays.asList(extraKeywords));
+ if (extraKeywords==null || extraKeywords.length<1)
+ extraKeywords = StoryDiscourseNavigator.frequentPerformingVerbs;
+
+ int stepCount=0;
+ for (String verbAddition : extraKeywords) {
+ List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+ + verbAddition, MAX_SEARCH_RESULTS); //100);
+ if (MAX_SEARCH_RESULTS<searchResult.size())
+ searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+ //TODO for shorter run
+ if (searchResult != null) {
+ for (HitBase item : searchResult) { // got some text from .html
+ if (item.getAbstractText() != null
+ && !(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) { // exclude pdf
+ opinionSentencesToAdd
+ .add(//augmentWithMinedSentencesAndVerifyRelevance(item,
+ // sentence, null));
+ buildParagraphOfGeneratedText(item, sentence, null));
+ visitedURLs.add(item.getUrl());
+ }
+ }
+ }
+ stepCount++;
+ if (stepCount>MAX_STEPS)
+ break;
+ }
+
+ opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+ return opinionSentencesToAdd;
+ }
+
+ /**
+ * Takes a sentence and extracts noun phrases and entity names to from search
+ * queries for finding relevant sentences on the web, which are then subject
+ * to relevance assessment by Similarity. Search queries should not be too
+ * general (irrelevant search results) or too specific (too few search
+ * results)
+ *
+ * @param String
+ * input sentence to form queries
+ * @return List<String> of search expressions
+ */
+ public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+ ParseTreeChunk matcher = new ParseTreeChunk();
+ ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
+ .getInstance();
+ List<List<ParseTreeChunk>> sent1GrpLst = null;
+
+ List<ParseTreeChunk> nPhrases = pos
+ .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
+ List<String> queryArrayStr = new ArrayList<String>();
+ for (ParseTreeChunk ch : nPhrases) {
+ String query = "";
+ int size = ch.getLemmas().size();
+
+ for (int i = 0; i < size; i++) {
+ if (ch.getPOSs().get(i).startsWith("N")
+ || ch.getPOSs().get(i).startsWith("J")) {
+ query += ch.getLemmas().get(i) + " ";
+ }
+ }
+ query = query.trim();
+ int len = query.split(" ").length;
+ if (len < 2 || len > 5)
+ continue;
+ if (len < 4) { // every word should start with capital
+ String[] qs = query.split(" ");
+ boolean bAccept = true;
+ for (String w : qs) {
+ if (w.toLowerCase().equals(w)) // if only two words then
+ // has to be person name,
+ // title or geo location
+ bAccept = false;
+ }
+ if (!bAccept)
+ continue;
+ }
+
+ query = query.trim().replace(" ", " +");
+ query = " +" + query;
+
+ queryArrayStr.add(query);
+
+ }
+ if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
+ // keywords
+ for (ParseTreeChunk ch : nPhrases) {
+ String query = "";
+ int size = ch.getLemmas().size();
+
+ for (int i = 0; i < size; i++) {
+ if (ch.getPOSs().get(i).startsWith("N")
+ || ch.getPOSs().get(i).startsWith("J")) {
+ query += ch.getLemmas().get(i) + " ";
+ }
+ }
+ query = query.trim();
+ int len = query.split(" ").length;
+ if (len < 2)
+ continue;
+
+ query = query.trim().replace(" ", " +");
+ query = " +" + query;
+
+ queryArrayStr.add(query);
+
+ }
+ }
+
+ queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
+ queryArrayStr.add(sentence);
+
+ return queryArrayStr;
+
+ }
+
+ /**
+ * remove dupes from queries to ease cleaning of dupes and repetitive searches
+ * afterwards
+ *
+ * @param List
+ * <String> of sentences (search queries, or search results
+ * abstracts, or titles
+ * @return List<String> of sentences where dupes are removed
+ */
+ public static List<String> removeDuplicatesFromQueries(List<String> hits) {
+ StringDistanceMeasurer meas = new StringDistanceMeasurer();
+ double dupeThresh = 0.8; // if more similar, then considered dupes was
+ // 0.7
+ List<Integer> idsToRemove = new ArrayList<Integer>();
+ List<String> hitsDedup = new ArrayList<String>();
+ try {
+ for (int i = 0; i < hits.size(); i++)
+ for (int j = i + 1; j < hits.size(); j++) {
+ String title1 = hits.get(i);
+ String title2 = hits.get(j);
+ if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
+ continue;
+ if (meas.measureStringDistance(title1, title2) > dupeThresh) {
+ idsToRemove.add(j); // dupes found, later list member to
+ // be deleted
+
+ }
+ }
+
+ for (int i = 0; i < hits.size(); i++)
+ if (!idsToRemove.contains(i))
+ hitsDedup.add(hits.get(i));
+
+ if (hitsDedup.size() < hits.size()) {
+ LOG.info("Removed duplicates from formed query, including "
+ + hits.get(idsToRemove.get(0)));
+ }
+
+ } catch (Exception e) {
+ LOG.severe("Problem removing duplicates from query list");
+ }
+
+ return hitsDedup;
+
+ }
+
+ /**
+ * remove dupes from search results
+ *
+ * @param List
+ * <HitBase> of search results objects
+ * @return List<String> of search results objects where dupes are removed
+ */
+ public static List<HitBase> removeDuplicatesFromResultantHits(
+ List<HitBase> hits) {
+ StringDistanceMeasurer meas = new StringDistanceMeasurer();
+ double dupeThresh = // 0.8; // if more similar, then considered dupes was
+ 0.7;
+ List<Integer> idsToRemove = new ArrayList<Integer>();
+ List<HitBase> hitsDedup = new ArrayList<HitBase>();
+ try {
+ for (int i = 0; i < hits.size(); i++)
+ for (int j = i + 1; j < hits.size(); j++) {
+ HitBase hit2 = hits.get(j);
+ List<Fragment> fragmList1 = hits.get(i).getFragments();
+ List<Fragment> fragmList2 = hits.get(j).getFragments();
+ List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
+ for (Fragment f1 : fragmList1)
+ for (Fragment f2 : fragmList2) {
+ String sf1 = f1.getResultText();
+ String sf2 = f2.getResultText();
+ if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))
+ continue;
+ if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
+ fragmList2Results.remove(f2);
+ LOG.info("Removed duplicates from formed fragments list: "
+ + sf2);
+ }
+ }
+
+ hit2.setFragments(fragmList2Results);
+ hits.set(j, hit2);
+ }
+ } catch (Exception e) {
+ LOG.severe("Problem removing duplicates from list of fragment");
+ }
+ return hits;
+ }
+
+ /**
+ * Takes single search result for an entity which is the subject of the essay
+ * to be written and forms essay sentences from the title, abstract, and
+ * possibly original page
+ *
+ * @param HitBase
+ * item : search result
+ * @param originalSentence
+ * : seed for the essay to be written
+ * @param sentsAll
+ * : list<String> of other sentences in the seed if it is
+ * multi-sentence
+ * @return search result
+ */
+
+ public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
+ String originalSentence, List<String> sentsAll) {
+ if (sentsAll == null)
+ sentsAll = new ArrayList<String>();
+ // put orig sentence in structure
+ List<String> origs = new ArrayList<String>();
+ origs.add(originalSentence);
+ item.setOriginalSentences(origs);
+ String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+ .replace(" ", " ").replace(" ", " ");
+ // generation results for this sentence
+ List<Fragment> result = new ArrayList<Fragment>();
+ // form plain text from snippet
+ String snapshot = item.getAbstractText().replace("<b>", " ")
+ .replace("</b>", " ").replace(" ", " ").replace(" ", " ");
+
+
+ // fix a template expression which can be substituted by original if
+ // relevant
+ String snapshotMarked = snapshot.replace("...",
+ " _should_find_orig_ . _should_find_orig_");
+ String[] fragments = sm.splitSentences(snapshotMarked);
+ List<String> allFragms = new ArrayList<String>();
+ allFragms.addAll(Arrays.asList(fragments));
+
+ String[] sents = null;
+ String downloadedPage = null;
+ try {
+ if (snapshotMarked.length() != snapshot.length()) {
+ downloadedPage = pFetcher.fetchPage(item.getUrl());
+ if (downloadedPage != null && downloadedPage.length() > 100) {
+ item.setPageContent(downloadedPage);
+ String pageContent = Utils.fullStripHTML(item.getPageContent());
+ pageContent = GeneratedSentenceProcessor
+ .normalizeForSentenceSplitting(pageContent);
+ pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+ //pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
+ // // ". ")
+ // .replace("..", ".").replace(". . .", " ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so
+ // we need to put '.'
+ sents = sm.splitSentences(pageContent);
+
+ sents = ContentGeneratorSupport.cleanListOfSents(sents);
+ }
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ // e.printStackTrace();
+ System.err
+ .println("Problem downloading the page and splitting into sentences");
+ return item;
+ }
+
+ for (String fragment : allFragms) {
+ String followSent = "";
+ if (fragment.length() < 50)
+ continue;
+ String pageSentence = "";
+ // try to find original sentence from webpage
+ if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+ && sents.length > 0){
+ try {
+ // first try sorted sentences from page by length approach
+ String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+ String[] mainAndFollowSent = null;
+
+ try {
+ mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+ fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ // if the above gives null than try to match all sentences from snippet fragment
+ if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
+ mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+ fragment.replace("_should_find_orig_", ""), sents);
+ }
+
+ if (mainAndFollowSent!=null || mainAndFollowSent[0]!=null){
+ pageSentence = mainAndFollowSent[0];
+ for(int i = 1; i< mainAndFollowSent.length; i++)
+ if (mainAndFollowSent[i]!=null)
+ followSent+= mainAndFollowSent[i];
+ }
+
+ } catch (Exception e) {
+
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ else
+ // or get original snippet
+ pageSentence = fragment;
+ if (pageSentence != null)
+ pageSentence.replace("_should_find_orig_", "");
+
+ // resultant sentence SHOULD NOT be longer than for times the size of
+ // snippet fragment
+ if (pageSentence != null && pageSentence.length()>50 )
+ // && (float) pageSentence.length() / (float) fragment.length() < 4.0)
+ { // was 2.0,
+
+ try { // get score from syntactic match between sentence in
+ // original text and mined sentence
+ double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+ SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+ + " " + title, originalSentence);
+ List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+ if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+ System.out
+ .println("Rejected Sentence : No verb OR Yes imperative verb :"
+ + pageSentence);
+ continue;
+ }
+
+ syntScore = parseTreeChunkListScorer
+ .getParseTreeChunkListScore(match);
+ System.out.println(parseTreeChunk.listToString(match) + " "
+ + syntScore + "\n pre-processed sent = '" + pageSentence);
+
+ if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents
+ for (String currSent : sentsAll) {
+ if (currSent.startsWith(originalSentence))
+ continue;
+ match = sm.assessRelevance(currSent, pageSentence)
+ .getMatchResult();
+ double syntScoreCurr = parseTreeChunkListScorer
+ .getParseTreeChunkListScore(match);
+ if (syntScoreCurr > syntScore) {
+ syntScore = syntScoreCurr;
+ }
+ }
+ if (syntScore > RELEVANCE_THRESHOLD) {
+ System.out.println("Got match with other sent: "
+ + parseTreeChunk.listToString(match) + " " + syntScore);
+ }
+ }
+
+ measScore = stringDistanceMeasurer.measureStringDistance(
+ originalSentence, pageSentence);
+
+
+ if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+ && measScore < 0.8 && pageSentence.length() > 40) // >70
+ {
+ String pageSentenceProc = GeneratedSentenceProcessor
+ .acceptableMinedSentence(pageSentence);
+ if (pageSentenceProc != null) {
+ pageSentenceProc = GeneratedSentenceProcessor
+ .processSentence(pageSentenceProc);
+ followSent = GeneratedSentenceProcessor.processSentence(followSent);
+ if (followSent != null) {
+ pageSentenceProc += " "+ followSent;
+ }
+
+ pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+ Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+ + mentalScore + (double) pageSentenceProc.length()
+ / (double) 50);
+ f.setSourceURL(item.getUrl());
+ f.fragment = fragment;
+ result.add(f);
+ System.out.println("Accepted sentence: " + pageSentenceProc + " | "+followSent
+ + "| with title= " + title);
+ System.out.println("For fragment = " + fragment);
+ } else
+ System.out
+ .println("Rejected sentence due to wrong area at webpage: "
+ + pageSentence);
+ } else
+ System.out.println("Rejected sentence due to low score: "
+ + pageSentence);
+ // }
+ } catch (Throwable t) {
+ t.printStackTrace();
+ }
+ }
+ }
+ item.setFragments(result);
+ return item;
+ }
+
+
+
+ // given a fragment from snippet, finds an original sentence at a webpage by
+ // optimizing alignmemt score
+ public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
+ String fragment, String[] sents) {
+ if (fragment.trim().length() < 15)
+ return null;
+
+ StringDistanceMeasurer meas = new StringDistanceMeasurer();
+ Double dist = 0.0;
+ String result = null, followSent = "";
+ for (int i = 0; i < sents.length; i++) {
+ String s = sents[i];
+ if (s == null || s.length() < 30)
+ continue;
+ Double distCurr = meas.measureStringDistance(s, fragment);
+ if (distCurr > dist && distCurr > 0.4) {
+ result = s;
+ dist = distCurr;
+ try {
+ if (i < sents.length - 1 && sents[i + 1].length() > 60) {
+ String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);
+ if (f1!=null){
+ followSent = f1;
+ }
+ }
+
+ if (i < sents.length - 2 && sents[i + 2].length() > 60) {
+ String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]);
+ if (f2!=null){
+ followSent += " "+f2;
+ }
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ return new String[] { result, followSent };
+ }
+
+ /**
+  * Given a fragment from a snippet, finds the single best-aligned original
+  * sentence at a webpage by maximizing the alignment score.
+  *
+  * @return {bestSentence, followSentence} (elements may be null), or null if
+  *         the fragment is too short to match reliably
+  */
+ public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
+     String fragment, String[] sents) {
+   if (fragment.trim().length() < 15)
+     return null;
+   int bestSentIndex = -1;
+   StringDistanceMeasurer meas = new StringDistanceMeasurer();
+   // BUGFIX: was initialized to 10.0, so no candidate could ever beat it
+   // (scores are presumably in [0..1] -- TODO confirm) and sents[-1] was
+   // accessed below, throwing ArrayIndexOutOfBoundsException
+   double distBest = 0.0;
+   String result = null, followSent = null;
+   for (int i = 0; i < sents.length; i++) {
+     String s = sents[i];
+     if (s == null || s.length() < 30)
+       continue;
+     double distCurr = meas.measureStringDistance(s, fragment);
+     if (distCurr > distBest) {
+       distBest = distCurr;
+       bestSentIndex = i;
+     }
+   }
+   // accept only a sufficiently strong alignment, and only if a candidate was found
+   if (bestSentIndex > -1 && distBest > 0.4) {
+     result = sents[bestSentIndex];
+
+     if (bestSentIndex < sents.length - 1
+         && sents[bestSentIndex + 1].length() > 60) {
+       followSent = sents[bestSentIndex + 1];
+     }
+   }
+
+   return new String[] { result, followSent };
+ }
+
+ /**
+  * Splits a downloaded page into chunks and returns up to 100 of the longest
+  * ones, cleaned and re-split into sentences.
+  */
+ public String[] extractSentencesFromPage(String downloadedPage)
+ {
+
+ int maxSentsFromPage= 100;
+ // NOTE(review): 'results' is never used in this method and could be removed
+ List<String[]> results = new ArrayList<String[]>();
+
+ //String pageOrigHTML = pFetcher.fetchOrigHTML(url);
+
+ // NOTE(review): the first replace() argument renders here as a plain space;
+ // it is presumably a mangled HTML entity (e.g. a non-breaking space) from
+ // the original source -- confirm against the repository before touching
+ downloadedPage= downloadedPage.replace(" ", "&");
+ // collapse runs of the marker into a single '#' chunk separator
+ downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
+ String[] sents = downloadedPage.split("#");
+ List<TextChunk> sentsList = new ArrayList<TextChunk>();
+ for(String s: sents){
+ s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
+ /* s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
+ .replace(": ", ". ").replace("- ", ". ").
+ replace (". .",".").trim(); */
+ sentsList.add(new TextChunk(s, s.length()));
+ }
+
+ // sort ascending by length, then walk the tail of the list so that the
+ // longest chunks are kept
+ Collections.sort(sentsList, new TextChunkComparable());
+ String[] longestSents = new String[maxSentsFromPage];
+ int j=0;
+ int initIndex = sentsList.size()-1 -maxSentsFromPage;
+ if (initIndex<0)
+ initIndex = 0;
+ for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){
+ longestSents[j] = sentsList.get(i).text;
+ j++;
+ }
+
+ // filter and re-split the retained chunks into clean sentences
+ sents = cleanSplitListOfSents(longestSents);
+
+ //sents = removeDuplicates(sents);
+ //sents = verifyEnforceStartsUpperCase(sents);
+
+ return sents;
+ }
+
+ /**
+  * Simple (text, length) pair used to rank page chunks by size.
+  * NOTE(review): this nested class shadows the TextChunk imported from
+  * SnippetToParagraph at the top of this file -- confirm which is intended.
+  */
+ public class TextChunk {
+ public TextChunk(String s, int length) {
+ this.text = s;
+ this.len = length;
+ }
+ // chunk contents
+ public String text;
+ // cached length of 'text'
+ public int len;
+ }
+
+ /** Orders TextChunk instances by ascending text length. */
+ public class TextChunkComparable implements Comparator<TextChunk>
+ {
+   public int compare(TextChunk ch1, TextChunk ch2)
+   {
+     // three-way comparison on the cached lengths
+     if (ch1.len == ch2.len)
+       return 0;
+     return ch1.len > ch2.len ? 1 : -1;
+   }
+ }
+
+ /**
+  * Filters candidate page chunks (dropping menu-like or word-soup text) and
+  * re-splits the survivors into individual clean sentences.
+  */
+ protected String[] cleanSplitListOfSents(String[] longestSents) {
+   float minFragmentLength = 40, minFragmentLengthSpace = 4;
+
+   List<String> sentsClean = new ArrayList<String>();
+   for (String sentenceOrMultSent : longestSents) {
+     if (sentenceOrMultSent == null || sentenceOrMultSent.length() < 20)
+       continue;
+     if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent) == null) {
+       //System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
+       continue;
+     }
+     // reject text whose average sentence length is too short, e.g.:
+     // aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n.
+     int numOfDots = sentenceOrMultSent.replace('.', '&').split("&").length;
+     float avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
+     if (avgSentenceLengthInTextPortion < minFragmentLength)
+       continue;
+     // reject text whose average word length is too short, e.g.:
+     // o oo o ooo o o o ooo oo ooo o o oo
+     numOfDots = sentenceOrMultSent.replace(' ', '&').split("&").length;
+     avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
+     if (avgSentenceLengthInTextPortion < minFragmentLengthSpace)
+       continue;
+
+     List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
+
+     // forced split by ',' somewhere in the middle of sentence
+     // disused - Feb 26 13
+     //furtherSplit = furtherMakeSentencesShorter(furtherSplit);
+     // drop the final sentence, which is likely truncated mid-way.
+     // BUGFIX: guard against an empty split result, which previously made
+     // remove(size()-1) throw IndexOutOfBoundsException
+     if (!furtherSplit.isEmpty())
+       furtherSplit.remove(furtherSplit.size() - 1);
+     for (String s : furtherSplit) {
+       if (s.indexOf('|') > -1)
+         continue;
+       s = s.replace("<em>", " ").replace("</em>", " ");
+       s = Utils.convertToASCII(s);
+       sentsClean.add(s);
+     }
+   }
+   return (String[]) sentsClean.toArray(new String[0]);
+ }
+
+ /**
+  * Downloads the page behind a search hit and prepares the raw material for
+  * sentence mining: snippet fragments (with truncation markers), the raw
+  * downloaded page, and the page split into cleaned sentences.
+  *
+  * @return Triple of (snippet fragments, downloaded page or null, cleaned
+  *         page sentences or null)
+  */
+ public Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll) {
+   if (sentsAll == null)
+     sentsAll = new ArrayList<String>();
+   // put orig sentence in structure
+   List<String> origs = new ArrayList<String>();
+   origs.add(originalSentence);
+   item.setOriginalSentences(origs);
+   // form plain text from snippet
+   String snapshot = item.getAbstractText().replace("<b>", " ")
+       .replace("</b>", " ").replace(" ", " ").replace(" ", " ");
+
+   // fix a template expression which can be substituted by original if
+   // relevant
+   String snapshotMarked = snapshot.replace("...",
+       " _should_find_orig_ . _should_find_orig_");
+   String[] fragments = sm.splitSentences(snapshotMarked);
+   List<String> allFragms = new ArrayList<String>();
+   allFragms.addAll(Arrays.asList(fragments));
+
+   String[] sents = null;
+   String downloadedPage = null;
+   try {
+     if (snapshotMarked.length() != snapshot.length()) {
+       downloadedPage = pFetcher.fetchPage(item.getUrl());
+       if (downloadedPage != null && downloadedPage.length() > 100) {
+         item.setPageContent(downloadedPage);
+         String pageContent = Utils.fullStripHTML(item.getPageContent());
+         pageContent = GeneratedSentenceProcessor
+             .normalizeForSentenceSplitting(pageContent);
+         pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+         sents = sm.splitSentences(pageContent);
+         sents = ContentGeneratorSupport.cleanListOfSents(sents);
+       }
+     }
+   } catch (Exception e) {
+     System.err
+         .println("Problem downloading the page and splitting into sentences");
+     // fall through: the catch and the normal path returned the same value
+   }
+   // parameterized Triple (was raw) to avoid unchecked warnings; unused
+   // locals 'title' and 'result' removed
+   return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
+ }
+
+ /**
+  * For one snippet fragment, locates the matching original sentence (plus
+  * follow-up text) on the downloaded page.
+  *
+  * @return {mainSentence, followSent} or null if the fragment is too short
+  *         or no original sentence could be found
+  */
+ String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults) {
+   if (fragment.length() < 50)
+     return null;
+
+   String downloadedPage = (String) fragmentExtractionResults.getSecond();
+   String[] sents = (String[]) fragmentExtractionResults.getThird();
+
+   String[] mainAndFollowSent = null;
+   // try to find original sentence from webpage
+   if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+       && sents.length > 0) {
+     try {
+       // first try sorted sentences from page by length approach
+       String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+
+       try {
+         mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+             fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+       } catch (Exception e) {
+         e.printStackTrace();
+       }
+       // if the above gives null then try to match all sentences from snippet fragment
+       if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
+         mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+             fragment.replace("_should_find_orig_", ""), sents);
+       }
+     } catch (Exception e) {
+       e.printStackTrace();
+     }
+   }
+   // NOTE(review): dead locals removed; the original also computed a
+   // 'pageSentence' from the fragment but never returned it, so fragments
+   // without the truncation marker still yield null here -- confirm intended
+   return mainAndFollowSent;
+ }
+
+ /**
+  * Verifies that a candidate mined sentence is relevant to the seed sentence
+  * and, if accepted, forms a Fragment combining it with its follow-up text.
+  *
+  * @return the accepted Fragment, or null if the candidate is rejected
+  */
+ private Fragment verifyCandidateSentencesAndFormParagraph(
+     String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
+   Fragment result = null;
+
+   String pageSentence = candidateSentences[0];
+   String followSent = "";
+   // CONSISTENCY FIX: skip null follow-up entries (as
+   // augmentWithMinedSentencesAndVerifyRelevance does) instead of
+   // concatenating the literal string "null"
+   for (int i = 1; i < candidateSentences.length; i++)
+     if (candidateSentences[i] != null)
+       followSent += candidateSentences[i];
+   String title = item.getTitle();
+
+   // resultant sentence SHOULD NOT be longer than four times the size of
+   // snippet fragment
+   if (!(pageSentence != null && pageSentence.length() > 50)) {
+     System.out.println("Cannot accept the sentence = "+ pageSentence +
+         "!(pageSentence != null && pageSentence.length()>50 && (float) pageSentence.length() / (float) fragment.length() < 4.0) )");
+
+     return null;
+   }
+
+   try { // get score from syntactic match between sentence in
+     // original text and mined sentence
+     double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+     SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+         + " " + title, originalSentence);
+     List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+     if (match == null || match.size() < 1) {
+       System.out
+           .println("Rejected Sentence : empty match " + pageSentence);
+       return null;
+     }
+
+     if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+       System.out
+           .println("Rejected Sentence : No verb OR Yes imperative verb :"
+               + pageSentence);
+       return null;
+     }
+
+     syntScore = parseTreeChunkListScorer
+         .getParseTreeChunkListScore(match);
+     System.out.println(parseTreeChunk.listToString(match) + " "
+         + syntScore + "\n pre-processed sent = '" + pageSentence);
+
+     try {
+       // if the seed sentence itself is not relevant enough, try the other
+       // sentences of a multi-sentence seed
+       if (sentsAll != null && syntScore < RELEVANCE_THRESHOLD) {
+         for (String currSent : sentsAll) {
+           if (currSent.startsWith(originalSentence))
+             continue;
+           match = sm.assessRelevance(currSent, pageSentence)
+               .getMatchResult();
+           double syntScoreCurr = parseTreeChunkListScorer
+               .getParseTreeChunkListScore(match);
+           if (syntScoreCurr > syntScore) {
+             syntScore = syntScoreCurr;
+           }
+         }
+         if (syntScore > RELEVANCE_THRESHOLD) {
+           System.out.println("Got match with other sent: "
+               + parseTreeChunk.listToString(match) + " " + syntScore);
+         }
+       }
+     } catch (Exception e) {
+       e.printStackTrace();
+     }
+
+     measScore = stringDistanceMeasurer.measureStringDistance(
+         originalSentence, pageSentence);
+
+     // accept when either syntactically relevant or lexically close, but not
+     // a near-duplicate of the seed (measScore < 0.8)
+     if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+         && measScore < 0.8 && pageSentence.length() > 40) // >70
+     {
+       String pageSentenceProc = GeneratedSentenceProcessor
+           .acceptableMinedSentence(pageSentence);
+       if (pageSentenceProc != null) {
+         pageSentenceProc = GeneratedSentenceProcessor
+             .processSentence(pageSentenceProc);
+         followSent = GeneratedSentenceProcessor.processSentence(followSent);
+         if (followSent != null) {
+           pageSentenceProc += " " + followSent;
+         }
+
+         pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+         result = new Fragment(pageSentenceProc, syntScore + measScore
+             + mentalScore + (double) pageSentenceProc.length()
+             / (double) 50);
+         result.setSourceURL(item.getUrl());
+         result.fragment = fragment;
+
+         System.out.println("Accepted sentence: " + pageSentenceProc
+             + "| with title= " + title);
+         System.out.println("For fragment = " + fragment);
+       } else
+         System.out
+             .println("Rejected sentence due to wrong area at webpage: "
+                 + pageSentence);
+     } else
+       System.out.println("Rejected sentence due to low score: "
+           + pageSentence);
+   } catch (Throwable t) {
+     t.printStackTrace();
+   }
+
+   return result;
+ }
+
+/**
+ * Builds a paragraph of generated text for one search hit: forms candidate
+ * fragments from the snippet/page, finds matching original sentences, and
+ * keeps those that pass relevance verification.
+ */
+public HitBase buildParagraphOfGeneratedText(HitBase item,
+    String originalSentence, List<String> sentsAll) {
+  List<Fragment> results = new ArrayList<Fragment>();
+
+  Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);
+  // only the fragment list is consumed here; the downloaded page and the
+  // page sentences are read by formCandidateSentences via the Triple
+  // (previously extracted into unused locals)
+  List<String> allFragms = (List<String>) fragmentExtractionResults.getFirst();
+
+  for (String fragment : allFragms) {
+    String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
+    if (candidateSentences == null)
+      continue;
+    Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
+    if (res != null)
+      results.add(res);
+  }
+
+  item.setFragments(results);
+  return item;
+}
+
+
+
+
+/** Smoke-test entry point: generates essay content about a sample entity. */
+public static void main(String[] args) {
+  RelatedSentenceFinder finder = new RelatedSentenceFinder();
+  try {
+    // uncomment the sentence you would like to serve as a seed sentence for
+    // content generation for an event description
+    List<HitBase> hits = finder.generateContentAbout("Albert Einstein"
+    // "Britney Spears - The Femme Fatale Tour"
+    // "Rush Time Machine",
+    // "Blue Man Group" ,
+    // "Belly Dance With Zaharah",
+    // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+    // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+    );
+    System.out.println(HitBase.toString(hits));
+    System.out.println(HitBase.toResultantString(hits));
+    // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
+    // hits.get(0).getTitle(), hits);
+  } catch (Exception e) {
+    e.printStackTrace();
+  }
+}
+
- }
}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Logger;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+/*
+ * This class does content generation in ES, DE etc.: a multilingual variant
+ * of RelatedSentenceFinder.
+ */
+
+public class RelatedSentenceFinderML extends RelatedSentenceFinder{
+ private static Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinderML");
+
+
+ // ms: max steps; msr: max search results kept; thresh: relevance acceptance
+ // threshold; key: search-engine API key (forwarded to yrunner)
+ public RelatedSentenceFinderML(int ms, int msr, float thresh, String key) {
+ this.MAX_STEPS = ms;
+ this.MAX_SEARCH_RESULTS = msr;
+ this.RELEVANCE_THRESHOLD=thresh;
+ yrunner.setKey(key);
+ }
+
+ // default configuration inherited from RelatedSentenceFinder
+ public RelatedSentenceFinderML() {
+ }
+
+ /**
+  * Runs a web search for the entity and mines each non-PDF hit for relevant
+  * sentences; duplicates are removed from the final list.
+  *
+  * @param sentence the entity/seed to write about
+  * @return augmented, de-duplicated search hits
+  */
+ public List<HitBase> generateContentAbout(String sentence) throws Exception {
+   List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+   System.out.println(" \n=== Entity to write about = " + sentence);
+
+   List<HitBase> searchResult = yrunner.runSearch(sentence, 100);
+   // BUGFIX: the null check must precede size()/subList(), which previously
+   // dereferenced the list before it was checked; unused 'nounPhraseQueries'
+   // removed
+   if (searchResult != null) {
+     //TODO for shorter run
+     if (MAX_SEARCH_RESULTS < searchResult.size())
+       searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+     for (HitBase item : searchResult) { // got some text from .html
+       if (item.getAbstractText() != null
+           && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
+         opinionSentencesToAdd
+             .add(augmentWithMinedSentencesAndVerifyRelevance(item,
+                 sentence, null));
+       }
+     }
+   }
+
+   opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+   return opinionSentencesToAdd;
+ }
+
+
+ /**
+ * Takes single search result for an entity which is the subject of the essay
+ * to be written and forms essey sentences from the title, abstract, and
+ * possibly original page
+ *
+ * @param HitBase
+ * item : search result
+ * @param originalSentence
+ * : seed for the essay to be written
+ * @param sentsAll
+ * : list<String> of other sentences in the seed if it is
+ * multi-sentence
+ * @return search result
+ */
+
+ public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
+ String originalSentence, List<String> sentsAll) {
+ if (sentsAll == null)
+ sentsAll = new ArrayList<String>();
+ // put orig sentence in structure
+ List<String> origs = new ArrayList<String>();
+ origs.add(originalSentence);
+ item.setOriginalSentences(origs);
+ String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+ .replace(" ", " ").replace(" ", " ");
+ // generation results for this sentence
+ List<Fragment> result = new ArrayList<Fragment>();
+ // form plain text from snippet
+ String snapshot = item.getAbstractText().replace("<b>", " ")
+ .replace("</b>", " ").replace(" ", " ").replace(" ", " ");
+
+
+ // fix a template expression which can be substituted by original if
+ // relevant
+ String snapshotMarked = snapshot.replace("...",
+ " _should_find_orig_ . _should_find_orig_");
+ String[] fragments = sm.splitSentences(snapshotMarked);
+ List<String> allFragms = new ArrayList<String>();
+ allFragms.addAll(Arrays.asList(fragments));
+
+ String[] sents = null;
+ String downloadedPage = null;
+ try {
+ if (snapshotMarked.length() != snapshot.length()) {
+ downloadedPage = pFetcher.fetchPage(item.getUrl());
+ if (downloadedPage != null && downloadedPage.length() > 100) {
+ item.setPageContent(downloadedPage);
+ String pageContent = Utils.fullStripHTML(item.getPageContent());
+ pageContent = GeneratedSentenceProcessor
+ .normalizeForSentenceSplitting(pageContent);
+ pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
+ // ". ")
+ .replace("..", ".").replace(". . .", " ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so
+ // we need to put '.'
+ sents = sm.splitSentences(pageContent);
+
+ sents = ContentGeneratorSupport.cleanListOfSents(sents);
+ }
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ // e.printStackTrace();
+ System.err
+ .println("Problem downloading the page and splitting into sentences");
+ return item;
+ }
+
+ for (String fragment : allFragms) {
+ String followSent = null;
+ if (fragment.length() < 50)
+ continue;
+ String pageSentence = "";
+ // try to find original sentence from webpage
+ if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+ && sents.length > 0)
+ try {
+ // first try sorted sentences from page by lenght approach
+ String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+ String[] mainAndFollowSent = null;
+
+ try {
+ mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+ fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ // if the above gives null than try to match all sentences from snippet fragment
+ if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
+ mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+ fragment.replace("_should_find_orig_", ""), sents);
+ }
+
+
+ } catch (Exception e) {
+
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ else
+ // or get original snippet
+ pageSentence = fragment;
+ if (pageSentence != null)
+ pageSentence.replace("_should_find_orig_", "");
+
+ // resultant sentence SHOULD NOT be longer than twice the size of
+ // snippet fragment
+ if (pageSentence != null
+ && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was 2.0,
+
+ try { // get score from syntactic match between sentence in
+ // original text and mined sentence
+ double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+ syntScore = calculateKeywordScore(pageSentence + " " + title, originalSentence);
+
+
+ if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents
+ for (String currSent : sentsAll) {
+ if (currSent.startsWith(originalSentence))
+ continue;
+ double syntScoreCurr = calculateKeywordScore(currSent, pageSentence);
+ if (syntScoreCurr > syntScore) {
+ syntScore = syntScoreCurr;
+ }
+ }
+ if (syntScore > RELEVANCE_THRESHOLD) {
+ System.out.println("Got match with other sent: " + syntScore);
+ }
+ }
+
+ measScore = stringDistanceMeasurer.measureStringDistance(
+ originalSentence, pageSentence);
+
+ // now possibly increase score by finding mental verbs
+ // indicating opinions
+ for (String s : MENTAL_VERBS) {
+ if (pageSentence.indexOf(s) > -1) {
+ mentalScore += 0.3;
+ break;
+ }
+ }
+
+ if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5 || mentalScore > 0.5)
+ && measScore < 0.8 && pageSentence.length() > 40) // >70
+ {
+ String pageSentenceProc = GeneratedSentenceProcessor
+ .acceptableMinedSentence(pageSentence);
+ if (pageSentenceProc != null) {
+ pageSentenceProc = GeneratedSentenceProcessor
+ .processSentence(pageSentenceProc);
+ if (followSent != null) {
+ pageSentenceProc += " "
+ + GeneratedSentenceProcessor.processSentence(followSent);
+ }
+
+ pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+ Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+ + mentalScore + (double) pageSentenceProc.length()
+ / (double) 50);
+ f.setSourceURL(item.getUrl());
+ f.fragment = fragment;
+ result.add(f);
+ System.out.println("Accepted sentence: " + pageSentenceProc
+ + "| with title= " + title);
+ System.out.println("For fragment = " + fragment);
+ } else
+ System.out
+ .println("Rejected sentence due to wrong area at webpage: "
+ + pageSentence);
+ } else
+ System.out.println("Rejected sentence due to low score: "
+ + pageSentence);
+ // }
+ } catch (Throwable t) {
+ t.printStackTrace();
+ }
+ }
+ }
+ item.setFragments(result);
+ return item;
+ }
+
+ /**
+  * Scores relevance as the number of tokens shared between the two texts
+  * (each occurrence in the first text counts once if present in the second).
+  */
+ private double calculateKeywordScore(String currSent, String pageSentence) {
+   List<String> sentTokens = TextProcessor.fastTokenize(currSent, false);
+   List<String> pageTokens = TextProcessor.fastTokenize(pageSentence, false);
+   // intersect the token lists without mutating either original
+   List<String> shared = new ArrayList<String>(sentTokens);
+   shared.retainAll(pageTokens);
+   return shared.size();
+ }
+
+
+ /** Smoke-test entry point for the multilingual content generator. */
+ public static void main(String[] args) {
+   RelatedSentenceFinderML f = new RelatedSentenceFinderML();
+   try {
+     // uncomment the sentence you would like to serve as a seed sentence for
+     // content generation for an event description
+     List<HitBase> hits = f.generateContentAbout("Albert Einstein"
+     // "Britney Spears - The Femme Fatale Tour"
+     // "Rush Time Machine",
+     // "Blue Man Group" ,
+     // "Belly Dance With Zaharah",
+     // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+     // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+     );
+     System.out.println(HitBase.toString(hits));
+     System.out.println(HitBase.toResultantString(hits));
+     // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
+     // hits.get(0).getTitle(), hits);
+   } catch (Exception e) {
+     e.printStackTrace();
+   }
+ }
+}
\ No newline at end of file
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Mon Jan 6 17:48:30 2014
@@ -26,7 +26,7 @@ import opennlp.tools.textsimilarity.Pars
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
-public class SearchResultsProcessor extends BingWebQueryRunner {
+public class SearchResultsProcessor extends BingQueryRunner {
private static Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
@@ -92,7 +92,7 @@ public class SearchResultsProcessor exte
public List<HitBase> runSearchViaAPI(String query) {
List<HitBase> hits = null;
try {
- List<HitBase> resultList = runSearch(query, 30);
+ List<HitBase> resultList = runSearch(query);
// now we apply our own relevance filter
hits = calculateMatchScoreResortHits(resultList, query);
@@ -102,7 +102,6 @@ public class SearchResultsProcessor exte
return null;
}
- hits = removeDuplicates(hits, 0.9);
return hits;
}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java Mon Jan 6 17:48:30 2014
@@ -101,8 +101,6 @@ public class SpeechRecognitionResultsPro
double bestSentScore = -1;
String bestSent = null;
for (String sentence : sents) {
- BingResponse resp = null, // obtained from bing
- newResp = null; // re-sorted based on similarity
try {
List<HitBase> resultList = scraper.runSearch(sentence);
double scoreForSentence = calculateTotalMatchScoreForHits(resultList,
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java Mon Jan 6 17:48:30 2014
@@ -17,22 +17,115 @@
package opennlp.tools.similarity.apps;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import opennlp.tools.similarity.apps.utils.StringCleaner;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
public class StoryDiscourseNavigator {
- public static final String[] frequentPerformingVerbs = {
- " born raised meet learn ", " graduated enter discover",
- " facts inventions life ", "accomplishments childhood timeline",
- " acquire befriend encounter", " achieve reache describe ",
- " invent innovate improve ", " impress outstanding award",
- " curous sceptical pessimistic", " spend enroll assume point",
- " explain discuss dispute", " learn teach study investigate",
- " propose suggest indicate", " pioneer explorer discoverer ",
- " advance promote lead", " direct control simulate ",
- " guide lead assist ", " inspire first initial",
- " vision predict foresee", " prediction inspiration achievement",
- " approve agree confirm", " deny argue disagree",
- " emotional loud imagination", " release announce celebrate discover",
- "introduce enjoy follow", " open present show",
- "meet enjoy follow create", "discover continue produce"
+ protected BingQueryRunner yrunner = new BingQueryRunner();
+ ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+ .getInstance();
+ private PorterStemmer ps = new PorterStemmer();
+
+ public static final String[] frequentPerformingVerbs = {
+ " born raised meet learn ", " graduated enter discover",
+ " facts inventions life ", "accomplishments childhood timeline",
+ " acquire befriend encounter", " achieve reache describe ",
+ " invent innovate improve ", " impress outstanding award",
+ " curous sceptical pessimistic", " spend enroll assume point",
+ " explain discuss dispute", " learn teach study investigate",
+ " propose suggest indicate", " pioneer explorer discoverer ",
+ " advance promote lead", " direct control simulate ",
+ " guide lead assist ", " inspire first initial",
+ " vision predict foresee", " prediction inspiration achievement",
+ " approve agree confirm", " deny argue disagree",
+ " emotional loud imagination", " release announce celebrate discover",
+ "introduce enjoy follow", " open present show",
+ "meet enjoy follow create", "discover continue produce"
+
+ };
+
+ public String[] obtainAdditionalKeywordsForAnEntity(String entity){
+ List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
+ entity, "", "en", 30);
+ Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false);
+ List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList);
+ String[] res = new String[resList.size()];
+ int i=0;
+ for(List<String> phrase: resList){
+ phrase.removeAll(keywordsToRemove);
+ String keywords = phrase.toString().replace('[', ' ').replace(']', ' ').replace(',',' ');
+ res[i] = keywords;
+ i++;
+ }
+ return res;
+ }
+
+ public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
+ String domain, String lang, int numbOfHits) {
+ List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
+ try {
+ List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);
+
+ for (int i = 0; i < resultList.size(); i++) {
+ {
+ for (int j = i + 1; j < resultList.size(); j++) {
+ HitBase h1 = resultList.get(i);
+ HitBase h2 = resultList.get(j);
+ String snapshot1 = StringCleaner.processSnapshotForMatching(h1
+ .getTitle() + " . " + h1.getAbstractText());
+ String snapshot2 = StringCleaner.processSnapshotForMatching(h2
+ .getTitle() + " . " + h2.getAbstractText());
+ SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
+ snapshot2);
+ List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
+ genResult.addAll(matchResult);
+ }
+ }
+ }
+
+ } catch (Exception e) {
+ System.err.print("Problem extracting taxonomy node");
+ }
- };
+ return genResult;
+ }
+ private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
+ List<List<ParseTreeChunk>> matchList) {
+ List<List<String>> res = new ArrayList<List<String>>();
+ for (List<ParseTreeChunk> chunks : matchList) {
+ List<String> wordRes = new ArrayList<String>();
+ for (ParseTreeChunk ch : chunks) {
+ List<String> lemmas = ch.getLemmas();
+ for (int w = 0; w < lemmas.size(); w++)
+ if ((!lemmas.get(w).equals("*"))
+ && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
+ .startsWith("VB"))) && lemmas.get(w).length() > 2) {
+ String formedWord = lemmas.get(w);
+ String stemmedFormedWord = ps.stem(formedWord);
+ if (!stemmedFormedWord.startsWith("invalid"))
+ wordRes.add(formedWord);
+ }
+ }
+ wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
+ if (wordRes.size() > 0) {
+ res.add(wordRes);
+ }
+ }
+ res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
+ return res;
+ }
+ public static void main(String[] args){
+ String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
+ System.out.println(Arrays.asList(res));
+ }
}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java Mon Jan 6 17:48:30 2014
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.Triple;
+
+import net.billylieurance.azuresearch.AzureSearchImageQuery;
+import net.billylieurance.azuresearch.AzureSearchImageResult;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+public class YahooAnswersMiner extends BingQueryRunner{
+
+ private static final Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.YahooAnswersMiner");
+ private int page = 0;
+ private static final int hitsPerPage = 50;
+
+ public List<HitBase> runSearch(String query) {
+ aq.setAppid(BING_KEY);
+ aq.setQuery("site:answers.yahoo.com "+
+ query);
+ aq.setPerPage(hitsPerPage);
+ aq.setPage(page);
+
+ aq.doQuery();
+ List<HitBase> results = new ArrayList<HitBase> ();
+ AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+
+ for (AzureSearchWebResult anr : ars){
+ HitBase h = new HitBase();
+ h.setAbstractText(anr.getDescription());
+ h.setTitle(anr.getTitle());
+ h.setUrl(anr.getUrl());
+ results.add(h);
+ }
+ page++;
+
+ return results;
+ }
+
+
+ public List<HitBase> runSearch(String query, int totalPages) {
+ int count=0;
+ List<HitBase> results = new ArrayList<HitBase>();
+ while(totalPages>page*hitsPerPage){
+ List<HitBase> res = runSearch(query);
+ results.addAll(res);
+ if (count>10)
+ break;
+ count++;
+ }
+
+ return results;
+ }
+
+
+ public static void main(String[] args) {
+ YahooAnswersMiner self = new YahooAnswersMiner();
+ RelatedSentenceFinder extractor = new RelatedSentenceFinder();
+ String topic = "obamacare";
+
+ List<HitBase> resp = self
+ .runSearch(topic, 150);
+ System.out.print(resp.get(0));
+ List<String[]> data = new ArrayList<String[]>();
+
+
+ for(HitBase item: resp){
+ Triple<List<String>, String, String[]> fragmentExtractionResults =
+ extractor.formCandidateFragmentsForPage(item, topic, null);
+
+ List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
+ String downloadedPage = (String)fragmentExtractionResults.getSecond();
+ String[] sents = (String[])fragmentExtractionResults.getThird();
+
+ for (String fragment : allFragms) {
+ String[] candidateSentences = extractor.formCandidateSentences(fragment, fragmentExtractionResults);
+ System.out.println(candidateSentences);
+ data.add(candidateSentences);
+ }
+
+ }
+
+ ProfileReaderWriter.writeReport(data, "multi_sentence_queries.csv");
+
+ }
+
+}