Posted to commits@opennlp.apache.org by bg...@apache.org on 2014/01/06 18:48:32 UTC

svn commit: r1555944 [8/11] - in /opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/apps/ main/java/opennlp/tools/apps/contentgen/ main/java/opennlp/tools/apps/contentgen/multithreaded/ main/java/opennlp/tools/apps/relevanceVocabs/ main/j...

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java Mon Jan  6 17:48:30 2014
@@ -19,15 +19,24 @@ package opennlp.tools.similarity.apps;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.logging.Logger;
 
+import opennlp.tools.parse_thicket.Triple;
+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;
 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
 import opennlp.tools.similarity.apps.utils.Utils;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
 import org.apache.commons.lang.StringUtils;
@@ -43,575 +52,952 @@ import org.apache.commons.lang.StringUti
  */
 
 public class RelatedSentenceFinder {
-  private static Logger LOG = Logger
-      .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
-  PageFetcher pFetcher = new PageFetcher();
-
-  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
-
-  static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
-
-  // used to indicate that a sentence is an opinion, so more appropriate
-  static List<String> MENTAL_VERBS = new ArrayList<String>(
-      Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
-          "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
-          "check", "confirm", "convince", "deny", "disagree", "explain",
-          "ignore", "inform", "remind", "request", "suggest", "suppose",
-          "think", "threaten", "try", "understand" }));
-
-  private static final int MAX_FRAGMENT_SENTS = 10;
-
-  public RelatedSentenceFinder() {
-
-  }
-
-  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
-      List<String> sents) throws Exception {
-    BingWebQueryRunner yrunner = new BingWebQueryRunner();
-    List<HitBase> searchResult = yrunner.runSearch(word, 100);
-    return searchResult;
-  }
-
-  public List<HitBase> findRelatedOpinionsForSentence(String sentence,
-      List<String> sents) throws Exception {
-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
-    System.out.println(" \n\n=== Sentence  = " + sentence);
-    List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
-
-    BingWebQueryRunner yrunner = new BingWebQueryRunner();
-    for (String query : nounPhraseQueries) {
-      System.out.println("\nquery = " + query);
-      // query += " "+join(MENTAL_VERBS, " OR ") ;
-      List<HitBase> searchResult = yrunner.runSearch(query, 100);
-      if (searchResult != null) {
-        for (HitBase item : searchResult) { // got some text from .html
-          if (item.getAbstractText() != null
-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
-                                                         // pdf
-            opinionSentencesToAdd
-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
-                    sentence, sents));
-          }
-        }
-      }
-    }
-
-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
-    return opinionSentencesToAdd;
-  }
-
-  /**
-   * Main content generation function which takes a seed as a person, rock
-   * group, or other entity name and produce a list of text fragments by web
-   * mining for <br>
-   * 
-   * @param String
-   *          entity name
-   * @return List<HitBase> of text fragment structures which contain approved
-   *         (in terms of relevance) mined sentences, as well as original search
-   *         results objects such as doc titles, abstracts, and urls.
-   */
-
-  public List<HitBase> generateContentAbout(String sentence) throws Exception {
-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
-    System.out.println(" \n=== Entity to write about = " + sentence);
-    List<String> nounPhraseQueries = new ArrayList<String>();
-
-    // nounPhraseQueries.add(sentence + frequentPerformingVerbs);
-
-    BingWebQueryRunner yrunner = new BingWebQueryRunner();
-    for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
-      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
-          + verbAddition, 100);
-      if (searchResult != null) {
-        for (HitBase item : searchResult) { // got some text from .html
-          if (item.getAbstractText() != null
-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
-            opinionSentencesToAdd
-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
-                    sentence, null));
-          }
-        }
-      }
-    }
-
-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
-    return opinionSentencesToAdd;
-  }
-
-  /**
-   * Takes a sentence and extracts noun phrases and entity names to from search
-   * queries for finding relevant sentences on the web, which are then subject
-   * to relevance assessment by Similarity. Search queries should not be too
-   * general (irrelevant search results) or too specific (too few search
-   * results)
-   * 
-   * @param String
-   *          input sentence to form queries
-   * @return List<String> of search expressions
-   */
-  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
-    ParseTreeChunk matcher = new ParseTreeChunk();
-    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
-        .getInstance();
-    List<List<ParseTreeChunk>> sent1GrpLst = null;
-
-    List<ParseTreeChunk> nPhrases = pos
-        .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
-    List<String> queryArrayStr = new ArrayList<String>();
-    for (ParseTreeChunk ch : nPhrases) {
-      String query = "";
-      int size = ch.getLemmas().size();
-
-      for (int i = 0; i < size; i++) {
-        if (ch.getPOSs().get(i).startsWith("N")
-            || ch.getPOSs().get(i).startsWith("J")) {
-          query += ch.getLemmas().get(i) + " ";
-        }
-      }
-      query = query.trim();
-      int len = query.split(" ").length;
-      if (len < 2 || len > 5)
-        continue;
-      if (len < 4) { // every word should start with capital
-        String[] qs = query.split(" ");
-        boolean bAccept = true;
-        for (String w : qs) {
-          if (w.toLowerCase().equals(w)) // idf only two words then
-            // has to be person name,
-            // title or geo location
-            bAccept = false;
-        }
-        if (!bAccept)
-          continue;
-      }
-
-      query = query.trim().replace(" ", " +");
-      query = " +" + query;
-
-      queryArrayStr.add(query);
-
-    }
-    if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
-                                    // keywords
-      for (ParseTreeChunk ch : nPhrases) {
-        String query = "";
-        int size = ch.getLemmas().size();
-
-        for (int i = 0; i < size; i++) {
-          if (ch.getPOSs().get(i).startsWith("N")
-              || ch.getPOSs().get(i).startsWith("J")) {
-            query += ch.getLemmas().get(i) + " ";
-          }
-        }
-        query = query.trim();
-        int len = query.split(" ").length;
-        if (len < 2)
-          continue;
-
-        query = query.trim().replace(" ", " +");
-        query = " +" + query;
-
-        queryArrayStr.add(query);
-
-      }
-    }
-
-    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
-    queryArrayStr.add(sentence);
-
-    return queryArrayStr;
-
-  }
-
-  /**
-   * remove dupes from queries to easy cleaning dupes and repetitive search
-   * afterwards
-   * 
-   * @param List
-   *          <String> of sentences (search queries, or search results
-   *          abstracts, or titles
-   * @return List<String> of sentences where dupes are removed
-   */
-  public static List<String> removeDuplicatesFromQueries(List<String> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double dupeThresh = 0.8; // if more similar, then considered dupes was
-    // 0.7
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<String> hitsDedup = new ArrayList<String>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          String title1 = hits.get(i);
-          String title2 = hits.get(j);
-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
-            continue;
-          if (meas.measureStringDistance(title1, title2) > dupeThresh) {
-            idsToRemove.add(j); // dupes found, later list member to
-            // be deleted
-
-          }
-        }
-
-      for (int i = 0; i < hits.size(); i++)
-        if (!idsToRemove.contains(i))
-          hitsDedup.add(hits.get(i));
-
-      if (hitsDedup.size() < hits.size()) {
-        LOG.info("Removed duplicates from formed query, including "
-            + hits.get(idsToRemove.get(0)));
-      }
-
-    } catch (Exception e) {
-      LOG.severe("Problem removing duplicates from query list");
-    }
-
-    return hitsDedup;
-
-  }
-
-  /**
-   * remove dupes from search results
-   * 
-   * @param List
-   *          <HitBase> of search results objects
-   * @return List<String> of search results objects where dupes are removed
-   */
-  public static List<HitBase> removeDuplicatesFromResultantHits(
-      List<HitBase> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double dupeThresh = // 0.8; // if more similar, then considered dupes was
-    0.7;
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<HitBase> hitsDedup = new ArrayList<HitBase>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          HitBase hit2 = hits.get(j);
-          List<Fragment> fragmList1 = hits.get(i).getFragments();
-          List<Fragment> fragmList2 = hits.get(j).getFragments();
-          List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
-          for (Fragment f1 : fragmList1)
-            for (Fragment f2 : fragmList2) {
-              String sf1 = f1.getResultText();
-              String sf2 = f2.getResultText();
-              if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))
-                continue;
-              if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
-                fragmList2Results.remove(f2);
-                LOG.info("Removed duplicates from formed fragments list: "
-                    + sf2);
-              }
-            }
-
-          hit2.setFragments(fragmList2Results);
-          hits.set(j, hit2);
-        }
-    } catch (Exception e) {
-      LOG.severe("Problem removing duplicates from list of fragment");
-    }
-    return hits;
-  }
-
-  /**
-   * Takes single search result for an entity which is the subject of the essay
-   * to be written and forms essey sentences from the title, abstract, and
-   * possibly original page
-   * 
-   * @param HitBase
-   *          item : search result
-   * @param originalSentence
-   *          : seed for the essay to be written
-   * @param sentsAll
-   *          : list<String> of other sentences in the seed if it is
-   *          multi-sentence
-   * @return search result
-   */
-
-  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
-      String originalSentence, List<String> sentsAll) {
-    if (sentsAll == null)
-      sentsAll = new ArrayList<String>();
-    // put orig sentence in structure
-    List<String> origs = new ArrayList<String>();
-    origs.add(originalSentence);
-    item.setOriginalSentences(origs);
-    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
-        .replace("  ", " ").replace("  ", " ");
-    // generation results for this sentence
-    List<Fragment> result = new ArrayList<Fragment>();
-    // form plain text from snippet
-    String snapshot = item.getAbstractText().replace("<b>", " ")
-        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
-
-    ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
-        .getInstance();
-    // fix a template expression which can be substituted by original if
-    // relevant
-    String snapshotMarked = snapshot.replace("...",
-        " _should_find_orig_ . _should_find_orig_");
-    String[] fragments = sm.splitSentences(snapshotMarked);
-    List<String> allFragms = new ArrayList<String>();
-    allFragms.addAll(Arrays.asList(fragments));
-
-    String[] sents = null;
-    String downloadedPage;
-    try {
-      if (snapshotMarked.length() != snapshot.length()) {
-        downloadedPage = pFetcher.fetchPage(item.getUrl());
-        if (downloadedPage != null && downloadedPage.length() > 100) {
-          item.setPageContent(downloadedPage);
-          String pageContent = Utils.fullStripHTML(item.getPageContent());
-          pageContent = GeneratedSentenceProcessor
-              .normalizeForSentenceSplitting(pageContent);
-          pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",
-                                                                        // ". ")
-              .replace("..", ".").replace(". . .", " ").trim(); // sometimes
-                                                                // html breaks
-                                                                // are converted
-                                                                // into ' ' (two
-                                                                // spaces), so
-                                                                // we need to
-                                                                // put '.'
-          sents = sm.splitSentences(snapshotMarked);
-          ;
-          sents = cleanListOfSents(sents);
-        }
-      }
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      // e.printStackTrace();
-      System.err
-          .println("Problem downloading  the page and splitting into sentences");
-      return item;
-    }
-
-    for (String fragment : allFragms) {
-      String followSent = null;
-      if (fragment.length() < 50)
-        continue;
-      String pageSentence = "";
-      // try to find original sentence from webpage
-      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
-          && sents.length > 0)
-        try {
-          String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
-              fragment.replace("_should_find_orig_", ""), sents);
-          pageSentence = mainAndFollowSent[0];
-          followSent = mainAndFollowSent[1];
-
-        } catch (Exception e) {
-
-          // TODO Auto-generated catch block
-          e.printStackTrace();
-        }
-      else
-        // or get original snippet
-        pageSentence = fragment;
-      if (pageSentence != null)
-        pageSentence.replace("_should_find_orig_", "");
-
-      // resultant sentence SHOULD NOT be longer than twice the size of
-      // snippet fragment
-      if (pageSentence != null
-          && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
-                                                                                // 2.0,
-                                                                                // but
-                                                                                // since
-                                                                                // snippet
-                                                                                // sentences
-                                                                                // are
-                                                                                // rather
-                                                                                // short
-                                                                                // now...
-        try { // get score from syntactic match between sentence in
-              // original text and mined sentence
-          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
-
-          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
-              + " " + title, originalSentence);
-          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
-          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
-            System.out
-                .println("Rejected Sentence : No verb OR Yes imperative verb :"
-                    + pageSentence);
-            continue;
-          }
-
-          syntScore = parseTreeChunkListScorer
-              .getParseTreeChunkListScore(match);
-          System.out.println(parseTreeChunk.listToString(match) + " "
-              + syntScore + "\n pre-processed sent = '" + pageSentence);
-
-          if (syntScore < 1.5) { // trying other sents
-            for (String currSent : sentsAll) {
-              if (currSent.startsWith(originalSentence))
-                continue;
-              match = sm.assessRelevance(currSent, pageSentence)
-                  .getMatchResult();
-              double syntScoreCurr = parseTreeChunkListScorer
-                  .getParseTreeChunkListScore(match);
-              if (syntScoreCurr > syntScore) {
-                syntScore = syntScoreCurr;
-              }
-            }
-            if (syntScore > 1.5) {
-              System.out.println("Got match with other sent: "
-                  + parseTreeChunk.listToString(match) + " " + syntScore);
-            }
-          }
-
-          measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
-              originalSentence, pageSentence);
-
-          // now possibly increase score by finding mental verbs
-          // indicating opinions
-          for (String s : MENTAL_VERBS) {
-            if (pageSentence.indexOf(s) > -1) {
-              mentalScore += 0.3;
-              break;
-            }
-          }
-
-          if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
-              && measScore < 0.8 && pageSentence.length() > 40) // >70
-          {
-            String pageSentenceProc = GeneratedSentenceProcessor
-                .acceptableMinedSentence(pageSentence);
-            if (pageSentenceProc != null) {
-              pageSentenceProc = GeneratedSentenceProcessor
-                  .processSentence(pageSentenceProc);
-              if (followSent != null) {
-                pageSentenceProc += " "
-                    + GeneratedSentenceProcessor.processSentence(followSent);
-              }
-
-              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
-              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
-                  + mentalScore + (double) pageSentenceProc.length()
-                  / (double) 50);
-              f.setSourceURL(item.getUrl());
-              f.fragment = fragment;
-              result.add(f);
-              System.out.println("Accepted sentence: " + pageSentenceProc
-                  + "| with title= " + title);
-              System.out.println("For fragment = " + fragment);
-            } else
-              System.out
-                  .println("Rejected sentence due to wrong area at webpage: "
-                      + pageSentence);
-          } else
-            System.out.println("Rejected sentence due to low score: "
-                + pageSentence);
-          // }
-        } catch (Throwable t) {
-          t.printStackTrace();
-        }
-      }
-    }
-    item.setFragments(result);
-    return item;
-  }
-
-  public static String[] cleanListOfSents(String[] sents) {
-    List<String> sentsClean = new ArrayList<String>();
-    for (String s : sents) {
-      if (s == null || s.trim().length() < 30 || s.length() < 20)
-        continue;
-      sentsClean.add(s);
-    }
-    return (String[]) sentsClean.toArray(new String[0]);
-  }
-
-  // given a fragment from snippet, finds an original sentence at a webpage by
-  // optimizing alignmemt score
-  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
-      String fragment, String[] sents) {
-    if (fragment.trim().length() < 15)
-      return null;
-
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    Double dist = 0.0;
-    String result = null, followSent = null;
-    for (int i = 0; i < sents.length; i++) {
-      String s = sents[i];
-      if (s == null || s.length() < 30)
-        continue;
-      Double distCurr = meas.measureStringDistance(s, fragment);
-      if (distCurr > dist && distCurr > 0.4) {
-        result = s;
-        dist = distCurr;
-        if (i < sents.length - 1 && sents[i + 1].length() > 60) {
-          followSent = sents[i + 1];
-        }
-
-      }
-    }
-    return new String[] { result, followSent };
-  }
-
-  // given a fragment from snippet, finds an original sentence at a webpage by
-  // optimizing alignmemt score
-  public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
-      String fragment, String[] sents) {
-    if (fragment.trim().length() < 15)
-      return null;
-    int bestSentIndex = -1;
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    Double distBest = 10.0; // + sup
-    String result = null, followSent = null;
-    for (int i = 0; i < sents.length; i++) {
-      String s = sents[i];
-      if (s == null || s.length() < 30)
-        continue;
-      Double distCurr = meas.measureStringDistance(s, fragment);
-      if (distCurr > distBest) {
-        distBest = distCurr;
-        bestSentIndex = i;
-      }
-
-    }
-    if (distBest > 0.4) {
-      result = sents[bestSentIndex];
-
-      if (bestSentIndex < sents.length - 1
-          && sents[bestSentIndex + 1].length() > 60) {
-        followSent = sents[bestSentIndex + 1];
-      }
-
-    }
-
-    return new String[] { result, followSent };
-  }
-
-  public static void main(String[] args) {
-    RelatedSentenceFinder f = new RelatedSentenceFinder();
-
-    List<HitBase> hits = null;
-    try {
-      // uncomment the sentence you would like to serve as a seed sentence for
-      // content generation for an event description
-
-      // uncomment the sentence you would like to serve as a seed sentence for
-      // content generation for an event description
-      hits = f.generateContentAbout("Albert Einstein"
-      // "Britney Spears - The Femme Fatale Tour"
-      // "Rush Time Machine",
-      // "Blue Man Group" ,
-      // "Belly Dance With Zaharah",
-      // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
-      // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
-          );
-      System.out.println(HitBase.toString(hits));
-      System.out.println(HitBase.toResultantString(hits));
-      // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
-      // hits.get(0).getTitle(), hits);
-
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
+	private static Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
+	PageFetcher pFetcher = new PageFetcher();
+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+			.getInstance();
+	protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+	protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+	protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
+	protected BingQueryRunner yrunner = new BingQueryRunner();
+	protected int MAX_STEPS = 1;
+	protected int MAX_SEARCH_RESULTS = 1;
+	protected float RELEVANCE_THRESHOLD = 1.1f;
+	protected Set<String> visitedURLs = new HashSet<String>();
+
+	// used to indicate that a sentence is an opinion, so more appropriate
+	static List<String> MENTAL_VERBS = new ArrayList<String>(
+			Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
+					"accept", "agree", "allow", "appeal", "ask", "assume", "believe",
+					"check", "confirm", "convince", "deny", "disagree", "explain",
+					"ignore", "inform", "remind", "request", "suggest", "suppose",
+					"think", "threaten", "try", "understand" }));
+
+	private static final int MAX_FRAGMENT_SENTS = 10;
+
+	public RelatedSentenceFinder(int ms, int msr, float thresh, String key) {
+		this.MAX_STEPS = ms;
+		this.MAX_SEARCH_RESULTS = msr;
+		this.RELEVANCE_THRESHOLD=thresh;
+		yrunner.setKey(key);
+	}
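+	// A minimal usage sketch (illustrative; the key argument is assumed to be a
+	// valid Bing Search API key):
+	//   RelatedSentenceFinder finder = new RelatedSentenceFinder(1, 10, 1.1f, "<bing-api-key>");
+	//   List<HitBase> hits = finder.generateContentAbout("Albert Einstein");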
+
+	public RelatedSentenceFinder() {
+		// uses the default MAX_STEPS / MAX_SEARCH_RESULTS / RELEVANCE_THRESHOLD settings
+	}
+	public void setLang(String lang) {
+		yrunner.setLang(lang);
+	}
+	public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
+			List<String> sents) throws Exception {
+
+		List<HitBase> searchResult = yrunner.runSearch(word, 100);
+		return searchResult;
+	}
+
+	public List<HitBase> findRelatedOpinionsForSentence(String sentence,
+			List<String> sents) throws Exception {
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n\n=== Sentence  = " + sentence);
+		List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
+
+		for (String query : nounPhraseQueries) {
+			System.out.println("\nquery = " + query);
+			// query += " "+join(MENTAL_VERBS, " OR ") ;
+			List<HitBase> searchResult = yrunner.runSearch(query, 100);
+			if (searchResult != null) {
+				for (HitBase item : searchResult) { // got some text from .html
+					if (item.getAbstractText() != null
+							&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
+						// pdf
+						opinionSentencesToAdd
+						.add(augmentWithMinedSentencesAndVerifyRelevance(item,
+								sentence, sents));
+						
+					}
+				}
+			}
+		}
+
+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+
+	/**
+	 * Main content generation function which takes a seed (a person, rock
+	 * group, or other entity name) and produces a list of text fragments by web
+	 * mining.
+	 * 
+	 * @param sentence
+	 *          the entity name
+	 * @return List<HitBase> of text fragment structures which contain approved
+	 *         (in terms of relevance) mined sentences, as well as original search
+	 *         results objects such as doc titles, abstracts, and urls.
+	 */
+
+	public List<HitBase> generateContentAbout(String sentence) throws Exception {
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n=== Entity to write about = " + sentence);
+		List<String> nounPhraseQueries = new ArrayList<String>();
+
+		String[] extraKeywords = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity(sentence);
+		if (extraKeywords==null || extraKeywords.length<1)
+			extraKeywords = StoryDiscourseNavigator.frequentPerformingVerbs;
+		System.out.println("Found extraKeywords "+ Arrays.asList(extraKeywords));
+
+		int stepCount=0;
+		for (String verbAddition : extraKeywords) {
+			List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+					+ verbAddition, MAX_SEARCH_RESULTS); //100);
+			//TODO for shorter run
+			if (searchResult != null) {
+				if (MAX_SEARCH_RESULTS < searchResult.size())
+					searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+				for (HitBase item : searchResult) { // got some text from .html
+					if (item.getAbstractText() != null
+							&& !(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) { // exclude pdf
+						opinionSentencesToAdd
+						.add(buildParagraphOfGeneratedText(item, sentence, null));
+						visitedURLs.add(item.getUrl());
+					}
+				}
+			}
+			stepCount++;
+			if (stepCount>MAX_STEPS)
+				break;
+		}
+
+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+
+	/**
+	 * Takes a sentence and extracts noun phrases and entity names to form search
+	 * queries for finding relevant sentences on the web, which are then subject
+	 * to relevance assessment by Similarity. Search queries should not be too
+	 * general (irrelevant search results) or too specific (too few search
+	 * results).
+	 * 
+	 * @param sentence
+	 *          input sentence to form queries from
+	 * @return List<String> of search expressions
+	 */
+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
+				.getInstance();
+
+		List<ParseTreeChunk> nPhrases = pos
+				.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
+		List<String> queryArrayStr = new ArrayList<String>();
+		for (ParseTreeChunk ch : nPhrases) {
+			String query = "";
+			int size = ch.getLemmas().size();
+
+			for (int i = 0; i < size; i++) {
+				if (ch.getPOSs().get(i).startsWith("N")
+						|| ch.getPOSs().get(i).startsWith("J")) {
+					query += ch.getLemmas().get(i) + " ";
+				}
+			}
+			query = query.trim();
+			int len = query.split(" ").length;
+			if (len < 2 || len > 5)
+				continue;
+			if (len < 4) { // every word should start with capital
+				String[] qs = query.split(" ");
+				boolean bAccept = true;
+				for (String w : qs) {
+					if (w.toLowerCase().equals(w)) // if only a few words, then it
+						// has to be a person name,
+						// title or geo location
+						bAccept = false;
+				}
+				if (!bAccept)
+					continue;
+			}
+
+			query = query.trim().replace(" ", " +");
+			query = " +" + query;
+
+			queryArrayStr.add(query);
+
+		}
+		if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
+			// keywords
+			for (ParseTreeChunk ch : nPhrases) {
+				String query = "";
+				int size = ch.getLemmas().size();
+
+				for (int i = 0; i < size; i++) {
+					if (ch.getPOSs().get(i).startsWith("N")
+							|| ch.getPOSs().get(i).startsWith("J")) {
+						query += ch.getLemmas().get(i) + " ";
+					}
+				}
+				query = query.trim();
+				int len = query.split(" ").length;
+				if (len < 2)
+					continue;
+
+				query = query.trim().replace(" ", " +");
+				query = " +" + query;
+
+				queryArrayStr.add(query);
+
+			}
+		}
+
+		queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
+		queryArrayStr.add(sentence);
+
+		return queryArrayStr;
+
+	}
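+	// Example (hypothetical input): for "Albert Einstein developed the theory of
+	// relativity", the capitalized two-word noun phrase survives the filters and
+	// yields the query " +Albert +Einstein"; the raw sentence itself is appended
+	// last as a fallback query.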
+
+	/**
+	 * Remove dupes from queries to ease cleaning of dupes and avoid repetitive
+	 * searches afterwards.
+	 * 
+	 * @param hits
+	 *          List<String> of sentences (search queries, or search results
+	 *          abstracts, or titles)
+	 * @return List<String> of sentences where dupes are removed
+	 */
+	public static List<String> removeDuplicatesFromQueries(List<String> hits) {
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = 0.8; // if more similar, then considered dupes was
+		// 0.7
+		List<Integer> idsToRemove = new ArrayList<Integer>();
+		List<String> hitsDedup = new ArrayList<String>();
+		try {
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++) {
+					String title1 = hits.get(i);
+					String title2 = hits.get(j);
+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
+						continue;
+					if (meas.measureStringDistance(title1, title2) > dupeThresh) {
+						idsToRemove.add(j); // dupes found, later list member to
+						// be deleted
+
+					}
+				}
+
+			for (int i = 0; i < hits.size(); i++)
+				if (!idsToRemove.contains(i))
+					hitsDedup.add(hits.get(i));
+
+			if (hitsDedup.size() < hits.size()) {
+				LOG.info("Removed duplicates from formed query, including "
+						+ hits.get(idsToRemove.get(0)));
+			}
+
+		} catch (Exception e) {
+			LOG.severe("Problem removing duplicates from query list");
+		}
+
+		return hitsDedup;
+
+	}
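+	// Usage sketch: near-identical queries (string distance above 0.8) are
+	// collapsed, keeping the earlier list member:
+	//   removeDuplicatesFromQueries(Arrays.asList(
+	//       " +Albert +Einstein", " +Albert  +Einstein", " +Nobel +Prize"));
+	//   // => [" +Albert +Einstein", " +Nobel +Prize"]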
+
+	/**
+	 * Remove dupes from search results.
+	 * 
+	 * @param hits
+	 *          List<HitBase> of search results objects
+	 * @return List<HitBase> of search results objects where dupes are removed
+	 */
+	public static List<HitBase> removeDuplicatesFromResultantHits(
+			List<HitBase> hits) {
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = // 0.8; // if more similar, then considered dupes was
+				0.7;
+		List<Integer> idsToRemove = new ArrayList<Integer>();
+		List<HitBase> hitsDedup = new ArrayList<HitBase>();
+		try {
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++) {
+					HitBase hit2 = hits.get(j);
+					List<Fragment> fragmList1 = hits.get(i).getFragments();
+					List<Fragment> fragmList2 = hits.get(j).getFragments();
+					List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
+					for (Fragment f1 : fragmList1)
+						for (Fragment f2 : fragmList2) {
+							String sf1 = f1.getResultText();
+							String sf2 = f2.getResultText();
+							if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
+								continue;
+							if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
+								fragmList2Results.remove(f2);
+								LOG.info("Removed duplicates from formed fragments list: "
+										+ sf2);
+							}
+						}
+
+					hit2.setFragments(fragmList2Results);
+					hits.set(j, hit2);
+				}
+		} catch (Exception e) {
+			LOG.severe("Problem removing duplicates from list of fragment");
+		}
+		return hits;
+	}
+
+	/**
+	 * Takes a single search result for an entity which is the subject of the
+	 * essay to be written and forms essay sentences from the title, abstract,
+	 * and possibly the original page.
+	 * 
+	 * @param item
+	 *          HitBase search result
+	 * @param originalSentence
+	 *          seed for the essay to be written
+	 * @param sentsAll
+	 *          List<String> of other sentences in the seed if it is
+	 *          multi-sentence
+	 * @return search result augmented with mined fragments
+	 */
+
+	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
+			String originalSentence, List<String> sentsAll) {
+		if (sentsAll == null)
+			sentsAll = new ArrayList<String>();
+		// put orig sentence in structure
+		List<String> origs = new ArrayList<String>();
+		origs.add(originalSentence);
+		item.setOriginalSentences(origs);
+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+				.replace("  ", " ").replace("  ", " ");
+		// generation results for this sentence
+		List<Fragment> result = new ArrayList<Fragment>();
+		// form plain text from snippet
+		String snapshot = item.getAbstractText().replace("<b>", " ")
+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+
+
+		// fix a template expression which can be substituted by original if
+		// relevant
+		String snapshotMarked = snapshot.replace("...",
+				" _should_find_orig_ . _should_find_orig_");
+		String[] fragments = sm.splitSentences(snapshotMarked);
+		List<String> allFragms = new ArrayList<String>();
+		allFragms.addAll(Arrays.asList(fragments));
+
+		String[] sents = null;
+		String downloadedPage = null;
+		try {
+			if (snapshotMarked.length() != snapshot.length()) {
+				downloadedPage = pFetcher.fetchPage(item.getUrl());
+				if (downloadedPage != null && downloadedPage.length() > 100) {
+					item.setPageContent(downloadedPage);
+					String pageContent = Utils.fullStripHTML(item.getPageContent());
+					pageContent = GeneratedSentenceProcessor
+							.normalizeForSentenceSplitting(pageContent);
+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+					//pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",
+					//		// ". ")
+					//		.replace("..", ".").replace(". . .", " ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so
+					// we need to put '.'
+					sents = sm.splitSentences(pageContent);
+
+					sents = ContentGeneratorSupport.cleanListOfSents(sents);
+				}
+			}
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			// e.printStackTrace();
+			System.err
+			.println("Problem downloading  the page and splitting into sentences");
+			return item;
+		}
+
+		for (String fragment : allFragms) {
+			String followSent = "";
+			if (fragment.length() < 50)
+				continue;
+			String pageSentence = "";
+			// try to find original sentence from webpage
+			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+					&& sents.length > 0){
+				try { 
+					// first try sorted sentences from page by length approach
+					String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+					String[] mainAndFollowSent = null;
+
+					try {
+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+								fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+					} catch (Exception e) {
+						// TODO Auto-generated catch block
+						e.printStackTrace();
+					}
+					// if the above gives null, then try to match all sentences from the snippet fragment
+					if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+								fragment.replace("_should_find_orig_", ""), sents);
+					}
+
+					if (mainAndFollowSent!=null && mainAndFollowSent[0]!=null){
+						pageSentence = mainAndFollowSent[0];
+						for(int i = 1; i< mainAndFollowSent.length; i++)
+							if (mainAndFollowSent[i]!=null)
+								followSent+= mainAndFollowSent[i];
+					}
+
+				} catch (Exception e) {
+
+					// TODO Auto-generated catch block
+					e.printStackTrace();
+				}
+			}
+			
+			else
+				// or get original snippet
+				pageSentence = fragment;
+			if (pageSentence != null)
+				pageSentence = pageSentence.replace("_should_find_orig_", "");
+
+			// resultant sentence SHOULD NOT be longer than four times the size of
+			// the snippet fragment
+			if (pageSentence != null && pageSentence.length()>50 )
+			//		&& (float) pageSentence.length() / (float) fragment.length() < 4.0)
+			{ // was 2.0, but snippet sentences are rather short now
+
+				try { // get score from syntactic match between sentence in
+					// original text and mined sentence
+					double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+					SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+							+ " " + title, originalSentence);
+					List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+					if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+						System.out
+						.println("Rejected Sentence : No verb OR Yes imperative verb :"
+								+ pageSentence);
+						continue;
+					}
+
+					syntScore = parseTreeChunkListScorer
+							.getParseTreeChunkListScore(match);
+					System.out.println(parseTreeChunk.listToString(match) + " "
+							+ syntScore + "\n pre-processed sent = '" + pageSentence);
+
+					if (syntScore < RELEVANCE_THRESHOLD){ // was 1.5; trying other sents
+						for (String currSent : sentsAll) {
+							if (currSent.startsWith(originalSentence))
+								continue;
+							match = sm.assessRelevance(currSent, pageSentence)
+									.getMatchResult();
+							double syntScoreCurr = parseTreeChunkListScorer
+									.getParseTreeChunkListScore(match);
+							if (syntScoreCurr > syntScore) {
+								syntScore = syntScoreCurr;
+							}
+						}
+						if (syntScore > RELEVANCE_THRESHOLD) {
+							System.out.println("Got match with other sent: "
+									+ parseTreeChunk.listToString(match) + " " + syntScore);
+						}
+					}
+
+					measScore = stringDistanceMeasurer.measureStringDistance(
+							originalSentence, pageSentence);
+
+
+					if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+							&& measScore < 0.8 && pageSentence.length() > 40) // >70
+					{
+						String pageSentenceProc = GeneratedSentenceProcessor
+								.acceptableMinedSentence(pageSentence);
+						if (pageSentenceProc != null) {
+							pageSentenceProc = GeneratedSentenceProcessor
+									.processSentence(pageSentenceProc);
+							followSent = GeneratedSentenceProcessor.processSentence(followSent);
+							if (followSent != null && followSent.length() > 0) {
+								pageSentenceProc += " "+ followSent;
+							}
+
+							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+									+ mentalScore + (double) pageSentenceProc.length()
+									/ (double) 50);
+							f.setSourceURL(item.getUrl());
+							f.fragment = fragment;
+							result.add(f);
+							System.out.println("Accepted sentence: " + pageSentenceProc + " | "+followSent
+									+ "| with title= " + title);
+							System.out.println("For fragment = " + fragment);
+						} else
+							System.out
+							.println("Rejected sentence due to wrong area at webpage: "
+									+ pageSentence);
+					} else
+						System.out.println("Rejected sentence due to low score: "
+								+ pageSentence);
+					// }
+				} catch (Throwable t) {
+					t.printStackTrace();
+				}
+			}
+		}
+		item.setFragments(result);
+		return item;
+	}
+
+	
+
+	// given a fragment from snippet, finds an original sentence at a webpage by
+	// optimizing alignment score
+	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
+			String fragment, String[] sents) {
+		if (fragment.trim().length() < 15)
+			return null;
+
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		Double dist = 0.0;
+		String result = null, followSent = "";
+		for (int i = 0; i < sents.length; i++) {
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			Double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr > dist && distCurr > 0.4) {
+				result = s;
+				dist = distCurr;
+				try {
+					if (i < sents.length - 1 && sents[i + 1].length() > 60) { 
+						String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);
+						if (f1!=null){
+							followSent = f1;
+						}
+					}
+
+					if (i < sents.length - 2 && sents[i + 2].length() > 60) {
+						String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]);
+						if (f2!=null){
+							followSent += " "+f2;
+						}
+					}
+				} catch (Exception e) {
+					// TODO Auto-generated catch block
+					e.printStackTrace();
+				}
+			}
+		}
+		return new String[] { result, followSent };
+	}
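+	// Illustrative call: given a snippet fragment and the page's sentences, this
+	// returns the best-aligned full sentence (distance above 0.4) plus up to two
+	// acceptable follow-up sentences, so the mined text stays coherent:
+	//   String[] mainAndFollow =
+	//       getFullOriginalSentenceFromWebpageBySnippetFragment(fragment, sents);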
+
+	// given a fragment from snippet, finds an original sentence at a webpage by
+	// optimizing alignment score
+	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
+			String fragment, String[] sents) {
+		if (fragment.trim().length() < 15)
+			return null;
+		int bestSentIndex = -1;
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		Double distBest = 10.0; // + sup
+		String result = null, followSent = null;
+		for (int i = 0; i < sents.length; i++) {
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			Double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr > distBest) {
+				distBest = distCurr;
+				bestSentIndex = i;
+			}
+
+		}
+		if (distBest > 0.4) {
+			result = sents[bestSentIndex];
+
+			if (bestSentIndex < sents.length - 1
+					&& sents[bestSentIndex + 1].length() > 60) {
+				followSent = sents[bestSentIndex + 1];
+			}
+
+		}
+
+		return new String[] { result, followSent };
+	}
+
+	public String[] extractSentencesFromPage(String downloadedPage)
+	{
+
+		int maxSentsFromPage= 100;
+
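+		// treat long runs of spaces (typically where HTML block breaks were
+		// stripped) as chunk separators: mark them with '&', collapse runs of
+		// '&' to '#', and split the page text on '#'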
+		downloadedPage= downloadedPage.replace("     ", "&");
+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
+		String[] sents = downloadedPage.split("#");
+		List<TextChunk> sentsList = new ArrayList<TextChunk>();
+		for(String s: sents){
+			s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
+		/*	s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
+					.replace(": ", ". ").replace("- ", ". ").
+					replace (". .",".").trim(); */
+			sentsList.add(new TextChunk(s, s.length()));
+		}
+
+		Collections.sort(sentsList, new TextChunkComparable());
+		String[] longestSents = new String[maxSentsFromPage];
+		int j=0;
+		int initIndex = sentsList.size()-1 -maxSentsFromPage;
+		if (initIndex<0)
+			initIndex = 0;
+		for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){
+			longestSents[j] = sentsList.get(i).text;
+			j++;
+		}
+
+		sents = cleanSplitListOfSents(longestSents);
+
+		//sents = removeDuplicates(sents);
+		//sents = verifyEnforceStartsUpperCase(sents);
+
+		return sents;
+	}
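+	// Usage sketch (downloadedPage as returned by pFetcher.fetchPage(url) above):
+	//   String[] longestSents = extractSentencesFromPage(downloadedPage);
+	// Keeping only the longest chunks biases extraction toward body paragraphs
+	// and away from navigation boilerplate, which tends to come in short chunks.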
+
+	public class TextChunk {
+		public TextChunk(String s, int length) {
+			this.text = s;
+			this.len = length;
+		}
+		public String text;
+		public int len;
+	}
+
+	public class TextChunkComparable implements Comparator<TextChunk>
+	{
+		public int compare(TextChunk ch1, TextChunk ch2)
+		{
+			if (ch1.len>ch2.len)
+				return 1;
+			else if (ch1.len<ch2.len)
+				return  -1;
+			else return 0;
+
+		}
+	}
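+	// An equivalent, more compact comparator (a sketch; Integer.compare is
+	// available since Java 7):
+	//   Collections.sort(sentsList, new Comparator<TextChunk>() {
+	//     public int compare(TextChunk ch1, TextChunk ch2) {
+	//       return Integer.compare(ch1.len, ch2.len);
+	//     }
+	//   });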
+
+	protected String[] cleanSplitListOfSents(String[] longestSents){
+		float minFragmentLength = 40, minFragmentLengthSpace=4;
+
+		List<String> sentsClean = new ArrayList<String>();
+		for (String sentenceOrMultSent : longestSents)
+		{
+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)
+				continue;
+			if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
+				//System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
+				continue;
+			}
+			// reject portions whose average sentence length is too short,
+			// e.g. "aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n."
+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;
+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
+			if ( avgSentenceLengthInTextPortion<minFragmentLength)
+				continue;
+			// reject portions whose average word length is too short,
+			// e.g. "o oo o ooo o o o ooo oo ooo o o oo"
+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;
+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
+				continue;
+
+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
+
+			// forced split by ',' somewhere in the middle of sentence
+			// disused - Feb 26 13
+			//furtherSplit = furtherMakeSentencesShorter(furtherSplit);
+			// drop the last, likely incomplete, sentence of the portion
+			if (!furtherSplit.isEmpty())
+				furtherSplit.remove(furtherSplit.size()-1);
+			for(String s : furtherSplit){
+				if (s.indexOf('|')>-1)
+					continue;
+				s = s.replace("<em>"," ").replace("</em>"," ");
+				s = Utils.convertToASCII(s);
+				sentsClean.add(s);
+			}
+		}
+		return (String[]) sentsClean.toArray(new String[0]);
+	}
+
+	public Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){
+		if (sentsAll == null)
+			sentsAll = new ArrayList<String>();
+		// put orig sentence in structure
+		List<String> origs = new ArrayList<String>();
+		origs.add(originalSentence);
+		item.setOriginalSentences(origs);
+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+				.replace("  ", " ").replace("  ", " ");
+		// generation results for this sentence
+		List<Fragment> result = new ArrayList<Fragment>();
+		// form plain text from snippet
+		String snapshot = item.getAbstractText().replace("<b>", " ")
+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+
+
+		// fix a template expression which can be substituted by original if
+		// relevant
+		String snapshotMarked = snapshot.replace("...",
+				" _should_find_orig_ . _should_find_orig_");
+		String[] fragments = sm.splitSentences(snapshotMarked);
+		List<String> allFragms = new ArrayList<String>();
+		allFragms.addAll(Arrays.asList(fragments));
+
+		String[] sents = null;
+		String downloadedPage = null;
+		try {
+			if (snapshotMarked.length() != snapshot.length()) {
+				downloadedPage = pFetcher.fetchPage(item.getUrl());
+				if (downloadedPage != null && downloadedPage.length() > 100) {
+					item.setPageContent(downloadedPage);
+					String pageContent = Utils.fullStripHTML(item.getPageContent());
+					pageContent = GeneratedSentenceProcessor
+							.normalizeForSentenceSplitting(pageContent);
+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
+					//pageContent = pageContent.trim().replaceAll("    [A-Z]", ". $0")// .replace("  ",
+					//		// ". ")
+					//		.replace("..", ".").replace(". . .", " ").
+					//		replace(".    .",". ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so
+					// we need to put '.'
+					sents = sm.splitSentences(pageContent);
+
+					sents = ContentGeneratorSupport.cleanListOfSents(sents);
+				}
+			}
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			// e.printStackTrace();
+			System.err
+			.println("Problem downloading  the page and splitting into sentences");
+			return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
+		}
+		return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
+	}
+
+	String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){
+		String[] mainAndFollowSent = null;
+
+		String downloadedPage = (String)fragmentExtractionResults.getSecond();
+		String[] sents = (String[])fragmentExtractionResults.getThird();
+
+		if (fragment.length() < 50)
+			return null;
+		// try to find original sentence from webpage
+		if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+				&& sents.length > 0){
+			try { 
+				// first try sorted sentences from page by length approach
+				String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+
+
+				try {
+					mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+							fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+				} catch (Exception e) {
+					// TODO Auto-generated catch block
+					e.printStackTrace();
+				}
+				// if the above gives null, then try to match all sentences from the snippet fragment
+				if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
+					mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+							fragment.replace("_should_find_orig_", ""), sents);
+				}
+
+
+			} catch (Exception e) {
+
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		}
+		else
+			// or fall back to the original snippet fragment itself
+			mainAndFollowSent = new String[] { fragment.replace("_should_find_orig_", "") };
+
+		return mainAndFollowSent;
+
+	}	
+
+	private Fragment verifyCandidateSentencesAndFormParagraph(
+			String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
+		Fragment result = null;	
+
+		String pageSentence = candidateSentences[0];
+		String followSent = "";
+		for(int i = 1; i< candidateSentences.length; i++)
+			if (candidateSentences[i]!=null)
+				followSent+= candidateSentences[i];
+		String title = item.getTitle();
+
+		// resultant sentence SHOULD NOT be longer than four times the size of
+		// the snippet fragment
+		if (!(pageSentence != null && pageSentence.length()>50) ){
+			System.out.println("Cannot accept the sentence (null or too short) = "+ pageSentence);
+			return null;
+		}
+
+
+		try { // get score from syntactic match between sentence in
+			// original text and mined sentence
+			double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+			SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+					+ " " + title, originalSentence);
+			List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+			if (match==null || match.size()<1){
+				System.out
+				.println("Rejected Sentence : empty match "+ pageSentence);
+				return null;
+			}
+			
+			if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+				System.out
+				.println("Rejected Sentence : No verb OR Yes imperative verb :"
+						+ pageSentence);
+				return null;
+			}
+
+			syntScore = parseTreeChunkListScorer
+					.getParseTreeChunkListScore(match);
+			System.out.println(parseTreeChunk.listToString(match) + " "
+					+ syntScore + "\n pre-processed sent = '" + pageSentence);
+
+			try {
+				if (sentsAll!=null && syntScore < RELEVANCE_THRESHOLD){ // was 1.5; trying other sents
+					for (String currSent : sentsAll) {
+						if (currSent.startsWith(originalSentence))
+							continue;
+						match = sm.assessRelevance(currSent, pageSentence)
+								.getMatchResult();
+						double syntScoreCurr = parseTreeChunkListScorer
+								.getParseTreeChunkListScore(match);
+						if (syntScoreCurr > syntScore) {
+							syntScore = syntScoreCurr;
+						}
+					}
+					if (syntScore > RELEVANCE_THRESHOLD) {
+						System.out.println("Got match with other sent: "
+								+ parseTreeChunk.listToString(match) + " " + syntScore);
+					}
+				}
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+
+			measScore = stringDistanceMeasurer.measureStringDistance(
+					originalSentence, pageSentence);
+
+
+			if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
+					&& measScore < 0.8 && pageSentence.length() > 40) // >70
+			{
+				String pageSentenceProc = GeneratedSentenceProcessor
+						.acceptableMinedSentence(pageSentence);
+				if (pageSentenceProc != null) {
+					pageSentenceProc = GeneratedSentenceProcessor
+							.processSentence(pageSentenceProc);
+					followSent = GeneratedSentenceProcessor.processSentence(followSent);
+					if (followSent != null && followSent.length() > 0) {
+						pageSentenceProc += " "+ followSent;
+					}
+
+					pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+					result = new Fragment(pageSentenceProc, syntScore + measScore
+							+ mentalScore + (double) pageSentenceProc.length()
+							/ (double) 50);
+					result.setSourceURL(item.getUrl());
+					result.fragment = fragment;
+
+					System.out.println("Accepted sentence: " + pageSentenceProc
+							+ "| with title= " + title);
+					System.out.println("For fragment = " + fragment);
+				} else
+					System.out
+					.println("Rejected sentence due to wrong area at webpage: "
+							+ pageSentence);
+			} else
+				System.out.println("Rejected sentence due to low score: "
+						+ pageSentence);
+			// }
+		} catch (Throwable t) {
+			t.printStackTrace();
+		}
+
+	return result;
+}
+
+public HitBase buildParagraphOfGeneratedText(HitBase item,
+		String originalSentence, List<String> sentsAll) {
+	List<Fragment> results = new ArrayList<Fragment>();
+	
+	Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);
+
+	List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
+
+	for (String fragment : allFragms) {
+		String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
+		if (candidateSentences == null)
+			continue;
+		Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
+		if (res!=null)
+			results.add(res);
+
+	}
+	
+	item.setFragments(results);
+	return item;
+}
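+
+// Usage sketch (illustrative; 'hit', 'seedSentence' and 'otherSeedSents' are
+// hypothetical variables): the paragraph builder composes the three stages
+// above, fragment extraction, candidate sentence formation, and verification,
+// for a single search hit:
+//
+//   RelatedSentenceFinder finder = new RelatedSentenceFinder();
+//   HitBase enriched = finder.buildParagraphOfGeneratedText(hit, seedSentence, otherSeedSents);
+//   for (Fragment fr : enriched.getFragments())
+//     System.out.println(fr);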
+
+public static void main(String[] args) {
+	RelatedSentenceFinder f = new RelatedSentenceFinder();
+
+	List<HitBase> hits = null;
+	try {
+		// uncomment the sentence you would like to serve as a seed sentence
+		// for content generation for an event description
+		hits = f.generateContentAbout("Albert Einstein"
+				// "Britney Spears - The Femme Fatale Tour"
+				// "Rush Time Machine",
+				// "Blue Man Group" ,
+				// "Belly Dance With Zaharah",
+				// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+				// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+				);
+		System.out.println(HitBase.toString(hits));
+		System.out.println(HitBase.toResultantString(hits));
+		// WordFileGenerator.createWordDoc("Essay about Albert Einstein",
+		// hits.get(0).getTitle(), hits);
+
+	} catch (Exception e) {
+		e.printStackTrace();
+	}
+
+}
+
 
-  }
 
 }
\ No newline at end of file

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Logger;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+/*
+ * Performs content generation for languages other than English (e.g. ES, DE),
+ * where no syntactic parser is available: candidate sentences are scored by
+ * keyword overlap instead of parse-tree matching.
+ */
+
+public class RelatedSentenceFinderML extends RelatedSentenceFinder {
+	private static Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinderML");
+
+
+	public RelatedSentenceFinderML(int ms, int msr, float thresh, String key) {
+		this.MAX_STEPS = ms;
+		this.MAX_SEARCH_RESULTS = msr;
+		this.RELEVANCE_THRESHOLD=thresh;
+		yrunner.setKey(key);
+	}
+
+	public RelatedSentenceFinderML() {
+		// use the default search and relevance settings
+	}
+
+	public List<HitBase> generateContentAbout(String sentence) throws Exception {
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n=== Entity to write about = " + sentence);
+		List<HitBase> searchResult = yrunner.runSearch(sentence, 100);
+		if (searchResult != null) {
+			if (MAX_SEARCH_RESULTS < searchResult.size()) // cap the result list for a shorter run
+				searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
+			for (HitBase item : searchResult) { // got some text from .html
+				if (item.getAbstractText() != null
+						&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
+					opinionSentencesToAdd
+					.add(augmentWithMinedSentencesAndVerifyRelevance(item,
+							sentence, null));
+				}
+			}
+		}
+
+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+
+
+	/**
+	 * Takes a single search result for the entity which is the subject of the
+	 * essay to be written, and forms essay sentences from the title, the
+	 * abstract, and possibly the original page.
+	 * 
+	 * @param item search result
+	 * @param originalSentence seed for the essay to be written
+	 * @param sentsAll other sentences in the seed, if it is multi-sentence
+	 * @return the search result augmented with mined fragments
+	 */
+
+	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
+			String originalSentence, List<String> sentsAll) {
+		if (sentsAll == null)
+			sentsAll = new ArrayList<String>();
+		// put orig sentence in structure
+		List<String> origs = new ArrayList<String>();
+		origs.add(originalSentence);
+		item.setOriginalSentences(origs);
+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+				.replace("  ", " ").replace("  ", " ");
+		// generation results for this sentence
+		List<Fragment> result = new ArrayList<Fragment>();
+		// form plain text from snippet
+		String snapshot = item.getAbstractText().replace("<b>", " ")
+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+
+
+		// mark the '...' template expression so that it can be substituted by
+		// the original sentence from the page, if a relevant one is found
+		String snapshotMarked = snapshot.replace("...",
+				" _should_find_orig_ . _should_find_orig_");
+		String[] fragments = sm.splitSentences(snapshotMarked);
+		List<String> allFragms = new ArrayList<String>();
+		allFragms.addAll(Arrays.asList(fragments));
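+		// Illustrative example (hypothetical snippet): a snippet such as
+		// "Einstein was born in Ulm ... He later moved to Berlin" becomes
+		// "Einstein was born in Ulm _should_find_orig_ . _should_find_orig_ He later moved to Berlin",
+		// so sentence splitting yields fragments that still carry the marker,
+		// signalling that the full original sentence must be recovered from the page.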
+
+		String[] sents = null;
+		String downloadedPage = null;
+		try {
+			if (snapshotMarked.length() != snapshot.length()) {
+				downloadedPage = pFetcher.fetchPage(item.getUrl());
+				if (downloadedPage != null && downloadedPage.length() > 100) {
+					item.setPageContent(downloadedPage);
+					String pageContent = Utils.fullStripHTML(item.getPageContent());
+					pageContent = GeneratedSentenceProcessor
+							.normalizeForSentenceSplitting(pageContent);
+					// html breaks are sometimes converted into two spaces, so
+					// restore the sentence boundary with '.' before a capital
+					pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
+							.replace("..", ".").replace(". . .", " ").trim();
+					sents = sm.splitSentences(pageContent);
+
+					sents = ContentGeneratorSupport.cleanListOfSents(sents);
+				}
+			}
+		} catch (Exception e) {
+			System.err
+			.println("Problem downloading the page and splitting into sentences");
+			return item;
+		}
+
+		for (String fragment : allFragms) {
+			String followSent = null;
+			if (fragment.length() < 50)
+				continue;
+			String pageSentence = "";
+			// try to find original sentence from webpage
+			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+					&& sents.length > 0)
+				try {
+					// first try sentences from the page, sorted by length
+					String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
+					String[] mainAndFollowSent = null;
+
+					try {
+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+								fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+					} catch (Exception e) {
+						e.printStackTrace();
+					}
+					// if the above gives null, then try to match the snippet
+					// fragment against all sentences from the page
+					if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+								fragment.replace("_should_find_orig_", ""), sents);
+					}
+					// keep the recovered sentence and the one following it
+					if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
+						pageSentence = mainAndFollowSent[0];
+						followSent = mainAndFollowSent[1];
+					}
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			else
+				// or get original snippet
+				pageSentence = fragment;
+			if (pageSentence != null)
+				pageSentence = pageSentence.replace("_should_find_orig_", "");
+
+			// the resultant sentence should not be more than four times longer
+			// than the snippet fragment (relaxed from the original factor of 2)
+			if (pageSentence != null
+					&& (float) pageSentence.length() / (float) fragment.length() < 4.0) {
+
+				try { // score the mined sentence against the original text;
+					// keyword overlap is used here instead of syntactic matching
+					double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+					syntScore = calculateKeywordScore(pageSentence + " " + title, originalSentence);
+
+
+					if (syntScore < RELEVANCE_THRESHOLD) { // try the other seed sentences
+						for (String currSent : sentsAll) {
+							if (currSent.startsWith(originalSentence))
+								continue;
+							double syntScoreCurr = calculateKeywordScore(currSent, pageSentence);
+							if (syntScoreCurr > syntScore) {
+								syntScore = syntScoreCurr;
+							}
+						}
+						if (syntScore > RELEVANCE_THRESHOLD) {
+							System.out.println("Got match with other sent: " + syntScore);
+						}
+					}
+
+					measScore = stringDistanceMeasurer.measureStringDistance(
+							originalSentence, pageSentence);
+
+					// now possibly increase score by finding mental verbs
+					// indicating opinions
+					for (String s : MENTAL_VERBS) {
+						if (pageSentence.indexOf(s) > -1) {
+							mentalScore += 0.3;
+							break;
+						}
+					}
+
+					if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5 || mentalScore > 0.5)
+							&& measScore < 0.8 && pageSentence.length() > 40)
+					{
+						String pageSentenceProc = GeneratedSentenceProcessor
+								.acceptableMinedSentence(pageSentence);
+						if (pageSentenceProc != null) {
+							pageSentenceProc = GeneratedSentenceProcessor
+									.processSentence(pageSentenceProc);
+							if (followSent != null) {
+								pageSentenceProc += " "
+										+ GeneratedSentenceProcessor.processSentence(followSent);
+							}
+
+							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+									+ mentalScore + (double) pageSentenceProc.length()
+									/ (double) 50);
+							f.setSourceURL(item.getUrl());
+							f.fragment = fragment;
+							result.add(f);
+							System.out.println("Accepted sentence: " + pageSentenceProc
+									+ "| with title= " + title);
+							System.out.println("For fragment = " + fragment);
+						} else
+							System.out
+							.println("Rejected sentence due to wrong area at webpage: "
+									+ pageSentence);
+					} else
+						System.out.println("Rejected sentence due to low score: "
+								+ pageSentence);
+				} catch (Throwable t) {
+					t.printStackTrace();
+				}
+			}
+		}
+		item.setFragments(result);
+		return item;
+	}
+
+	// relevance score = number of overlapping tokens between the two sentences
+	private double calculateKeywordScore(String currSent, String pageSentence) {
+		List<String> list1 = TextProcessor.fastTokenize(currSent, false);
+		List<String> list2 = TextProcessor.fastTokenize(pageSentence, false);
+		List<String> overlap = new ArrayList<String>(list1);
+		overlap.retainAll(list2);
+		return overlap.size();
+	}
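+
+	// Illustrative example (hypothetical inputs, assuming case-sensitive
+	// whitespace tokenization): for
+	//   currSent     = "Albert Einstein developed the theory of relativity"
+	//   pageSentence = "Einstein published the theory in 1915"
+	// the tokens of currSent retained in the overlap are {"Einstein", "the",
+	// "theory"}, so the score would be 3.0. Tokens are neither stemmed nor
+	// stop-word filtered, so frequent function words also count.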
+
+
+	public static void main(String[] args) {
+		RelatedSentenceFinderML f = new RelatedSentenceFinderML();
+
+		List<HitBase> hits = null;
+		try {
+			// uncomment the sentence you would like to serve as a seed sentence
+			// for content generation for an event description
+			hits = f.generateContentAbout("Albert Einstein"
+					// "Britney Spears - The Femme Fatale Tour"
+					// "Rush Time Machine",
+					// "Blue Man Group" ,
+					// "Belly Dance With Zaharah",
+					// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+					// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+					);
+			System.out.println(HitBase.toString(hits));
+			System.out.println(HitBase.toResultantString(hits));
+			// WordFileGenerator.createWordDoc("Essay about Albert Einstein",
+			// hits.get(0).getTitle(), hits);
+
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+
+	}
+
+}
\ No newline at end of file

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Mon Jan  6 17:48:30 2014
@@ -26,7 +26,7 @@ import opennlp.tools.textsimilarity.Pars
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
-public class SearchResultsProcessor extends BingWebQueryRunner {
+public class SearchResultsProcessor extends BingQueryRunner {
   private static Logger LOG = Logger
       .getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
@@ -92,7 +92,7 @@ public class SearchResultsProcessor exte
   public List<HitBase> runSearchViaAPI(String query) {
 	List<HitBase> hits = null;
     try {
-      List<HitBase> resultList = runSearch(query, 30);
+      List<HitBase> resultList = runSearch(query);
       // now we apply our own relevance filter
       hits = calculateMatchScoreResortHits(resultList, query);
 
@@ -102,7 +102,6 @@ public class SearchResultsProcessor exte
       return null;
     }
 
-    hits = removeDuplicates(hits, 0.9);
 
     return hits;
   }

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java Mon Jan  6 17:48:30 2014
@@ -101,8 +101,6 @@ public class SpeechRecognitionResultsPro
     double bestSentScore = -1;
     String bestSent = null;
     for (String sentence : sents) {
-      BingResponse resp = null, // obtained from bing
-      newResp = null; // re-sorted based on similarity
       try {
         List<HitBase> resultList = scraper.runSearch(sentence);
         double scoreForSentence = calculateTotalMatchScoreForHits(resultList,

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java?rev=1555944&r1=1555943&r2=1555944&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java Mon Jan  6 17:48:30 2014
@@ -17,22 +17,115 @@
 
 package opennlp.tools.similarity.apps;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import opennlp.tools.similarity.apps.utils.StringCleaner;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
 public class StoryDiscourseNavigator {
-  public static final String[] frequentPerformingVerbs = {
-      " born raised meet learn ", " graduated enter discover",
-      " facts inventions life ", "accomplishments childhood timeline",
-      " acquire befriend encounter", " achieve reache describe ",
-      " invent innovate improve ", " impress outstanding award",
-      " curous sceptical pessimistic", " spend enroll assume point",
-      " explain discuss dispute", " learn teach study investigate",
-      " propose suggest indicate", " pioneer explorer discoverer ",
-      " advance promote lead", " direct control simulate ",
-      " guide lead assist ", " inspire first initial",
-      " vision predict foresee", " prediction inspiration achievement",
-      " approve agree confirm", " deny argue disagree",
-      " emotional loud imagination", " release announce celebrate discover",
-      "introduce enjoy follow", " open present show",
-      "meet enjoy follow create", "discover continue produce"
+	protected BingQueryRunner yrunner = new BingQueryRunner();
+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
+			.getInstance();
+	private PorterStemmer ps = new PorterStemmer();
+
+	public static final String[] frequentPerformingVerbs = {
+		" born raised meet learn ", " graduated enter discover",
+		" facts inventions life ", "accomplishments childhood timeline",
+		" acquire befriend encounter", " achieve reach describe ",
+		" invent innovate improve ", " impress outstanding award",
+		" curious sceptical pessimistic", " spend enroll assume point",
+		" explain discuss dispute", " learn teach study investigate",
+		" propose suggest indicate", " pioneer explorer discoverer ",
+		" advance promote lead", " direct control simulate ",
+		" guide lead assist ", " inspire first initial",
+		" vision predict foresee", " prediction inspiration achievement",
+		" approve agree confirm", " deny argue disagree",
+		" emotional loud imagination", " release announce celebrate discover",
+		"introduce enjoy follow", " open present show",
+		"meet enjoy follow create", "discover continue produce"
+
+	};
+
+	public String[] obtainAdditionalKeywordsForAnEntity(String entity){
+		List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
+				entity, "", "en", 30);
+		Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false);
+		List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList);
+		String[] res = new String[resList.size()];
+		int i=0;
+		for(List<String> phrase: resList){
+			phrase.removeAll(keywordsToRemove);
+			String keywords = phrase.toString().replace('[', ' ').replace(']', ' ').replace(',',' ');
+			res[i] = keywords;
+			i++;
+		}
+		return res;
+	}
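+
+	// Usage sketch (illustrative): each returned string is a space-separated
+	// keyword group mined from pairwise matches of search-result snippets,
+	// intended to refine a follow-up query about the entity, e.g.
+	//   String[] extras = new StoryDiscourseNavigator()
+	//       .obtainAdditionalKeywordsForAnEntity("Albert Einstein");
+	//   String refinedQuery = "Albert Einstein " + extras[0];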
+
+	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
+			String domain, String lang, int numbOfHits) {
+		List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
+		try {
+			List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);
+
+			for (int i = 0; i < resultList.size(); i++) {
+				for (int j = i + 1; j < resultList.size(); j++) {
+					HitBase h1 = resultList.get(i);
+					HitBase h2 = resultList.get(j);
+					String snapshot1 = StringCleaner.processSnapshotForMatching(h1
+							.getTitle() + " . " + h1.getAbstractText());
+					String snapshot2 = StringCleaner.processSnapshotForMatching(h2
+							.getTitle() + " . " + h2.getAbstractText());
+					SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
+							snapshot2);
+					List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
+					genResult.addAll(matchResult);
+				}
+			}
+
+		} catch (Exception e) {
+			System.err.println("Problem extracting taxonomy node");
+		}
 
-  };
+		return genResult;
+	}
+	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
+			List<List<ParseTreeChunk>> matchList) {
+		List<List<String>> res = new ArrayList<List<String>>();
+		for (List<ParseTreeChunk> chunks : matchList) {
+			List<String> wordRes = new ArrayList<String>();
+			for (ParseTreeChunk ch : chunks) {
+				List<String> lemmas = ch.getLemmas();
+				for (int w = 0; w < lemmas.size(); w++)
+					if ((!lemmas.get(w).equals("*"))
+							&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
+									.startsWith("VB"))) && lemmas.get(w).length() > 2) {
+						String formedWord = lemmas.get(w);
+						String stemmedFormedWord = ps.stem(formedWord);
+						if (!stemmedFormedWord.startsWith("invalid"))
+							wordRes.add(formedWord);
+					}
+			}
+			wordRes = new ArrayList<String>(new HashSet<String>(wordRes));	   
+			if (wordRes.size() > 0) {
+				res.add(wordRes);
+			}
+		}
+		res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
+		return res;
+	}
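+
+	// Illustrative note (hypothetical chunk): for matched-chunk lemmas
+	// ["Einstein", "*", "theory"] with POS tags ["NNP", "*", "NN"], only
+	// "Einstein" and "theory" survive the filter (nouns/verbs longer than two
+	// characters, the "*" wildcard excluded); stemming is apparently used only
+	// to drop words whose stems are flagged as invalid.
+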
+	public static void main(String[] args){
+		String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
+		System.out.println(Arrays.asList(res));
+	}
 }

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java?rev=1555944&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java Mon Jan  6 17:48:30 2014
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.Triple;
+
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+public class YahooAnswersMiner extends BingQueryRunner{
+
+	private static final Logger LOG = Logger
+			.getLogger("opennlp.tools.similarity.apps.YahooAnswersMiner");
+	private int page = 0;
+	private static final int hitsPerPage = 50;
+
+	public List<HitBase> runSearch(String query) {
+		aq.setAppid(BING_KEY);
+		aq.setQuery("site:answers.yahoo.com " + query);
+		aq.setPerPage(hitsPerPage);
+		aq.setPage(page);
+
+		aq.doQuery();
+		List<HitBase> results = new ArrayList<HitBase> ();
+		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+
+		for (AzureSearchWebResult anr : ars){
+			HitBase h = new HitBase();
+			h.setAbstractText(anr.getDescription());
+			h.setTitle(anr.getTitle());
+			h.setUrl(anr.getUrl());
+			results.add(h);
+		}
+		page++;
+
+		return results;
+	}
+
+
+	public List<HitBase> runSearch(String query, int totalHits) {
+		int count = 0;
+		List<HitBase> results = new ArrayList<HitBase>();
+		// keep requesting pages of hitsPerPage results until the requested
+		// number of hits has been accumulated (page is advanced by runSearch)
+		while (totalHits > page * hitsPerPage) {
+			List<HitBase> res = runSearch(query);
+			results.addAll(res);
+			if (count > 10) // safety cap against runaway paging
+				break;
+			count++;
+		}
+
+		return results;
+	}
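+
+	// Example: runSearch(query, 150) with hitsPerPage = 50 issues three paged
+	// requests (pages 0, 1 and 2); paging stops once page * hitsPerPage
+	// reaches the requested total.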
+
+
+	public static void main(String[] args) {
+		YahooAnswersMiner self = new YahooAnswersMiner();
+		RelatedSentenceFinder extractor = new RelatedSentenceFinder();
+		String topic = "obamacare";
+
+		List<HitBase> resp = self
+				.runSearch(topic, 150);
+		System.out.print(resp.get(0));
+		List<String[]> data = new ArrayList<String[]>();
+
+
+		for(HitBase item: resp){	      
+			Triple<List<String>, String, String[]> fragmentExtractionResults = 
+					extractor.formCandidateFragmentsForPage(item, topic, null);
+
+			List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
+			String downloadedPage = (String)fragmentExtractionResults.getSecond();
+			String[] sents = (String[])fragmentExtractionResults.getThird();
+
+			for (String fragment : allFragms) {
+				String[] candidateSentences = extractor.formCandidateSentences(fragment, fragmentExtractionResults);
+				if (candidateSentences == null)
+					continue;
+				System.out.println(Arrays.toString(candidateSentences));
+				data.add(candidateSentences);
+			}
+			
+		}
+
+		ProfileReaderWriter.writeReport(data, "multi_sentence_queries.csv");
+
+	}
+
+}