You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/20 23:28:50 UTC

svn commit: r1187056 [1/3] - in /incubator/opennlp/sandbox/opennlp-similarity: ./ src/main/java/opennlp/tools/similarity/apps/ src/main/java/opennlp/tools/similarity/apps/utils/ src/main/java/opennlp/tools/textsimilarity/ src/main/java/opennlp/tools/te...

Author: joern
Date: Thu Oct 20 21:28:45 2011
New Revision: 1187056

URL: http://svn.apache.org/viewvc?rev=1187056&view=rev
Log:
OPENNLP-323 Fixed dependencies, and compatibility changed to work with 1.5.2. Applied patch provided by Boris Galitsky. Thanks for providing the patch.

Added:
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/gen.txt   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/EpistemicStatesTrainingSet.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParserChunker2MatcherOlderOpenNLP.java.txt   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserConstants.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java   (with props)
Removed:
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParagraphClassifierTest.java
Modified:
    incubator/opennlp/sandbox/opennlp-similarity/pom.xml
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/GeneralizationListReducerTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/LemmaFormManagerTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkFactoryTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorerTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministicTest.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java

Modified: incubator/opennlp/sandbox/opennlp-similarity/pom.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/pom.xml?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/pom.xml (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/pom.xml Thu Oct 20 21:28:45 2011
@@ -53,6 +53,23 @@
 			<version>4.8.1</version>
 			<scope>test</scope>
 		</dependency>
+			<dependency>
+			<groupId>commons-lang</groupId>
+			<artifactId>commons-lang</artifactId>
+			<version>2.5</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.json</groupId>
+			<artifactId>json</artifactId>
+			<version>20090211</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.apache.tika</groupId>
+			<artifactId>tika-core</artifactId>
+			<version>0.7</version>
+		</dependency>
 	</dependencies>
 	
 	<build>

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Thu Oct 20 21:28:45 2011
@@ -26,18 +26,17 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
 import org.json.JSONArray;
 import org.json.JSONObject;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 public class BingQueryRunner {
-  protected static final String APP_ID = "XXX";
+  protected static final String APP_ID = "DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
+  //TODO user needs to have own APP_ID from Bing API
 
   private float snapshotSimilarityThreshold = 0.4f;
 
-  private static final Logger LOG = LoggerFactory
-      .getLogger(BingQueryRunner.class);
+  private static final Logger LOG = Logger.getLogger(BingQueryRunner.class);
 
   public void setSnapshotSimilarityThreshold(float thr) {
     snapshotSimilarityThreshold = thr;
@@ -54,7 +53,6 @@ public class BingQueryRunner {
   private String constructBingUrl(String query, String domainWeb, String lang,
       int numbOfHits) throws Exception {
     String codedQuery = URLEncoder.encode(query, "UTF-8");
-    // http://boss.yahooapis.com/ysearch/news/v1/{query}?appid=xyz[&param1=val1&param2=val2&etc
     String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
         + APP_ID + "&query=" + codedQuery // +
         // "&sources=web"+
@@ -92,7 +90,6 @@ public class BingQueryRunner {
   public BingResponse populateBingHit(String response) throws Exception {
     BingResponse resp = new BingResponse();
     JSONObject rootObject = new JSONObject(response);
-    // each response is object that under the key of "ysearchresponse"
     JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
     JSONObject web = responseObject.getJSONObject("News");
 

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java Thu Oct 20 21:28:45 2011
@@ -27,23 +27,20 @@ import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
 
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+
 import org.apache.commons.lang.StringUtils;
 import org.json.JSONArray;
 import org.json.JSONObject;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.context.annotation.Profile;
-import org.springframework.stereotype.Component;
-
-import com.zvents.ce.common.util.StringDistanceMeasurer;
-import com.zvents.recommend.event_event.utils.CSVWriter;
 
 public class BingWebQueryRunner {
   private static final Logger LOG = LoggerFactory
       .getLogger(BingWebQueryRunner.class);
 
-  private String constructBingWebUrl(String query, String domainWeb,
-      String lang, int numbOfHits) throws Exception {
+  private String constructBingWebUrl(String query, int numbOfHits) throws Exception {
     String codedQuery = URLEncoder.encode(query, "UTF-8");
 
     String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
@@ -60,10 +57,7 @@ public class BingWebQueryRunner {
     return yahooRequest;
   }
 
-  /*
-     *  
-     */
-
+  
   public BingResponse populateBingHit(String response) throws Exception {
     BingResponse resp = new BingResponse();
     JSONObject rootObject = new JSONObject(response);
@@ -103,7 +97,7 @@ public class BingWebQueryRunner {
 
   public ArrayList<String> search(String query, String domainWeb, String lang,
       int numbOfHits) throws Exception {
-    URL url = new URL(constructBingWebUrl(query, domainWeb, lang, numbOfHits));
+    URL url = new URL(constructBingWebUrl(query, numbOfHits));
     URLConnection connection = url.openConnection();
 
     String line;
@@ -136,24 +130,10 @@ public class BingWebQueryRunner {
 
     hits = removeDuplicates(hits, 0.9);
 
-    writeHitsToCsv(query, hits);
-
     return hits;
   }
 
-  protected void writeHitsToCsv(String query, List<HitBase> hits) {
-    try {
-      CSVWriter writer = new CSVWriter(new FileWriter("bingSearchResults.csv",
-          true));
-      for (HitBase hit : hits) {
-        writer.writeNext(new String[] { query, hit.getTitle(), hit.getUrl() });
-      }
-      writer.close();
-    } catch (IOException e) {
-      LOG.error(e.getMessage(), e);
-    }
-  }
-
+  
   public List<HitBase> runSearch(String query, int num) {
     BingResponse resp = null;
     try {

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java Thu Oct 20 21:28:45 2011
@@ -17,7 +17,7 @@
 
 package opennlp.tools.similarity.apps;
 
-import com.zvents.ce.common.util.StringDistanceMeasurer;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
 
 public class Fragment {
 

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java Thu Oct 20 21:28:45 2011
@@ -20,156 +20,171 @@ package opennlp.tools.similarity.apps;
 import java.util.Arrays;
 import java.util.List;
 
+import opennlp.tools.similarity.apps.utils.Utils;
+
 import org.apache.commons.lang.StringUtils;
 
-import com.zvents.bing.Fragment;
-import com.zvents.bing.HitBase;
-import com.zvents.cg.RelatedSentenceFinder;
-import com.zvents.recommend.event_event.utils.Utils;
 
 public class GeneratedSentenceProcessor {
-  public static String acceptableMinedSentence(String sent) {
-    // if too many commas => seo text
-
-    String[] commas = StringUtils.split(sent, ',');
-    String[] spaces = StringUtils.split(sent, ' ');
-    if ((float) commas.length / (float) spaces.length > 0.7) {
-      System.out.println("Rejection: too many commas");
-      return null;
-    }
-
-    String[] pipes = StringUtils.split(sent, '|');
-    if (StringUtils.split(sent, '|').length > 2
-        || StringUtils.split(sent, '>').length > 2) {
-      System.out.println("Rejection: too many |s or >s ");
-      return null;
-    }
-    String sentTry = sent.toLowerCase();
-    // if too many long spaces
-    String sentSpaces = sentTry.replace("   ", "");
-    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -
-      // suspicious
-      return null;
-
-    if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1
-        || sentTry.indexOf("copyright") > -1
-        || sentTry.indexOf("operating hours") > -1
-        || sentTry.indexOf("days per week") > -1
-        || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1
-        || sentTry.indexOf("find the latest") > -1
-        || sentTry.startsWith("subscribe")
-        || sentTry.indexOf("Terms of Service") > -1
-        || sentTry.indexOf("clicking here") > -1
-        || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1
-        || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")
-        || sentTry.indexOf("available online") > 0
-        || sentTry.indexOf("get online") > 0
-        || sentTry.indexOf("buy online") > 0
-        || sentTry.indexOf("not valid") > 0 || sentTry.indexOf("discount") > 0
-        || sentTry.indexOf("official site") > 0
-        || sentTry.indexOf("discount") > 0
-        || sentTry.indexOf("Related searches") > 0
-        || sentTry.indexOf("Permission is granted") > 0
-        || sentTry.indexOf("Free license") > 0
-        || sentTry.indexOf("Permission is granted") > 0
-        || sentTry.indexOf("under the terms") > 0
-
-        || sentTry.indexOf("wikipedia") > 0 || sentTry.endsWith("the")
-        || sentTry.endsWith("the."))
-      return null;
-
-    // count symbols indicating wrong parts of page to mine for text
-    // if short and contains too many symbols indicating wrong area: reject
-    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")
-        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")
-        .replace("-", "&&&").replace("%", "&&&");
-    if ((sentWrongSym.length() - sentTry.length()) >= 4
-        && sentTry.length() < 200) // twice ot more
-      return null;
-
-    sent = sent.replace('[', ' ').replace(']', ' ')
-        .replace("_should_find_orig_", "").replace(".   .", ". ")
-        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")
-        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")
-        .replace("2008", "2011").replace("2006", "2011")
-        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")
-        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")
-        .replace("(more.)", "").replace("more.", "").replace("<more>", "")
-        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")
-        .replace("p&gt;", "");
-    // TODO .replace("a.", ".");
-
-    int endIndex = sent.indexOf(" posted");
-    if (endIndex > 0)
-      sent = sent.substring(0, endIndex);
-
-    return sent;
-  }
-
-  public static String processSentence(String pageSentence) {
-    if (pageSentence == null)
-      return "";
-    pageSentence = Utils.fullStripHTML(pageSentence);
-    pageSentence = StringUtils.chomp(pageSentence, "..");
-    pageSentence = StringUtils.chomp(pageSentence, ". .");
-    pageSentence = StringUtils.chomp(pageSentence, " .");
-    pageSentence = StringUtils.chomp(pageSentence, ".");
-    pageSentence = StringUtils.chomp(pageSentence, "...");
-    pageSentence = StringUtils.chomp(pageSentence, " ....");
-    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")
-        .replace("(.)", "");
-
-    pageSentence = pageSentence.trim();
-    pageSentence = pageSentence.replaceAll("\\s+", " "); // make single
-    // spaces
-    // everywhere
-
-    String[] pipes = StringUtils.split(pageSentence, '|'); // removed
-    // shorter part
-    // of sentence
-    // at the end
-    // after pipe
-    if (pipes.length == 2
-        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {
-      int pipePos = pageSentence.indexOf("|");
-      if (pipePos > -1)
-        pageSentence = pageSentence.substring(0, pipePos - 1).trim();
-
-    }
-
-    if (!StringUtils.contains(pageSentence, '.')
-        && !StringUtils.contains(pageSentence, '?')
-        && !StringUtils.contains(pageSentence, '!'))
-      pageSentence = pageSentence + ". ";
-
-    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
-    if (!pageSentence.endsWith("."))
-      pageSentence += ". ";
-    return pageSentence;
-  }
-
-  public static void main(String[] args) {
-    RelatedSentenceFinder f = new RelatedSentenceFinder();
-    try {
-      List<HitBase> hits = f
-          .findRelatedOpinionsForSentence(
-              "Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",
-              Arrays
-                  .asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));
-      StringBuffer buf = new StringBuffer();
-
-      for (HitBase h : hits) {
-        List<Fragment> frags = h.getFragments();
-        for (Fragment fr : frags) {
-          if (fr.getResultText() != null && fr.getResultText().length() > 3)
-            buf.append(fr.getResultText());
-        }
-      }
-
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      e.printStackTrace();
-    }
-
-  }
-}
+		public static String acceptableMinedSentence(String sent)
+		{
+			// if too many commas => seo text
+
+			String[] commas = StringUtils.split(sent, ',');
+			String[] spaces = StringUtils.split(sent, ' ');
+			if ((float) commas.length / (float) spaces.length > 0.7)
+			{
+				System.out.println("Rejection: too many commas");
+				return null;
+			}
+
+			String[] pipes = StringUtils.split(sent, '|');
+			if (StringUtils.split(sent, '|').length > 2 || StringUtils.split(sent, '>').length > 2)
+			{
+				System.out.println("Rejection: too many |s or >s ");
+				return null;
+			}
+			String sentTry = sent.toLowerCase();
+			// if too many long spaces
+			String sentSpaces = sentTry.replace("   ", "");
+			if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -
+																// suspicious
+				return null;
+
+			if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1 || sentTry.indexOf("copyright") > -1
+				|| sentTry.indexOf("operating hours") > -1 || sentTry.indexOf("days per week") > -1
+				|| sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1
+				|| sentTry.indexOf("find the latest") > -1 || sentTry.startsWith("subscribe")
+				|| sentTry.indexOf("Terms of Service") > -1 || sentTry.indexOf("clicking here") > -1
+				|| sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1 || sentTry.indexOf("Tags:") > -1
+				|| sentTry.startsWith("Posted by") || sentTry.indexOf("available online") > 0
+				|| sentTry.indexOf("get online") > 0 || sentTry.indexOf("buy online") > 0
+				|| sentTry.indexOf("not valid") > 0 || sentTry.indexOf("discount") > 0
+				|| sentTry.indexOf("official site") > 0 || sentTry.indexOf("this video") > 0 || sentTry.indexOf("this book") > 0
+				|| sentTry.indexOf("this product") > 0 || sentTry.indexOf("paperback") > 0 || sentTry.indexOf("hardcover") > 0 ||
+				sentTry.indexOf("audio cd") > 0
+				|| sentTry.indexOf("related searches") > 0 || sentTry.indexOf("permission is granted") > 0
+				|| sentTry.indexOf("[edit") > 0 || sentTry.indexOf("edit categories") > 0
+				|| sentTry.indexOf("free license") > 0 || sentTry.indexOf("permission is granted") > 0
+				|| sentTry.indexOf("under the terms") > 0 		|| sentTry.indexOf("rights reserved") > 0 	
+				|| sentTry.indexOf("wikipedia") > 0 || sentTry.endsWith("the") || sentTry.endsWith("the.")
+				|| sentTry.startsWith("below") 
+			
+			)
+				return null;
+
+			// count symbols indicating wrong parts of page to mine for text
+			// if short and contains too many symbols indicating wrong area: reject
+			String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&").replace("|", "&&&").replace(":", "&&&")
+				.replace("/", "&&&").replace("-", "&&&").replace("%", "&&&");
+			if ((sentWrongSym.length() - sentTry.length()) >= 4 && sentTry.length()<200) // twice ot more
+				return null;
+
+			sent = sent.replace('[', ' ').replace(']', ' ').replace("_should_find_orig_", "").replace(".   .", ". ")
+				.replace("amp;", " ").replace("1.", " ").replace("2.", " ").replace("3.", " ").replace("4.", " ")
+				.replace("2009", "2011").replace("2008", "2011").replace("2006", "2011").replace("2007", "2011").
+				replace("VIDEO:", " ").replace("Video:", " ").replace("no comments", " ")
+				.replace("  ", " ").replace("  ", " ").replace("(more.)", "").replace("more.", "").replace("<more>", "").
+				replace("[more]", "").replace(".,",".").replace("&lt;", "").replace("p&gt;","" ).
+				replace("product description", "");
+		
+				// TODO .replace("a.", ".");
+
+			int endIndex = sent.indexOf(" posted");
+			if (endIndex > 0)
+				sent = sent.substring(0, endIndex);
+
+			return sent;
+		}
+
+		public static String processSentence(String pageSentence)
+		{
+			if (pageSentence == null)
+				return "";
+			pageSentence = Utils.fullStripHTML(pageSentence);
+			pageSentence = StringUtils.chomp(pageSentence, "..");
+			pageSentence = StringUtils.chomp(pageSentence, ". .");
+			pageSentence = StringUtils.chomp(pageSentence, " .");
+			pageSentence = StringUtils.chomp(pageSentence, ".");
+			pageSentence = StringUtils.chomp(pageSentence, "...");
+			pageSentence = StringUtils.chomp(pageSentence, " ....");
+			pageSentence = pageSentence.replace("::", ":").replace(".,", ". ").replace("(.)", "");
+			
+			pageSentence = pageSentence.trim();
+			pageSentence = pageSentence.replaceAll("\\s+", " "); // make single
+																	// spaces
+																	// everywhere
+
+			String[] pipes = StringUtils.split(pageSentence, '|'); // removed
+																	// shorter part
+																	// of sentence
+																	// at the end
+																	// after pipe
+			if (pipes.length == 2 && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0))
+			{
+				int pipePos = pageSentence.indexOf("|");
+				if (pipePos > -1)
+					pageSentence = pageSentence.substring(0, pipePos - 1).trim();
+
+			}
+
+			if (!StringUtils.contains(pageSentence, '.') && !StringUtils.contains(pageSentence, '?')
+				&& !StringUtils.contains(pageSentence, '!'))
+				pageSentence = pageSentence + ". ";
+
+			pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
+			if (!pageSentence.endsWith("."))
+				pageSentence += ". ";
+			return pageSentence;
+		}
+		
+		public static void main(String[] args)
+		{
+			
+			String para = "About Albert Einstein     15 External links  16 Credits         Youth and schooling  Albert Einstein was born into a Jewish family";
+			para = "inventions of albert einstein                            what was albert einsteins invention                            invention of einstein                            what were albert einsteins inventions ";
+			
+			para = para.replaceAll("  [A-Z]", ". $0");
+			System.out.println(para);
+			
+			para = "Page 2 of 93";
+		    
+			System.exit(0);
+			RelatedSentenceFinder f = new RelatedSentenceFinder();
+			try
+			{
+				List<HitBase> hits = f
+					.findRelatedOpinionsForSentence(
+						"Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",
+						Arrays
+							.asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));
+				StringBuffer buf = new StringBuffer();
+
+				for (HitBase h : hits)
+				{
+					List<Fragment> frags = h.getFragments();
+					for (Fragment fr : frags)
+					{
+						if (fr.getResultText() != null && fr.getResultText().length() > 3)
+							buf.append(fr.getResultText());
+					}
+				}
+
+			}
+			catch (Exception e)
+			{
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+
+		}
+
+		public static String normalizeForSentenceSplitting(String pageContent) {
+			pageContent.replace("Jan.", "January").replace("Feb.", "February").replace("Mar.", "March").replace("Apr.", "April").
+			replace("Jun.", "June").replace("Jul.", "July").replace("Aug.", "August").replace("Sep.", "September").
+			replace("Oct.", "October").replace("Nov.", "November").replace("Dec.", "December");
+			
+			return pageContent;
+			
+		}
+	}
\ No newline at end of file

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java Thu Oct 20 21:28:45 2011
@@ -20,199 +20,262 @@ package opennlp.tools.similarity.apps;
 import java.util.ArrayList;
 import java.util.List;
 
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+
 import org.apache.commons.lang.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.zvents.ce.common.util.StringDistanceMeasurer;
-
-public class HitBase {
-  private static final Logger LOG = LoggerFactory.getLogger(HitBase.class);
-
-  private String abstractText;
-
-  private String clickUrl;
-
-  private String displayUrl;
-
-  private String url;
-
-  private String date;
-
-  private String title;
-
-  private Double generWithQueryScore;
-
-  private String source;
-
-  private List<String> originalSentences;
-
-  private String pageContent;
-
-  private List<Fragment> fragments;
-
-  public HitBase() {
-    super();
-  }
-
-  public String getPageContent() {
-    return pageContent;
-  }
-
-  public HitBase(String orig, String[] generateds) {
-    originalSentences = new ArrayList<String>();
-    originalSentences.add(orig);
-
-    fragments = new ArrayList<Fragment>();
-    for (String sent : generateds) {
-      Fragment f = new Fragment(sent, 0.0);
-      fragments.add(f);
-    }
-    // the rest of params are null
-  }
-
-  public void setPageContent(String pageContent) {
-    this.pageContent = pageContent;
-  }
-
-  public List<Fragment> getFragments() {
-    return fragments;
-  }
-
-  public void setFragments(List<Fragment> fragments) {
-    this.fragments = fragments;
-  }
-
-  public String getSource() {
-    return source;
-  }
-
-  public void setSource(String source) {
-    this.source = source;
-  }
-
-  public List<String> getOriginalSentences() {
-    return originalSentences;
-  }
-
-  public void setOriginalSentences(List<String> originalSentences) {
-    this.originalSentences = originalSentences;
-  }
-
-  public String getTitle() {
-    return title;
-  }
-
-  public void setTitle(String title) {
-    this.title = title;
-  }
-
-  public String getAbstractText() {
-    return abstractText;
-  }
-
-  public void setAbstractText(String abstractText) {
-    this.abstractText = abstractText;
-  }
-
-  public String getClickUrl() {
-    return clickUrl;
-  }
-
-  public void setClickUrl(String clickUrl) {
-    this.clickUrl = clickUrl;
-  }
-
-  public String getDisplayUrl() {
-    return displayUrl;
-  }
-
-  public void setDisplayUrl(String displayUrl) {
-    this.displayUrl = displayUrl;
-  }
-
-  public String getUrl() {
-    return url;
-  }
-
-  public void setUrl(String url) {
-    this.url = url;
-  }
-
-  public String getDate() {
-    return date;
-  }
-
-  public void setDate(String date) {
-    this.date = date;
-  }
-
-  public Double getGenerWithQueryScore() {
-    return generWithQueryScore;
-  }
-
-  public void setGenerWithQueryScore(Double generWithQueryScore) {
-    this.generWithQueryScore = generWithQueryScore;
-  }
-
-  public String toString() {
-    // return "\n"+this.getUrl()+" | " +this.getTitle()+ " | "+
-    // this.abstractText ;
-    if (this.getFragments() != null && this.getFragments().size() > 0)
-      return this.getFragments().toString();
-    else
-      return this.title;
-  }
-
-  public static String toString(List<HitBase> hits) {
-    StringBuffer buf = new StringBuffer();
-    Boolean pBreak = true;
-    for (HitBase hit : hits) {
-      String fragm = (hit.toString());
-      if (fragm.length() > 15) {
-        if (pBreak)
-          buf.append(fragm + " | ");
-        else
-          buf.append(fragm + " | \n");
-        // switch to opposite
-        if (pBreak)
-          pBreak = false;
-        else
-          pBreak = true;
-      }
-
-    }
-    return buf.toString();
-  }
-
-  public static List<HitBase> removeDuplicates(List<HitBase> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double imageDupeThresh = 0.8; // if more similar, then considered dupes
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<HitBase> hitsDedup = new ArrayList<HitBase>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          String title1 = hits.get(i).getTitle();
-          String title2 = hits.get(j).getTitle();
-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
-            continue;
-          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
-            idsToRemove.add(j); // dupes found, later list member to be deleted
-          }
-        }
-      for (int i = 0; i < hits.size(); i++)
-        if (!idsToRemove.contains(i))
-          hitsDedup.add(hits.get(i));
-      if (hitsDedup.size() < hits.size()) {
-        LOG.debug("Removed duplicates from relevant search results, including "
-            + hits.get(idsToRemove.get(0)).getTitle());
-      }
-    } catch (Exception e) {
-      LOG.error("Problem removing duplicates from relevant images", e);
-    }
-
-    return hitsDedup;
+public class HitBase
+{
+	private static final Logger LOG = LoggerFactory.getLogger(HitBase.class);
+
+	private String abstractText;
+
+	private String clickUrl;
+
+	private String displayUrl;
+
+	private String url;
+
+	private String date;
+
+	private String title;
+
+	private Double generWithQueryScore;
+
+	private String source;
+
+	private List<String> originalSentences;
+
+	private String pageContent;
+
+	private List<Fragment> fragments;
+
+	public HitBase()
+	{
+		super();
+	}
+
+	public String getPageContent()
+	{
+		return pageContent;
+	}
+
+	public HitBase(String orig, String[] generateds)
+	{
+		originalSentences = new ArrayList<String>();
+		originalSentences.add(orig);
+
+		fragments = new ArrayList<Fragment>();
+		for (String sent : generateds)
+		{
+			Fragment f = new Fragment(sent, 0.0);
+			fragments.add(f);
+		}
+		// the rest of params are null
+	}
+
+	public void setPageContent(String pageContent)
+	{
+		this.pageContent = pageContent;
+	}
+
+	public List<Fragment> getFragments()
+	{
+		return fragments;
+	}
+
+	public void setFragments(List<Fragment> fragments)
+	{
+		this.fragments = fragments;
+	}
+
+	public String getSource()
+	{
+		return source;
+	}
+
+	public void setSource(String source)
+	{
+		this.source = source;
+	}
+
+	public List<String> getOriginalSentences()
+	{
+		return originalSentences;
+	}
+
+	public void setOriginalSentences(List<String> originalSentences)
+	{
+		this.originalSentences = originalSentences;
+	}
+
+	public String getTitle()
+	{
+		return title;
+	}
+
+	public void setTitle(String title)
+	{
+		this.title = title;
+	}
+
+	public String getAbstractText()
+	{
+		return abstractText;
+	}
+
+	public void setAbstractText(String abstractText)
+	{
+		this.abstractText = abstractText;
+	}
+
+	public String getClickUrl()
+	{
+		return clickUrl;
+	}
+
+	public void setClickUrl(String clickUrl)
+	{
+		this.clickUrl = clickUrl;
+	}
+
+	public String getDisplayUrl()
+	{
+		return displayUrl;
+	}
+
+	public void setDisplayUrl(String displayUrl)
+	{
+		this.displayUrl = displayUrl;
+	}
+
+	public String getUrl()
+	{
+		return url;
+	}
+
+	public void setUrl(String url)
+	{
+		this.url = url;
+	}
+
+	public String getDate()
+	{
+		return date;
+	}
+
+	public void setDate(String date)
+	{
+		this.date = date;
+	}
+
+	public Double getGenerWithQueryScore()
+	{
+		return generWithQueryScore;
+	}
+
+	public void setGenerWithQueryScore(Double generWithQueryScore)
+	{
+		this.generWithQueryScore = generWithQueryScore;
+	}
+
+	public String toString()
+	{
+		// return "\n"+this.getUrl()+" | " +this.getTitle()+ " | "+ this.abstractText ;
+		if (this.getFragments() != null && this.getFragments().size() > 0)
+			return this.getFragments().toString();
+		else
+			return this.title;
+	}
+
+	public static String toString(List<HitBase> hits)
+	{
+		StringBuffer buf = new StringBuffer();
+		Boolean pBreak = true;
+		for (HitBase hit : hits)
+		{
+			String fragm = (hit.toString());
+			if (fragm.length() > 15)
+			{
+				if (pBreak)
+					buf.append(fragm + " | ");
+				else
+					buf.append(fragm + " | \n");
+				// switch to opposite
+				if (pBreak)
+					pBreak = false;
+				else
+					pBreak = true;
+			}
+
+		}
+		return buf.toString();
+	}
+	
+	public static String toResultantString(List<HitBase> hits)
+	{
+		StringBuffer buf = new StringBuffer();
+		Boolean pBreak = true;
+		for (HitBase hit : hits)
+		{
+			String fragm = hit.getFragments().toString();
+			if (fragm.length() > 15)
+			{
+				if (pBreak)
+					buf.append(fragm + " | 	");
+				else
+					buf.append(fragm + " | \n");
+				// switch to opposite
+				if (pBreak)
+					pBreak = false;
+				else
+					pBreak = true;
+			}
+
+		}
+		return buf.toString().replace("[", "").replace("]", "").replace(" | ", "").replace(".,",".").
+		replace(".\"", "\"").replace(". .", ".").replace(",.", ".");
+	}
+
+	public static List<HitBase> removeDuplicates(List<HitBase> hits)
+	{
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double imageDupeThresh = 0.8; // if more similar, then considered dupes
+		List<Integer> idsToRemove = new ArrayList<Integer>();
+		List<HitBase> hitsDedup = new ArrayList<HitBase>();
+		try
+		{
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++)
+				{
+					String title1 = hits.get(i).getTitle();
+					String title2 = hits.get(j).getTitle();
+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
+						continue;
+					if (meas.measureStringDistance(title1, title2) > imageDupeThresh)
+					{
+						idsToRemove.add(j); // dupes found, later list member to be deleted
+					}
+				}
+			for (int i = 0; i < hits.size(); i++)
+				if (!idsToRemove.contains(i))
+					hitsDedup.add(hits.get(i));
+			if (hitsDedup.size() < hits.size())
+			{
+				LOG.debug("Removed duplicates from relevant search results, including "
+					+ hits.get(idsToRemove.get(0)).getTitle());
+			}
+		}
+		catch (Exception e)
+		{
+			LOG.error("Problem removing duplicates from relevant images", e);
+		}
+
+		
+		
+		return hitsDedup;
 
-  }
+	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java?rev=1187056&r1=1187055&r2=1187056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java Thu Oct 20 21:28:45 2011
@@ -20,562 +20,558 @@ package opennlp.tools.similarity.apps;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Logger;
+
 
-import opennlp.tools.parser.Parse;
 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
 import opennlp.tools.similarity.apps.utils.Utils;
-import opennlp.tools.textsimilarity.LemmaPair;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
-import opennlp.tools.textsimilarity.SyntMatcher;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
 import org.apache.commons.lang.StringUtils;
-import org.slf4j.LoggerFactory;
 
-public class RelatedSentenceFinder {
 
-  // TODO outsource the timeout value
-  PageFetcher pFetcher = new PageFetcher();
 
-  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
-
-  private static final org.slf4j.Logger LOG = LoggerFactory
-      .getLogger(RelatedSentenceFinder.class);
-
-  static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
-
-  // used to indicate that a sentence is an opinion, so more appropriate
-  static List<String> MENTAL_VERBS = new ArrayList<String>(
-      Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
-          "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
-          "check", "confirm", "convince", "deny", "disagree", "explain",
-          "ignore", "inform", "remind", "request", "suggest", "suppose",
-          "think", "threaten", "try", "understand" }));
-
-  private static final int MAX_FRAGMENT_SENTS = 10;
-
-  public RelatedSentenceFinder() {
-
-  }
-
-  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
-      List<String> sents) throws Exception {
-    YahooQueryRunner yrunner = new YahooQueryRunner();
-    List<HitBase> searchResult = yrunner.runSearch(word);
-    return searchResult;
-  }
-
-  public List<HitBase> findRelatedOpinionsForSentence(String sentence,
-      List<String> sents) throws Exception {
-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
-    System.out.println(" \n\n=== Sentence  = " + sentence);
-    List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
-
-    YahooQueryRunner yrunner = new YahooQueryRunner();
-    for (String query : nounPhraseQueries) {
-      System.out.println("\nquery = " + query);
-      // query += " "+join(MENTAL_VERBS, " OR ") ;
-      List<HitBase> searchResult = yrunner.runSearch(query);
-      if (searchResult != null) {
-        for (HitBase item : searchResult) { // got some text from .html
-          if (item.getAbstractText() != null
-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
-                                                         // pdf
-            opinionSentencesToAdd
-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
-                    sentence, sents));
-          }
-        }
-      }
-    }
-
-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
-    return opinionSentencesToAdd;
-  }
-
-  public List<HitBase> findActivityDetailsForEventGroupName(String sentence,
-      List<String> sents) throws Exception {
-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
-    System.out.println(" \n\n=== Entity to write about = " + sentence);
-    List<String> nounPhraseQueries = new ArrayList<String>();
-    String[] frequentPerformingVerbs = {
-        " release announce celebrate discover", "introduce enjoy follow",
-        "open present show", "meet enjoy follow create",
-        "discover continue produce" };
-
-    nounPhraseQueries.add(sentence + frequentPerformingVerbs);
-
-    YahooQueryRunner yrunner = new YahooQueryRunner();
-    for (String verbAddition : frequentPerformingVerbs) {
-      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
-          + verbAddition);
-      if (searchResult != null) {
-        for (HitBase item : searchResult) { // got some text from .html
-          if (item.getAbstractText() != null
-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
-                                                         // pdf
-            opinionSentencesToAdd
-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
-                    sentence, sents));
-          }
-        }
-      }
-    }
-
-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
-    return opinionSentencesToAdd;
-  }
-
-  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
-    ParseTreeChunk matcher = new ParseTreeChunk();
-    SyntMatcher pos = SyntMatcher.getInstance();
-    List<List<ParseTreeChunk>> sent1GrpLst = null;
-
-    List<LemmaPair> origChunks1 = new ArrayList<LemmaPair>();
-    String[] sents1 = pos.getSentenceDetectorME().sentDetect(sentence);
-    for (String s1 : sents1) {
-      Parse[] parses1 = pos.parseLine(s1, pos.getParser(), 1);
-      origChunks1.addAll(pos.getAllPhrasesTWPairs(parses1[0]));
-    }
-
-    List<ParseTreeChunk> chunk1List = matcher.buildChunks(origChunks1);
-    sent1GrpLst = matcher.groupChunksAsParses(chunk1List);
-
-    // System.out.println(origChunks1);
-    // System.out.println("=== Grouped chunks 1 "+ sent1GrpLst.get(0));
-    List<ParseTreeChunk> nPhrases = sent1GrpLst.get(0);
-    List<String> queryArrayStr = new ArrayList<String>();
-    for (ParseTreeChunk ch : nPhrases) {
-      String query = "";
-      int size = ch.getLemmas().size();
-
-      for (int i = 0; i < size; i++) {
-        if (ch.getPOSs().get(i).startsWith("N")
-            || ch.getPOSs().get(i).startsWith("J")) {
-          query += ch.getLemmas().get(i) + " ";
-        }
-      }
-      query = query.trim();
-      int len = query.split(" ").length;
-      if (len < 2 || len > 5)
-        continue;
-      if (len < 4) { // every word should start with capital
-        String[] qs = query.split(" ");
-        boolean bAccept = true;
-        for (String w : qs) {
-          if (w.toLowerCase().equals(w)) // idf only two words then
-            // has to be person name,
-            // title or geo location
-            bAccept = false;
-        }
-        if (!bAccept)
-          continue;
-      }
-
-      query = query.trim().replace(" ", " +");
-      query = " +" + query;
-
-      queryArrayStr.add(query);
-
-    }
-    if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
-                                    // keywords
-      for (ParseTreeChunk ch : nPhrases) {
-        String query = "";
-        int size = ch.getLemmas().size();
-
-        for (int i = 0; i < size; i++) {
-          if (ch.getPOSs().get(i).startsWith("N")
-              || ch.getPOSs().get(i).startsWith("J")) {
-            query += ch.getLemmas().get(i) + " ";
-          }
-        }
-        query = query.trim();
-        int len = query.split(" ").length;
-        if (len < 2)
-          continue;
-
-        query = query.trim().replace(" ", " +");
-        query = " +" + query;
-
-        queryArrayStr.add(query);
-
-      }
-    }
-
-    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
-    queryArrayStr.add(sentence);
-
-    return queryArrayStr;
-
-  }
-
-  // remove dupes from queries to easy cleaning dupes and repetitive search
-  // afterwards
-  public static List<String> removeDuplicatesFromQueries(List<String> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double dupeThresh = 0.8; // if more similar, then considered dupes was
-    // 0.7
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<String> hitsDedup = new ArrayList<String>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          String title1 = hits.get(i);
-          String title2 = hits.get(j);
-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
-            continue;
-          if (meas.measureStringDistance(title1, title2) > dupeThresh) {
-            idsToRemove.add(j); // dupes found, later list member to
-            // be deleted
-
-          }
-        }
-
-      for (int i = 0; i < hits.size(); i++)
-        if (!idsToRemove.contains(i))
-          hitsDedup.add(hits.get(i));
-
-      if (hitsDedup.size() < hits.size()) {
-        LOG.debug("Removed duplicates from formed query, including "
-            + hits.get(idsToRemove.get(0)));
-      }
-
-    } catch (Exception e) {
-      LOG.error("Problem removing duplicates from query list");
-    }
-
-    return hitsDedup;
-
-  }
-
-  public static List<HitBase> removeDuplicatesFromResultantHits(
-      List<HitBase> hits) {
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    double dupeThresh = 0.8; // if more similar, then considered dupes was
-    // 0.7
-    List<Integer> idsToRemove = new ArrayList<Integer>();
-    List<HitBase> hitsDedup = new ArrayList<HitBase>();
-    try {
-      for (int i = 0; i < hits.size(); i++)
-        for (int j = i + 1; j < hits.size(); j++) {
-          String title1 = hits.get(i).toString();
-          String title2 = hits.get(j).toString();
-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
-            continue;
-          if (meas.measureStringDistance(title1, title2) > dupeThresh) {
-            idsToRemove.add(j); // dupes found, later list member to
-            // be deleted
-
-          }
-        }
-
-      for (int i = 0; i < hits.size(); i++)
-        if (!idsToRemove.contains(i))
-          hitsDedup.add(hits.get(i));
-
-      if (hitsDedup.size() < hits.size()) {
-        LOG.debug("Removed duplicates from formed query, including "
-            + hits.get(idsToRemove.get(0)));
-      }
-
-    } catch (Exception e) {
-      LOG.error("Problem removing duplicates from query list");
-    }
-
-    return hitsDedup;
-
-  }
-
-  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
-      String originalSentence, List<String> sentsAll) {
-    // put orig sentence in structure
-    List<String> origs = new ArrayList<String>();
-    origs.add(originalSentence);
-    item.setOriginalSentences(origs);
-    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
-        .replace("  ", " ").replace("  ", " ");
-    // generation results for this sentence
-    List<Fragment> result = new ArrayList<Fragment>();
-    // form plain text from snippet
-    String snapshot = item.getAbstractText().replace("<b>", " ")
-        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
-
-    SyntMatcher sm = SyntMatcher.getInstance();
-    // fix a template expression which can be substituted by original if
-    // relevant
-    String snapshotMarked = snapshot.replace("...", " _should_find_orig_ .");
-    String[] fragments = sm.getSentenceDetectorME().sentDetect(snapshotMarked);
-    List<String> allFragms = new ArrayList<String>();
-    allFragms.addAll(Arrays.asList(fragments));
-
-    String[] sents = null;
-    try {
-      if (snapshotMarked.length() != snapshot.length()) {
-        String downloadedPage = pFetcher.fetchPage(item.getUrl());
-        if (downloadedPage != null && downloadedPage.length() > 100) {
-          item.setPageContent(downloadedPage);
-          String pageContent = Utils.fullStripHTML(item.getPageContent());
-          pageContent = pageContent.trim().replace("  ", ". ")
-              .replace("..", ".").replace(". . .", " ").trim(); // sometimes
-                                                                // html breaks
-                                                                // are converted
-                                                                // into ' ' (two
-                                                                // spaces), so
-                                                                // we need to
-                                                                // put '.'
-          sents = sm.getSentenceDetectorME().sentDetect(pageContent);
-          sents = cleanListOfSents(sents);
-        }
-      }
-    } catch (Exception e) {
-      // TODO Auto-generated catch block
-      // e.printStackTrace();
-      System.err
-          .println("Problem downloading  the page and splitting into sentences");
-    }
-
-    for (String fragment : allFragms) {
-      String followSent = null;
-      if (fragment.length() < 50)
-        continue;
-      String pageSentence = "";
-      // try to find original sentence from webpage
-      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
-          && sents.length > 0)
-        try {
-          String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
-              fragment.replace("_should_find_orig_", ""), sents);
-          pageSentence = mainAndFollowSent[0];
-          followSent = mainAndFollowSent[1];
-
-        } catch (Exception e) {
-          // TODO Auto-generated catch block
-          e.printStackTrace();
-        }
-      else
-        // or get original snippet
-        pageSentence = fragment;
-      if (pageSentence != null)
-        pageSentence.replace("_should_find_orig_", "");
-
-      // resultant sentence SHOULD NOT be longer than twice the size of
-      // snippet fragment
-      if (pageSentence != null
-          && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
-                                                                                // 2.0,
-                                                                                // but
-                                                                                // since
-                                                                                // snippet
-                                                                                // sentences
-                                                                                // are
-                                                                                // rather
-                                                                                // short
-                                                                                // now...
-        try { // get score from syntactic match between sentence in
-              // original text and mined sentence
-          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
-
-          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
-              + " " + title, originalSentence);
-          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
-          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
-            System.out
-                .println("Rejected Sentence : No verb OR Yes imperative verb :"
-                    + pageSentence);
-            continue;
-          }
-
-          syntScore = parseTreeChunkListScorer
-              .getParseTreeChunkListScore(match);
-          System.out.println(parseTreeChunk.listToString(match) + " "
-              + syntScore + "\n pre-processed sent = '" + pageSentence);
-
-          if (syntScore < 1.5) { // trying other sents
-            for (String currSent : sentsAll) {
-              if (currSent.startsWith(originalSentence))
-                continue;
-              match = sm.matchOrigSentencesCache(currSent, pageSentence);
-              double syntScoreCurr = parseTreeChunkListScorer
-                  .getParseTreeChunkListScore(match);
-              if (syntScoreCurr > syntScore) {
-                syntScore = syntScoreCurr;
-              }
-            }
-            if (syntScore > 1.5) {
-              System.out.println("Got match with other sent: "
-                  + parseTreeChunk.listToString(match) + " " + syntScore);
-            }
-          }
-
-          measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
-              originalSentence, pageSentence);
-
-          // now possibly increase score by finding mental verbs
-          // indicating opinions
-          for (String s : MENTAL_VERBS) {
-            if (pageSentence.indexOf(s) > -1) {
-              mentalScore += 0.3;
-              break;
-            }
-          }
-
-          if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
-              && measScore < 0.8 && pageSentence.length() > 40) {
-            String pageSentenceProc = GeneratedSentenceProcessor
-                .acceptableMinedSentence(pageSentence);
-            if (pageSentenceProc != null) {
-              pageSentenceProc = GeneratedSentenceProcessor
-                  .processSentence(pageSentenceProc);
-              if (followSent != null) {
-                pageSentenceProc += " "
-                    + GeneratedSentenceProcessor.processSentence(followSent);
-              }
-
-              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
-              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
-                  + mentalScore + (double) pageSentenceProc.length()
-                  / (double) 50);
-              f.setSourceURL(item.getUrl());
-              f.fragment = fragment;
-              result.add(f);
-              System.out.println("Accepted sentence: " + pageSentenceProc
-                  + "| with title= " + title);
-              System.out.println("For fragment = " + fragment);
-            } else
-              System.out
-                  .println("Rejected sentence due to wrong area at webpage: "
-                      + pageSentence);
-          } else
-            System.out.println("Rejected sentence due to low score: "
-                + pageSentence);
-          // }
-        } catch (Throwable t) {
-          System.out.println("exception " + t);
-        }
-      }
-    }
-    item.setFragments(result);
-    return item;
-  }
-
-  public static String[] cleanListOfSents(String[] sents) {
-    List<String> sentsClean = new ArrayList<String>();
-    for (String s : sents) {
-      if (s == null || s.trim().length() < 30 || s.length() < 20)
-        continue;
-      sentsClean.add(s);
-    }
-    return (String[]) sentsClean.toArray(new String[0]);
-  }
-
-  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
-      String fragment, String[] sents) {
-    if (fragment.trim().length() < 15)
-      return null;
-
-    StringDistanceMeasurer meas = new StringDistanceMeasurer();
-    Double dist = 0.0;
-    String result = null, followSent = null;
-    for (int i = 0; i < sents.length; i++) {
-      String s = sents[i];
-      if (s == null || s.length() < 30)
-        continue;
-      Double distCurr = meas.measureStringDistance(s, fragment);
-      if (distCurr > dist && distCurr > 0.4) {
-        result = s;
-        dist = distCurr;
-        if (i < sents.length - 1 && sents[i + 1].length() > 60) {
-          followSent = sents[i + 1];
-        }
-
-      }
-    }
-    return new String[] { result, followSent };
-  }
-
-  public static void main(String[] args) {
-    RelatedSentenceFinder f = new RelatedSentenceFinder();
-    String b = GeneratedSentenceProcessor
-        .acceptableMinedSentence("Earlier this year it was also Quidam that broke international ground "
-            + " by becoming the first Cirque du Soleil show to be seen in the Middle "
-            + " East - in Dubai, United Arab Emirates - now currently playing to sold "
-            + " out audiences. ");
-    /*
-     * System.setProperty("resourcesDirectory",
-     * "C:/workspace/ZSearch/resources_external");
-     * System.setProperty("dragonDirectory", "DRAGON_PATH");
-     * System.setProperty("StanfordNE_resources", "STANFORD_NE_RESOURCES");
-     * System.setProperty("vcb_resources", "VCB_RESOURCES");
-     * System.setProperty("bing_app_id", "BING_APP_ID");
-     * System.setProperty("yahoo_app_id",
-     * "lyzaGJDV34EsJbugCymf7_oosEfMtbSwUBOhDQ8abqgy_Sl2roPjyg72T5.k1NIoyQ--");
-     */
-    List<HitBase> hits = null;
-    try {
-      /*
-       * // uncomment the sentence you would like to serve as a seed sentence
-       * for content generation for an event description hits =
-       * f.findRelatedOpinionsForSentence( //
-       * "Did anyone expect there to be any sort of real change? The system is a whole lot bigger than one person these days"
-       * , //
-       * "Reflection of Neocolonial Aggression and Destruction in Libya. Emergence of New Era of Western Democratic Dictatorship"
-       * , //
-       * "It has been reported by Columbia Broadcasting Systems of the American - Israeli Military Industrial Complex that the International Human Rights Organization has called upon the civilized nations of the Globe to place under arrest George Bush"
-       * , //
-       * "Washington bailed out the crooks who created this disaster, then they carry on with business as usual"
-       * , //
-       * "my comment about Rall's taking out his frustrations in writing because he can't make any money drawing never made it onto this board"
-       * ,
-       * "I like the app, I use it to find fun things to do in my area, it's amazing how much cool stuff it finds for me"
-       * , //
-       * "Cyclo-cross  is a form of bicycle racing. Races typically take place in the autumn and winter laps of a short course featuring pavement, wooded trails, grass, steep hills"
-       * , //
-       * "celebrate mama. Pampering for Mom including free make overs and wine. Free Gift Bags with Goodies and Samples to the first 250 Moms"
-       * , //
-       * "Washington Congress is taking a cautious approach to the massive street protests sweeping Egypt encouraging cries for reform, but wary that a more radical regime in Cairo could damage US interests, including the survival of Israel."
-       * , // "College football Virginia Tech", //
-       * " automatic ways and refereneces if they existed to produce content that increase the ratings bias opinions, counter-attach and compensate the former attacks in SEM and Opinion Mining in an automated way instead of having thousands of grantholders that manually verify the honesty of sites"
-       * , //
-       * "Virginia Tech quarterback Tyrod Taylor enjoyed a magnificent night, throwing three touchdown pas,ses and rushing for another in leading Virginia Tech to the ACC championship with a 44-33 victory over Florida State in the league�s title game played in front of 72,379 fans at Bank of America Stadium on Saturday night"
-       * , //
-       * "US banking giant Citigroup has taken over the ownership of EMI, the record label where it was the major creditor"
-       * , //
-       * "Egypt's army vows it will not use force against demonstrators, as the government says it is preparing to open talks with the opposition."
-       * ,
-       * 
-       * //
-       * "If you aren't enrolled in Paperless Statements and think you've received this message in error, please call our Customer Support team immediately, using the phone number on the Contact Us page on Chase Online"
-       * , // "Summer camp fair at french american international school",//
-       * "russian composers paganini", Arrays.asList(new String[] {
-       * 
-       * })); System.out.println(HitBase.toString(hits));
-       * System.out.println(HitBase.toString(hits).replace("[", "").replace("]",
-       * "").replace(" | ", ""));
-       */
-      // uncomment the sentence you would like to serve as a seed sentence for
-      // content generation for an event description
-      hits = f.findActivityDetailsForEventGroupName(
-          "Britney Spears - The Femme Fatale Tour",
-          // "Rush Time Machine",
-          // "amazon webservices summit",
-          // "Blue Man Group" ,
-          // "Belly Dance With Zaharah",
-          // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
-          // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
-          // "Cyclo-cross  is a form of bicycle racing. Races typically take place in the autumn and winter laps of a short course featuring pavement, wooded trails, grass, steep hills",
-          Arrays.asList(new String[] {}));
-      System.out.println(HitBase.toString(hits));
-      System.out.println(HitBase.toString(hits).replace("[", "")
-          .replace("]", "").replace(" | ", ""));
-
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
+public class RelatedSentenceFinder
+{
+	private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
+	PageFetcher pFetcher = new PageFetcher();
+
+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+	private ParseTreeChunk parseTreeChunk  = new ParseTreeChunk(); 
+
+	static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
+
+	// used to indicate that a sentence is an opinion, so more appropriate
+	static List<String> MENTAL_VERBS = new ArrayList<String>(Arrays.asList(new String[] { "want", "know", "believe",
+			"appeal", "ask", "accept", "agree", "allow", "appeal", "ask", "assume", "believe", "check", "confirm",
+			"convince", "deny", "disagree", "explain", "ignore", "inform", "remind", "request", "suggest", "suppose",
+			"think", "threaten", "try", "understand" }));
+
+	private static final int MAX_FRAGMENT_SENTS = 10;
+
+	public RelatedSentenceFinder()
+	{
+
+	}
+
+	public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word, List<String> sents) throws Exception
+	{
+		BingWebQueryRunner yrunner = new BingWebQueryRunner();
+		List<HitBase> searchResult = yrunner.runSearch(word);
+		return searchResult;
+	}
+
+
+
+	public List<HitBase> findRelatedOpinionsForSentence(String sentence, List<String> sents) throws Exception
+	{
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n\n=== Sentence  = " + sentence);
+		List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
+
+		BingWebQueryRunner yrunner = new BingWebQueryRunner();
+		for (String query : nounPhraseQueries)
+		{
+			System.out.println("\nquery = " + query);
+			// query += " "+join(MENTAL_VERBS, " OR ") ;
+			List<HitBase> searchResult = yrunner.runSearch(query);
+			if (searchResult != null)
+			{
+				for (HitBase item : searchResult)
+				{ // got some text from .html
+					if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
+					{ // exclude
+						// pdf
+						opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, sents));
+					}
+				}
+			}
+		}
+
+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+	/*
+	 * Main content generation function which takes a seed as a rock group name and produces a list of text fragments by web mining for
+	 * this rock group (or other similar entity). 
+	 */
+
+	public List<HitBase> findActivityDetailsForEventGroupName(String sentence) throws Exception
+	{
+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+		System.out.println(" \n=== Entity to write about = " + sentence);
+		List<String> nounPhraseQueries = new ArrayList<String>();
+
+
+		//nounPhraseQueries.add(sentence + frequentPerformingVerbs);
+
+		BingWebQueryRunner yrunner = new BingWebQueryRunner();
+		for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs)
+		{
+			List<HitBase> searchResult = yrunner.runSearch(sentence + " " + verbAddition);
+			if (searchResult != null)
+			{
+				for (HitBase item : searchResult)
+				{ // got some text from .html
+					if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
+					{ // exclude pdf
+						opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, null));
+					}
+				}
+			}
+		}
+
+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+		return opinionSentencesToAdd;
+	}
+
+	public static List<String> buildSearchEngineQueryFromSentence(String sentence)
+	{
+		ParseTreeChunk matcher = new ParseTreeChunk();
+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
+		List<List<ParseTreeChunk>> sent1GrpLst = null;
+
+		List<ParseTreeChunk> nPhrases = pos.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
+		List<String> queryArrayStr = new ArrayList<String>();
+		for (ParseTreeChunk ch : nPhrases)
+		{
+			String query = "";
+			int size = ch.getLemmas().size();
+
+			for (int i = 0; i < size; i++)
+			{
+				if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
+				{
+					query += ch.getLemmas().get(i) + " ";
+				}
+			}
+			query = query.trim();
+			int len = query.split(" ").length;
+			if (len < 2 || len > 5)
+				continue;
+			if (len < 4)
+			{ // every word should start with capital
+				String[] qs = query.split(" ");
+				boolean bAccept = true;
+				for (String w : qs)
+				{
+					if (w.toLowerCase().equals(w)) // if only two words then
+						// has to be person name,
+						// title or geo location
+						bAccept = false;
+				}
+				if (!bAccept)
+					continue;
+			}
+
+			query = query.trim().replace(" ", " +");
+			query = " +" + query;
+
+			queryArrayStr.add(query);
+
+		}
+		if (queryArrayStr.size() < 1)
+		{ // release constraints on NP down to 2
+			// keywords
+			for (ParseTreeChunk ch : nPhrases)
+			{
+				String query = "";
+				int size = ch.getLemmas().size();
+
+				for (int i = 0; i < size; i++)
+				{
+					if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
+					{
+						query += ch.getLemmas().get(i) + " ";
+					}
+				}
+				query = query.trim();
+				int len = query.split(" ").length;
+				if (len < 2)
+					continue;
+
+				query = query.trim().replace(" ", " +");
+				query = " +" + query;
+
+				queryArrayStr.add(query);
+
+			}
+		}
+
+		queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
+		queryArrayStr.add(sentence);
+
+		return queryArrayStr;
+
+	}
+
+	// remove dupes from queries to ease cleaning dupes and repetitive search
+	// afterwards
+	public static List<String> removeDuplicatesFromQueries(List<String> hits)
+	{
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = 0.8; // if more similar, then considered dupes was
+		// 0.7
+		List<Integer> idsToRemove = new ArrayList<Integer>();
+		List<String> hitsDedup = new ArrayList<String>();
+		try
+		{
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++)
+				{
+					String title1 = hits.get(i);
+					String title2 = hits.get(j);
+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
+						continue;
+					if (meas.measureStringDistance(title1, title2) > dupeThresh)
+					{
+						idsToRemove.add(j); // dupes found, later list member to
+						// be deleted
+
+					}
+				}
+
+			for (int i = 0; i < hits.size(); i++)
+				if (!idsToRemove.contains(i))
+					hitsDedup.add(hits.get(i));
+
+			if (hitsDedup.size() < hits.size())
+			{
+				LOG.info("Removed duplicates from formed query, including " + hits.get(idsToRemove.get(0)));
+			}
+
+		}
+		catch (Exception e)
+		{
+			LOG.severe("Problem removing duplicates from query list");
+		}
+
+		return hitsDedup;
+
+	}
+
+	public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits)
+	{
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		double dupeThresh = //0.8; // if more similar, then considered dupes was
+		 0.7;
+		List<Integer> idsToRemove = new ArrayList<Integer>();
+		List<HitBase> hitsDedup = new ArrayList<HitBase>();
+		try
+		{
+			for (int i = 0; i < hits.size(); i++)
+				for (int j = i + 1; j < hits.size(); j++)
+				{
+					HitBase hit2 = hits.get(j);
+					List<Fragment> fragmList1 =  hits.get(i).getFragments();
+					List<Fragment> fragmList2 =  hits.get(j).getFragments();
+					List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
+					for(Fragment f1: fragmList1)
+						for(Fragment f2: fragmList2){
+							String sf1 = f1.getResultText();
+							String sf2 = f2.getResultText();
+							if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))
+								continue;
+							if (meas.measureStringDistance(sf1, sf2) > dupeThresh)
+							{
+								fragmList2Results.remove(f2);	
+								LOG.info("Removed duplicates from formed fragments list: " + sf2);
+							}
+						}
+
+					hit2.setFragments(fragmList2Results);
+					hits.set(j, hit2 );
+				}
+		}
+		catch (Exception e)
+		{
+			LOG.severe("Problem removing duplicates from list of fragment");
+		}
+		return hits;
+	}
+
+	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item, String originalSentence,
+			List<String> sentsAll)
+	{
+		if (sentsAll==null)
+			sentsAll = new ArrayList<String>();
+		// put orig sentence in structure
+		List<String> origs = new ArrayList<String>();
+		origs.add(originalSentence);
+		item.setOriginalSentences(origs);
+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+		// generation results for this sentence
+		List<Fragment> result = new ArrayList<Fragment>();
+		// form plain text from snippet
+		String snapshot = item.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace("  ", " ")
+		.replace("  ", " ");
+
+		ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
+		// fix a template expression which can be substituted by original if
+		// relevant
+		String snapshotMarked = snapshot.replace("...", " _should_find_orig_ . _should_find_orig_");
+		String[] fragments = sm.splitSentences(snapshotMarked);
+		List<String> allFragms = new ArrayList<String>();
+		allFragms.addAll(Arrays.asList(fragments));
+
+		String[] sents = null; String downloadedPage;
+		try
+		{
+			if (snapshotMarked.length() != snapshot.length())
+			{
+				downloadedPage = pFetcher.fetchPage(item.getUrl());
+				if (downloadedPage != null && downloadedPage.length() > 100)
+				{
+					item.setPageContent(downloadedPage);
+					String pageContent = Utils.fullStripHTML(item.getPageContent());
+					pageContent = GeneratedSentenceProcessor.normalizeForSentenceSplitting(pageContent);
+					pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")//.replace("  ", ". ")
+					.replace("..", ".").replace(". . .", " ")
+					.trim(); // sometimes html breaks are converted into ' ' (two spaces), so we need to put '.'
+					sents = sm.splitSentences(snapshotMarked);;
+					sents = cleanListOfSents(sents);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			// TODO Auto-generated catch block
+			// e.printStackTrace();
+			System.err.println("Problem downloading  the page and splitting into sentences");
+			return item;
+		}
+
+		for (String fragment : allFragms)
+		{
+			String followSent = null;
+			if (fragment.length() < 50)
+				continue;
+			String pageSentence = "";
+			// try to find original sentence from webpage
+			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null && sents.length > 0)
+				try
+			{
+					String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+							fragment.replace("_should_find_orig_", ""), sents);
+					pageSentence = mainAndFollowSent[0];
+					followSent = mainAndFollowSent[1];
+
+			}
+			catch (Exception e)
+			{
+
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+			else
+				// or get original snippet
+				pageSentence = fragment;
+			if (pageSentence != null)
+				pageSentence.replace("_should_find_orig_", "");
+
+			// resultant sentence SHOULD NOT be longer than twice the size of
+			// snippet fragment
+			if (pageSentence != null && (float) pageSentence.length() / (float) fragment.length() < 4.0)
+			{ // was 2.0, but since snippet sentences are rather short now...
+				try
+				{ // get score from syntactic match between sentence in
+					// original text and mined sentence
+					double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+					SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence + " " + title, originalSentence);
+					List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+					if (!matchRes.isVerbExists() || matchRes.isImperativeVerb())
+					{
+						System.out.println("Rejected Sentence : No verb OR Yes imperative verb :" + pageSentence);
+						continue;
+					}
+
+					syntScore =parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+					System.out.println(parseTreeChunk.listToString(match) + " " + syntScore
+							+ "\n pre-processed sent = '" + pageSentence);
+
+					if (syntScore < 1.5)
+					{ // trying other sents
+						for (String currSent : sentsAll)
+						{
+							if (currSent.startsWith(originalSentence))
+								continue;
+							match = sm.assessRelevance(currSent, pageSentence).getMatchResult();
+							double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+							if (syntScoreCurr > syntScore)
+							{
+								syntScore = syntScoreCurr;
+							}
+						}
+						if (syntScore > 1.5)
+						{
+							System.out.println("Got match with other sent: " + parseTreeChunk.listToString(match) + " "
+									+ syntScore);
+						}
+					}
+
+					measScore = STRING_DISTANCE_MEASURER.measureStringDistance(originalSentence, pageSentence);
+
+					// now possibly increase score by finding mental verbs
+					// indicating opinions
+					for (String s : MENTAL_VERBS)
+					{
+						if (pageSentence.indexOf(s) > -1)
+						{
+							mentalScore += 0.3;
+							break;
+						}
+					}
+
+					if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5) && measScore < 0.8
+							&& pageSentence.length() > 40) // >70
+					{
+						String pageSentenceProc = GeneratedSentenceProcessor.acceptableMinedSentence(pageSentence);
+						if (pageSentenceProc != null)
+						{
+							pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
+							if (followSent != null)
+							{
+								pageSentenceProc += " " + GeneratedSentenceProcessor.processSentence(followSent);
+							}
+
+							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore + mentalScore
+									+ (double) pageSentenceProc.length() / (double) 50);
+							f.setSourceURL(item.getUrl());
+							f.fragment = fragment;
+							result.add(f);
+							System.out.println("Accepted sentence: " + pageSentenceProc + "| with title= " + title);
+							System.out.println("For fragment = " + fragment);
+						}
+						else
+							System.out.println("Rejected sentence due to wrong area at webpage: " + pageSentence);
+					}
+					else
+						System.out.println("Rejected sentence due to low score: " + pageSentence);
+					// }
+				}
+				catch (Throwable t)
+				{
+					t.printStackTrace();
+				}
+			}
+		}
+		item.setFragments(result);
+		return item;
+	}
+
+	public static String[] cleanListOfSents(String[] sents)
+	{
+		List<String> sentsClean = new ArrayList<String>();
+		for (String s : sents)
+		{
+			if (s == null || s.trim().length() < 30 || s.length() < 20)
+				continue;
+			sentsClean.add(s);
+		}
+		return (String[]) sentsClean.toArray(new String[0]);
+	}
+
+	// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
+	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
+	{
+		if (fragment.trim().length() < 15)
+			return null;
+
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		Double dist = 0.0;
+		String result = null, followSent = null;
+		for (int i = 0; i < sents.length; i++)
+		{
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			Double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr > dist && distCurr > 0.4)
+			{
+				result = s;
+				dist = distCurr;
+				if (i < sents.length - 1 && sents[i + 1].length() > 60)
+				{
+					followSent = sents[i + 1];
+				}
+
+			}
+		}
+		return new String[] { result, followSent };
+	}
+
+	// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
+	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
+	{
+		if (fragment.trim().length() < 15)
+			return null;
+		int bestSentIndex = -1;
+		StringDistanceMeasurer meas = new StringDistanceMeasurer();
+		Double distBest = 10.0; // + sup
+		String result = null, followSent = null;
+		for (int i = 0; i < sents.length; i++)
+		{
+			String s = sents[i];
+			if (s == null || s.length() < 30)
+				continue;
+			Double distCurr = meas.measureStringDistance(s, fragment);
+			if (distCurr>distBest){
+				distBest = distCurr;
+				bestSentIndex = i;			
+			}
+
+		}
+		if (distBest > 0.4)
+		{
+			result = sents[bestSentIndex];
+
+			if (bestSentIndex < sents.length - 1 && sents[bestSentIndex + 1].length() > 60)
+			{
+				followSent = sents[bestSentIndex + 1];
+			}
+
+		}
+
+		return new String[] { result, followSent };
+	}
+
+	public static void main(String[] args)
+	{
+		RelatedSentenceFinder f = new RelatedSentenceFinder();
+
+		List<HitBase> hits = null; 
+		try
+		{
+			// uncomment the sentence you would like to serve as a seed sentence for content generation for an event description
+
+			// uncomment the sentence you would like to serve as a seed sentence for content generation for an event description
+			hits = f.findActivityDetailsForEventGroupName(
+					"Albert Einstein"
+					//"Britney Spears - The Femme Fatale Tour"
+					// "Rush Time Machine",
+					// "Blue Man Group" ,
+					// "Belly Dance With Zaharah",
+					// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+					// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+			);
+			System.out.println(HitBase.toString(hits));
+			System.out.println(HitBase.toResultantString(hits));
+			//WordFileGenerator.createWordDoc("Essey about Albert Einstein", hits.get(0).getTitle(), hits);
+
+
+
+		}
+		catch (Exception e)
+		{
+			e.printStackTrace();
+		}
 
-  }
+	}
 
-}
+}
\ No newline at end of file

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java?rev=1187056&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java Thu Oct 20 21:28:45 2011
@@ -0,0 +1,32 @@
+package opennlp.tools.similarity.apps;
+
+public class StoryDiscourseNavigator {
+	public static final String[] frequentPerformingVerbs = { 
+		" born raised meet learn ", 
+		" graduated enter discover",
+		" facts inventions life ", 
+		"accomplishments childhood timeline",
+		" acquire befriend encounter",
+		" achieve reache describe ",
+		" invent innovate improve ",
+		" impress outstanding award",
+		" curous sceptical pessimistic",
+		" spend enroll assume point",
+		" explain discuss dispute",
+		" learn teach study investigate",
+		" propose suggest indicate",
+		" pioneer explorer discoverer ",
+		" advance promote lead",
+		" direct control simulate ",
+		" guide lead assist ",
+		" inspire first initial",
+		" vision predict foresee",
+		" prediction inspiration achievement",
+		" approve agree confirm",
+		" deny argue disagree",
+		" emotional loud imagination",
+		" release announce celebrate discover", "introduce enjoy follow",
+		" open present show", "meet enjoy follow create", "discover continue produce" 
+		
+		};
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain