You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC

svn commit: r1181845 [1/5] - in /incubator/opennlp/sandbox/opennlp-similarity: ./ src/main/java/opennlp/tools/similarity/ src/main/java/opennlp/tools/similarity/apps/ src/main/java/opennlp/tools/similarity/apps/utils/ src/main/java/opennlp/tools/textsi...

Author: joern
Date: Tue Oct 11 15:36:31 2011
New Revision: 1181845

URL: http://svn.apache.org/viewvc?rev=1181845&view=rev
Log:
OPENNLP-253 Initial check in of contribution from Boris Galitsky. Thanks for contributing.

Added:
    incubator/opennlp/sandbox/opennlp-similarity/pom.xml   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TokenObject.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/GeneralizationListReducerTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/LemmaFormManagerTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParagraphClassifierTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkFactoryTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorerTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeChunkTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministicTest.java   (with props)
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java   (with props)
Modified:
    incubator/opennlp/sandbox/opennlp-similarity/   (props changed)

Propchange: incubator/opennlp/sandbox/opennlp-similarity/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Oct 11 15:36:31 2011
@@ -0,0 +1,7 @@
+.project
+
+.classpath
+
+target
+
+.settings

Added: incubator/opennlp/sandbox/opennlp-similarity/pom.xml
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/pom.xml?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/pom.xml (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/pom.xml Tue Oct 11 15:36:31 2011
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<parent>
+		<groupId>org.apache</groupId>
+		<artifactId>apache</artifactId>
+		<version>9</version>
+		<relativePath />
+	</parent>
+
+	<groupId>org.apache.opennlp</groupId>
+	<artifactId>tools-similarity</artifactId>
+	<version>0.0.1-incubating-SNAPSHOT</version>
+	<packaging>jar</packaging>
+
+	<name>OpenNLP Tool Similarity</name>
+
+	<prerequisites>
+		<maven>3.0</maven>
+	</prerequisites>
+
+	<dependencies> <!-- Declares only opennlp-tools (compile scope) and junit (test scope). -->
+		<dependency>
+		  <groupId>org.apache.opennlp</groupId>
+		  <artifactId>opennlp-tools</artifactId>
+		  <version>1.5.2-incubating-SNAPSHOT</version>
+		</dependency>
+		<!-- NOTE(review): the Java sources added in this commit also import org.apache.commons.lang, org.json and org.slf4j (and, in BingWebQueryRunner, org.springframework and com.zvents); none of these is declared here. Confirm how they are meant to be resolved. -->
+		<dependency>
+			<groupId>junit</groupId>
+			<artifactId>junit</artifactId>
+			<version>4.8.1</version>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+	
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<configuration>
+					<source>1.5</source>
+					<target>1.5</target>
+          			<compilerArgument>-Xlint</compilerArgument>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+</project>
\ No newline at end of file

Propchange: incubator/opennlp/sandbox/opennlp-similarity/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+public class BingHit extends HitBase { // Empty subclass of HitBase.
+  // Declares no fields, constructors or methods of its own; everything is inherited from HitBase.
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingHit.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class BingQueryRunner { // Sends a query to the Bing JSON endpoint (Sources=News) and maps the results to HitBase objects.
+  protected static final String APP_ID = "XXX"; // placeholder value; sent as the Appid request parameter
+  // Threshold exposed through the accessors below; within this class it is otherwise referenced only by commented-out code.
+  private float snapshotSimilarityThreshold = 0.4f;
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(BingQueryRunner.class);
+
+  public void setSnapshotSimilarityThreshold(float thr) {
+    snapshotSimilarityThreshold = thr;
+  }
+
+  public float getSnapshotSimilarityThreshold() {
+    return snapshotSimilarityThreshold;
+  }
+
+  public BingQueryRunner() {
+    // no initialisation required
+  }
+  // Builds the request URL; query is URL-encoded as UTF-8. domainWeb, lang and numbOfHits are accepted but not used.
+  private String constructBingUrl(String query, String domainWeb, String lang,
+      int numbOfHits) throws Exception {
+    String codedQuery = URLEncoder.encode(query, "UTF-8");
+    // Yahoo BOSS URL pattern (does not match the request built below): http://boss.yahooapis.com/ysearch/news/v1/{query}?appid=xyz[&param1=val1&param2=val2&etc
+    String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
+        + APP_ID + "&query=" + codedQuery // +
+        // "&sources=web"+
+        + "&Sources=News"
+        // Common request fields (optional)
+        + "&Version=2.0" + "&Market=en-us"
+        // + "&Options=EnableHighlighting"
+
+        // News-specific request fields (optional)
+        + "&News.Offset=0";
+
+    return yahooRequest;
+  }
+
+  /*
+     * Fetches the URL built by constructBingUrl and returns the response body, one list element per line.
+     */
+  public ArrayList<String> search(String query, String domainWeb, String lang,
+      int numbOfHits) throws Exception {
+    URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits));
+    URLConnection connection = url.openConnection();
+    // NOTE(review): the reader below is never closed, decodes with the platform default charset, and count is incremented but never read.
+    String line;
+    ArrayList<String> result = new ArrayList<String>();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        connection.getInputStream()));
+    int count = 0;
+    while ((line = reader.readLine()) != null) {
+      result.add(line);
+      count++;
+    }
+    return result;
+  }
+  // Parses one JSON document (SearchResponse -> News -> Results) and appends one HitBase per result entry.
+  public BingResponse populateBingHit(String response) throws Exception {
+    BingResponse resp = new BingResponse();
+    JSONObject rootObject = new JSONObject(response);
+    // the payload is nested under the "SearchResponse" key
+    JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
+    JSONObject web = responseObject.getJSONObject("News");
+
+    // the search results are in an array under the name "Results"
+    JSONArray resultSet = null;
+    try {
+      resultSet = web.getJSONArray("Results");
+    } catch (Exception e) {
+      System.err.print("\n!!!!!!!");
+      LOG.error("\nNo search results");
+      // resultSet stays null, so the loop below is skipped and an empty BingResponse is returned
+    }
+    if (resultSet != null) {
+      for (int i = 0; i < resultSet.length(); i++) {
+        HitBase hit = new HitBase();
+        JSONObject singleResult = resultSet.getJSONObject(i);
+        hit.setAbstractText(singleResult.getString("Snippet"));
+        hit.setDate(singleResult.getString("Date"));
+        String title = StringUtils.replace(singleResult.getString("Title"),
+            "", " "); // NOTE(review): the search string is empty; commons-lang documents replace() as returning the text unchanged in that case, so this looks like a no-op. Confirm which character was meant.
+        hit.setTitle(title);
+        hit.setUrl(singleResult.getString("Url"));
+        hit.setSource(singleResult.getString("Source"));
+
+        resp.appendHits(hit);
+      }
+    }
+    return resp;
+  }
+  // Runs a search and parses only the first response line; returns null (not an empty list) when anything throws.
+  public List<HitBase> runSearch(String query) {
+    BingResponse resp = null;
+    try {
+      List<String> resultList = search(query, "", "", 8);
+      resp = populateBingHit(resultList.get(0));
+
+    } catch (Exception e) {
+      // e.printStackTrace();
+      LOG.debug("No news search results for query " + query);
+      return null;
+    }
+    // copy into a fresh list (getHits() already yields HitBase elements, so the cast is redundant)
+    List<HitBase> hits = new ArrayList<HitBase>();
+    for (HitBase h : resp.getHits())
+      hits.add((HitBase) h);
+
+    hits = HitBase.removeDuplicates(hits);
+    return hits;
+  }
+
+  // TODO comment back when dependencies resolved (CopyrightViolations)
+  /*
+   * public List<CopyrightViolations> runCopyRightViolExtenralSearch(String
+   * query, String report) {
+   * 
+   * List<CopyrightViolations> genResult = new ArrayList<CopyrightViolations>();
+   * BingResponse newResp = null; StringDistanceMeasurer meas = new
+   * StringDistanceMeasurer(); try { List<String> resultList = search(query, "",
+   * "", 5);
+   * 
+   * BingResponse resp = populateBingHit(resultList.get(0));
+   * //printSearchResult(resultList.get(0));
+   * 
+   * for(int i=0; i<resp.getHits().size(); i++){ BingHit h1 =
+   * resp.getHits().get(i); String snippet = h1.getAbstractText(); Double sim =
+   * meas.measureStringDistance(report, snippet); if
+   * (sim>snapshotSimilarityThreshold){ //genResult.add(snapshot);
+   * CopyrightViolations cvr = new CopyrightViolations();
+   * cvr.setSnippet(snippet); cvr.setTitle(h1.getTitle());
+   * cvr.setUrl(h1.getDisplayUrl()); genResult.add(cvr); log.debug(new
+   * String("Copyright violation detected in snapshot"
+   * ).toUpperCase()+" : sim = "+ new Double(sim).toString().substring(0, 3)+
+   * " \n "+snippet);
+   * 
+   * } else { log.debug("Different news: sim = "+ new
+   * Double(sim).toString().substring(0, 3)+ " \n "+snippet);
+   * 
+   * }
+   * 
+   * }
+   * 
+   * } catch (Exception e) { e.printStackTrace(); }
+   * 
+   * 
+   * return genResult; }
+   */
+  // Manual smoke test: runs one fixed query and prints the first hit. A null result from runSearch ends up in the catch block below.
+  public static void main(String[] args) {
+    BingQueryRunner self = new BingQueryRunner();
+    try {
+      List<HitBase> resp = self
+          .runSearch("Rates rise at weekly Treasury auction");
+      // "British Actress Lynn Redgrave dies at 67");
+      System.out.print(resp.get(0));
+    } catch (Exception e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+    }
+
+    /*
+     * String[] submittedNews = new String[]{
+     * "Asian airports had already increased security following the Christmas Day attack, but South Korea and Pakistan are thinking about additional measures."
+     * ,
+     * "Europe remains the key origin for air travelers heading to the United States, with about 1000 trans-Atlantic flights a day in 2009."
+     * ,
+     * "DeLaughter became an instant hero of the civil rights movement. Alec Baldwin portrayed him in the 1996 movie, Ghosts of Mississippi and his closing statement was once dubbed one of the greatest closing arguments in modern law."
+     * ,
+     * "After US president made the statement, Cuba protested extra screening for Cubans coming to the US"
+     * ,
+     * 
+     * }; for(String query: submittedNews){ System.out.println(query);
+     * List<CopyrightViolations> genResult =
+     * self.runCopyRightViolExtenralSearch(query, query); if
+     * (genResult.size()>0){
+     * 
+     * System.out.println(genResult.toString()); System.out.println("\n\n");
+     * 
+     * } }
+     */
+
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class BingResponse { // Mutable holder for a hit list plus several int/String fields, all exposed through plain getters and setters.
+  List<HitBase> hits;
+  // The fields in this class have default (package) access.
+  int responseCode;
+
+  String nextPageUrl;
+
+  int totalHits;
+
+  int deepHits;
+
+  int startIndex;
+
+  int pageSize;
+  // Starts with an empty ArrayList so appendHits can be called straight away.
+  public BingResponse() {
+    hits = new ArrayList<HitBase>();
+  }
+  // Adds a single hit to the end of the list.
+  public void appendHits(HitBase hit) {
+    hits.add(hit);
+  }
+  // Returns the internal list itself, not a copy.
+  public List<HitBase> getHits() {
+    return hits;
+  }
+  // Stores the given list reference as-is (no copy, no null check).
+  public void setHits(List<HitBase> hits) {
+    this.hits = hits;
+  }
+
+  public int getResponseCode() {
+    return responseCode;
+  }
+
+  public void setResponseCode(int responseCode) {
+    this.responseCode = responseCode;
+  }
+
+  public String getNextPageUrl() {
+    return nextPageUrl;
+  }
+
+  public void setNextPageUrl(String nextPageUrl) {
+    this.nextPageUrl = nextPageUrl;
+  }
+
+  public int getTotalHits() {
+    return totalHits;
+  }
+
+  public void setTotalHits(int totalHits) {
+    this.totalHits = totalHits;
+  }
+
+  public int getDeepHits() {
+    return deepHits;
+  }
+
+  public void setDeepHits(int deepHits) {
+    this.deepHits = deepHits;
+  }
+
+  public int getStartIndex() {
+    return startIndex;
+  }
+
+  public void setStartIndex(int startIndex) {
+    this.startIndex = startIndex;
+  }
+
+  public int getPageSize() {
+    return pageSize;
+  }
+
+  public void setPageSize(int pageSize) {
+    this.pageSize = pageSize;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingResponse.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+
+public class BingSearchResultsScraper { // Fetches Bing HTML result pages and extracts the text of their <cite> elements.
+  // Downloads url and returns the body with line breaks dropped; on failure returns whatever was read so far (possibly "").
+  protected static String fetchPageBing(String url) {
+    System.out.println("fetch url " + url);
+    String pageContent = null; // never read
+    StringBuffer buf = new StringBuffer();
+    try {
+      URLConnection connection = new URL(url).openConnection();
+      connection.setReadTimeout(50000);
+      connection
+          .setRequestProperty(
+              "User-Agent",
+              "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
+      String line;
+      BufferedReader reader = null;
+      try {
+        reader = new BufferedReader(new InputStreamReader(
+            connection.getInputStream()));
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+      // NOTE(review): if opening the stream failed, reader is still null here; the resulting NullPointerException is swallowed by the outer catch.
+      while ((line = reader.readLine()) != null) {
+        buf.append(line);
+      }
+      // NOTE(review): the reader is never closed.
+    } catch (Exception e) {
+      // e.printStackTrace();
+      System.err.println("error fetching url " + url);
+    }
+
+    return buf.toString();
+  }
+  // Keeps only the page region between the two marker strings, then collects each <cite> text (with <strong> tags removed) that differs from domain.
+  private static List<String> extractURLesFromPage(String content, String domain) {
+    List<String> results = new ArrayList<String>();
+    if (content == null)
+      return results;
+    content = StringUtils.substringBetween(content, ">Advanced</a></div>",
+        "<input type=\"text\" value=");
+    if (content == null)
+      return results;
+    String[] urls = content.split("<cite>");
+    if (urls == null) // String.split does not return null; this guard is purely defensive
+      return results;
+    for (String u : urls) {
+      int endPos = u.indexOf("</cite>");
+      // segments without a closing tag (endPos == -1) and empty cites (endPos == 0) are skipped
+      if (endPos > 0) {
+        u = u.substring(0, endPos).replace("</strong>", "")
+            .replace("<strong>", "");
+        if (!u.equals(domain))
+          results.add(u);
+      }
+    }
+
+    return results;
+  }
+  // Prefixes the seed with a Bing "site:" search URL. NOTE(review): the seed is concatenated without URL-encoding.
+  private static String formRequestURL(String seedURL) {
+    String requestUrl = "http://www.bing.com/search?q=site:" + seedURL;
+
+    return requestUrl;
+  }
+  // Single lookup: fetches the site: result page for domain and returns the extracted entries.
+  public List<String> getURLsForWebDomain(String domain) {
+    return extractURLesFromPage(fetchPageBing(formRequestURL(domain)), domain);
+  }
+  // Two-level lookup: repeats the site: query for every first-level entry and returns the de-duplicated second-level entries (first-level entries themselves are not added).
+  public Set<String> getURLsForWebDomainIterations(String domain) {
+    List<String> results = new ArrayList<String>();
+    List<String> res = extractURLesFromPage(
+        fetchPageBing(formRequestURL(domain)), domain);
+    for (String r : res)
+      results.addAll(extractURLesFromPage(fetchPageBing(formRequestURL(r)), r));
+
+    return new HashSet<String>(results);
+  }
+  // Manual run: prints the two-level result set for one hard-coded site path.
+  public static void main(String[] args) {
+    System.out.println(new BingSearchResultsScraper()
+        .getURLsForWebDomainIterations("www.sfgate.com/entertainment/"));
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.context.annotation.Profile;
+import org.springframework.stereotype.Component;
+
+import com.zvents.ce.common.util.StringDistanceMeasurer;
+import com.zvents.recommend.event_event.utils.CSVWriter;
+
+/**
+ * Queries the Bing JSON web search API and converts the responses into
+ * {@link BingResponse} and {@link HitBase} objects.
+ */
+public class BingWebQueryRunner {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(BingWebQueryRunner.class);
+
+  /** Number of hits requested by {@link #runSearch(String)}. */
+  private static final int DEFAULT_NUM_HITS = 8;
+
+  /** Title similarity above which a later hit is dropped as a duplicate. */
+  private static final double DUPLICATE_TITLE_THRESHOLD = 0.9;
+
+  /**
+   * Builds the request URL for a web search.
+   *
+   * @param query raw query text; it is URL-encoded as UTF-8 here
+   * @param domainWeb not used by this method
+   * @param lang not used by this method; the market is fixed to "en-us"
+   * @param numbOfHits value sent as the "web.count" request field
+   * @return the complete request URL
+   * @throws Exception if the query cannot be encoded
+   */
+  private String constructBingWebUrl(String query, String domainWeb,
+      String lang, int numbOfHits) throws Exception {
+    String codedQuery = URLEncoder.encode(query, "UTF-8");
+
+    String requestUrl = "http://api.search.live.net/json.aspx?Appid="
+        + BingQueryRunner.APP_ID + "&query=" + codedQuery
+        + "&Sources=Web"
+        // Common request fields (optional)
+        + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
+        // News-specific request fields (optional)
+        + "&News.Offset=0";
+
+    return requestUrl;
+  }
+
+  /**
+   * Parses one JSON response document into a {@link BingResponse}.
+   * <p>
+   * The document must contain "SearchResponse" and, inside it, "Web";
+   * otherwise the JSON exception propagates. A missing "Results" array or
+   * "Total" value is only logged, and the response is returned without hits
+   * (and, when "Results" is missing, without the total being set).
+   *
+   * @param response JSON text as returned by the search API
+   * @return the populated response object, never null
+   * @throws Exception if the document or one of its result entries cannot be
+   *           parsed
+   */
+  public BingResponse populateBingHit(String response) throws Exception {
+    BingResponse resp = new BingResponse();
+    JSONObject rootObject = new JSONObject(response);
+    JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
+    JSONObject web = responseObject.getJSONObject("Web"); // "News"
+
+    // the search results are in an array under the key "Results"
+    JSONArray resultSet = null;
+    try {
+      resultSet = web.getJSONArray("Results");
+      int count = (int) web.getLong("Total");
+      resp.setTotalHits(Integer.valueOf(count));
+    } catch (Exception e) {
+      // the logger records the stack trace; no separate printStackTrace
+      LOG.error("\nNo search results", e);
+      LOG.error(response);
+      LOG.error("---------------------");
+    }
+
+    if (resultSet != null) {
+      for (int i = 0; i < resultSet.length(); i++) {
+        BingHit hit = new BingHit();
+        JSONObject singleResult = resultSet.getJSONObject(i);
+        hit.setAbstractText(singleResult.getString("Description"));
+        hit.setDate(singleResult.getString("DateTime"));
+        // NOTE(review): the search string is empty here, which presumably
+        // makes this call a no-op; possibly a marker character was lost from
+        // the literal - TODO confirm
+        String title = StringUtils.replace(singleResult.getString("Title"),
+            "", " ");
+        hit.setTitle(title);
+        hit.setUrl(singleResult.getString("Url"));
+
+        resp.appendHits(hit);
+      }
+    }
+    return resp;
+  }
+
+  /**
+   * Sends the query to the search API and returns the raw response, one list
+   * element per line read. The response is decoded as UTF-8 and the stream is
+   * always closed, also when reading fails.
+   *
+   * @param query raw query text
+   * @param domainWeb passed through to the URL builder, which ignores it
+   * @param lang passed through to the URL builder, which ignores it
+   * @param numbOfHits number of hits to request
+   * @return the response lines in the order received
+   * @throws Exception if the URL cannot be built or the request fails
+   */
+  public ArrayList<String> search(String query, String domainWeb, String lang,
+      int numbOfHits) throws Exception {
+    URL url = new URL(constructBingWebUrl(query, domainWeb, lang, numbOfHits));
+    URLConnection connection = url.openConnection();
+
+    ArrayList<String> result = new ArrayList<String>();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        connection.getInputStream(), "UTF-8"));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        result.add(line);
+      }
+    } finally {
+      reader.close();
+    }
+    return result;
+  }
+
+  /**
+   * Searches with the default hit count, removes near-duplicate titles and
+   * appends the remaining hits to the CSV log.
+   *
+   * @param query raw query text
+   * @return the de-duplicated hits, or null if the search or parsing failed
+   */
+  public List<HitBase> runSearch(String query) {
+    List<HitBase> hits = fetchDedupedHits(query, DEFAULT_NUM_HITS);
+    if (hits == null) {
+      return null;
+    }
+
+    writeHitsToCsv(query, hits);
+
+    return hits;
+  }
+
+  /**
+   * Appends one row (query, title, url) per hit to "bingSearchResults.csv" in
+   * the working directory. I/O failures are logged and not propagated; the
+   * writer is closed even when writing fails.
+   *
+   * @param query the query that produced the hits
+   * @param hits hits to record
+   */
+  protected void writeHitsToCsv(String query, List<HitBase> hits) {
+    try {
+      CSVWriter writer = new CSVWriter(new FileWriter("bingSearchResults.csv",
+          true));
+      try {
+        for (HitBase hit : hits) {
+          writer
+              .writeNext(new String[] { query, hit.getTitle(), hit.getUrl() });
+        }
+      } finally {
+        writer.close();
+      }
+    } catch (IOException e) {
+      LOG.error(e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Searches with the given hit count and removes near-duplicate titles.
+   * Unlike {@link #runSearch(String)} nothing is written to the CSV log.
+   *
+   * @param query raw query text
+   * @param num number of hits to request
+   * @return the de-duplicated hits, or null if the search or parsing failed
+   */
+  public List<HitBase> runSearch(String query, int num) {
+    return fetchDedupedHits(query, num);
+  }
+
+  /**
+   * Shared implementation of both runSearch variants: query, parse the first
+   * response line, copy the hits into a HitBase list and de-duplicate it.
+   */
+  private List<HitBase> fetchDedupedHits(String query, int num) {
+    BingResponse resp = null;
+    try {
+      List<String> resultList = search(query, "", "", num);
+      resp = populateBingHit(resultList.get(0));
+    } catch (Exception e) {
+      LOG.debug("No news search results for query " + query, e);
+      return null;
+    }
+
+    // collect as the super class type
+    List<HitBase> hits = new ArrayList<HitBase>();
+    for (HitBase h : resp.getHits()) {
+      hits.add(h);
+    }
+
+    return removeDuplicates(hits, DUPLICATE_TITLE_THRESHOLD);
+  }
+
+  /**
+   * Returns the hits without those whose title is too similar to the title of
+   * an earlier hit. Hits with an empty title are never compared and are
+   * therefore always kept.
+   * <p>
+   * If anything fails during the comparison the problem is logged and the
+   * hits collected so far are returned.
+   *
+   * @param hits hits to filter; the list itself is not modified
+   * @param imageDupeThresh similarity above which two titles count as
+   *          duplicates
+   * @return a new list with the retained hits in their original order
+   */
+  public static List<HitBase> removeDuplicates(List<HitBase> hits,
+      double imageDupeThresh) {
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+
+    List<Integer> idsToRemove = new ArrayList<Integer>();
+    List<HitBase> hitsDedup = new ArrayList<HitBase>();
+    try {
+      for (int i = 0; i < hits.size(); i++) {
+        for (int j = i + 1; j < hits.size(); j++) {
+          String title1 = hits.get(i).getTitle();
+          String title2 = hits.get(j).getTitle();
+          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) {
+            continue;
+          }
+          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
+            // dupes found, the later list member is to be deleted
+            idsToRemove.add(j);
+          }
+        }
+      }
+      for (int i = 0; i < hits.size(); i++) {
+        if (!idsToRemove.contains(i)) {
+          hitsDedup.add(hits.get(i));
+        }
+      }
+      if (hitsDedup.size() < hits.size()) {
+        LOG.debug("Removed duplicates from relevant search results, including "
+            + hits.get(idsToRemove.get(0)).getTitle());
+      }
+    } catch (Exception e) {
+      // keep the cause in the log, as HitBase.removeDuplicates does
+      LOG.error("Problem removing duplicates from relevant images", e);
+    }
+
+    return hitsDedup;
+  }
+
+  /**
+   * Runs a "site:" query and returns the total hit count reported for it.
+   *
+   * @param site site to count pages for
+   * @return the reported total, or 0 if the search failed or no total was
+   *         recorded in the response
+   */
+  public int getTotalPagesAtASite(String site) {
+    BingResponse resp = null;
+    try {
+      List<String> resultList = search("site:" + site, "", "", 10);
+      resp = populateBingHit(resultList.get(0));
+    } catch (Exception e) {
+      LOG.debug("No news search results for query = 'site:" + site, e);
+      return 0;
+    }
+
+    // NOTE(review): totalHits is presumably an Integer that stays null when
+    // populateBingHit could not read the total - confirm against BingResponse
+    Integer total = resp.totalHits;
+    return total == null ? 0 : total.intValue();
+  }
+
+  /** Manual smoke run against two sample sites; nothing is printed. */
+  public static void main(String[] args) {
+    new BingWebQueryRunner().getTotalPagesAtASite("www.zvents.com");
+    new BingWebQueryRunner().runSearch("site:www.tripadvisor.com", 10);
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import com.zvents.ce.common.util.StringDistanceMeasurer;
+
+/**
+ * A piece of result text together with its score, the original text it was
+ * derived from and the URL it came from.
+ * <p>
+ * Equality is approximate: two fragments are equal when their result texts
+ * are more than 0.8 similar according to {@link StringDistanceMeasurer}.
+ */
+public class Fragment {
+
+  public String resultText; // result
+
+  public double score;
+
+  public String fragment; // original
+
+  public String sourceURL;
+
+  /**
+   * @param text the result text
+   * @param score the score assigned to the text
+   */
+  public Fragment(String text, double score) {
+    this.resultText = text;
+    this.score = score;
+  }
+
+  public String getResultText() {
+    return resultText;
+  }
+
+  public void setResultText(String resultText) {
+    this.resultText = resultText;
+  }
+
+  public double getScore() {
+    return score;
+  }
+
+  public void setScore(double score) {
+    this.score = score;
+  }
+
+  public String getFragment() {
+    return fragment;
+  }
+
+  public void setFragment(String fragment) {
+    this.fragment = fragment;
+  }
+
+  public String getSourceURL() {
+    return sourceURL;
+  }
+
+  public void setSourceURL(String sourceURL) {
+    this.sourceURL = sourceURL;
+  }
+
+  /** Returns the result text (which may be null). */
+  @Override
+  public String toString() {
+    return this.resultText;
+  }
+
+  /**
+   * Two fragments are equal when both result texts are null, or when both are
+   * non-null and their measured similarity exceeds 0.8. Score, original
+   * fragment and source URL are not compared.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    Fragment other = (Fragment) o;
+
+    if (resultText == null || other.resultText == null) {
+      // equal only when both texts are missing
+      return resultText == null && other.resultText == null;
+    }
+
+    StringDistanceMeasurer sdm = new StringDistanceMeasurer();
+    return sdm.measureStringDistance(resultText, other.resultText) > 0.8;
+  }
+
+  /**
+   * Hash that is consistent with the approximate {@link #equals(Object)}.
+   * <p>
+   * Similar but non-identical texts compare equal, so hashing the text itself
+   * would give equal fragments different hash codes. The only property that
+   * equal fragments are guaranteed to share is whether the text is null, so
+   * that is all the hash may depend on.
+   */
+  @Override
+  public int hashCode() {
+    return resultText == null ? 0 : 1;
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/Fragment.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+
+import com.zvents.bing.Fragment;
+import com.zvents.bing.HitBase;
+import com.zvents.cg.RelatedSentenceFinder;
+import com.zvents.recommend.event_event.utils.Utils;
+
+/**
+ * Static helpers that filter and clean up sentences mined from web pages.
+ */
+public class GeneratedSentenceProcessor {
+
+  /**
+   * Phrases (lower case) whose presence anywhere in the lower-cased sentence
+   * causes rejection.
+   */
+  private static final String[] REJECTED_PHRASES = { "click here", " wikip",
+      "copyright", "operating hours", "days per week", "click for", "photos",
+      "find the latest", "terms of service", "clicking here", "skip to",
+      "sidebar", "tags:", "available online", "get online", "buy online",
+      "not valid", "discount", "official site", "related searches",
+      "permission is granted", "free license", "under the terms", "wikipedia" };
+
+  /** Prefixes (lower case) that cause rejection. */
+  private static final String[] REJECTED_PREFIXES = { "subscribe",
+      "posted by" };
+
+  /** Suffixes (lower case) that cause rejection. */
+  private static final String[] REJECTED_SUFFIXES = { "the", "the." };
+
+  /**
+   * Decides whether a mined sentence is usable and, if so, returns a cleaned
+   * copy of it.
+   * <p>
+   * A sentence is rejected when it has too many commas relative to its word
+   * count, more than two parts separated by '|' or '>', many runs of three
+   * spaces, a known boilerplate phrase, or (for sentences shorter than 200
+   * characters) two or more navigation-style symbols.
+   *
+   * @param sent candidate sentence
+   * @return the cleaned sentence, or null if the sentence is null or rejected
+   */
+  public static String acceptableMinedSentence(String sent) {
+    if (sent == null) {
+      return null;
+    }
+
+    // if too many commas => seo text
+    String[] commas = StringUtils.split(sent, ',');
+    String[] spaces = StringUtils.split(sent, ' ');
+    if ((float) commas.length / (float) spaces.length > 0.7) {
+      System.out.println("Rejection: too many commas");
+      return null;
+    }
+
+    if (StringUtils.split(sent, '|').length > 2
+        || StringUtils.split(sent, '>').length > 2) {
+      System.out.println("Rejection: too many |s or >s ");
+      return null;
+    }
+
+    String sentTry = sent.toLowerCase();
+
+    // too many long spaces are suspicious: removing the three-space runs
+    // shortens the text, so the removed amount is original minus stripped
+    String sentSpaces = sentTry.replace("   ", "");
+    if (sentTry.length() - sentSpaces.length() > 10) {
+      return null;
+    }
+
+    if (looksLikeBoilerplate(sentTry)) {
+      return null;
+    }
+
+    // count symbols indicating wrong parts of page to mine for text: each
+    // symbol grows the text by two characters, so a growth of 4 or more means
+    // at least two symbols. Short sentences with that many are rejected.
+    // NOTE(review): the second symbol below is U+FFFD in this source,
+    // presumably a character damaged by an encoding problem - TODO confirm
+    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")
+        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")
+        .replace("-", "&&&").replace("%", "&&&");
+    if ((sentWrongSym.length() - sentTry.length()) >= 4
+        && sentTry.length() < 200) {
+      return null;
+    }
+
+    sent = sent.replace('[', ' ').replace(']', ' ')
+        .replace("_should_find_orig_", "").replace(".   .", ". ")
+        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")
+        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")
+        .replace("2008", "2011").replace("2006", "2011")
+        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")
+        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")
+        .replace("(more.)", "").replace("more.", "").replace("<more>", "")
+        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")
+        .replace("p&gt;", "");
+    // TODO .replace("a.", ".");
+
+    // cut off a trailing " posted ..." part unless it starts the sentence
+    int endIndex = sent.indexOf(" posted");
+    if (endIndex > 0) {
+      sent = sent.substring(0, endIndex);
+    }
+
+    return sent;
+  }
+
+  /**
+   * Checks a lower-cased sentence against the rejected phrases, prefixes and
+   * suffixes.
+   */
+  private static boolean looksLikeBoilerplate(String lowerCased) {
+    for (String phrase : REJECTED_PHRASES) {
+      if (lowerCased.contains(phrase)) {
+        return true;
+      }
+    }
+    for (String prefix : REJECTED_PREFIXES) {
+      if (lowerCased.startsWith(prefix)) {
+        return true;
+      }
+    }
+    for (String suffix : REJECTED_SUFFIXES) {
+      if (lowerCased.endsWith(suffix)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Normalizes a sentence taken from a page: strips HTML, removes trailing
+   * dot runs, collapses whitespace, drops a short part after a pipe and makes
+   * sure the result ends with a period.
+   *
+   * @param pageSentence sentence as found on the page; may be null
+   * @return the normalized sentence, or an empty string for null input
+   */
+  public static String processSentence(String pageSentence) {
+    if (pageSentence == null) {
+      return "";
+    }
+    pageSentence = Utils.fullStripHTML(pageSentence);
+
+    // each call works on the result of the previous one
+    pageSentence = StringUtils.chomp(pageSentence, "..");
+    pageSentence = StringUtils.chomp(pageSentence, ". .");
+    pageSentence = StringUtils.chomp(pageSentence, " .");
+    pageSentence = StringUtils.chomp(pageSentence, ".");
+    pageSentence = StringUtils.chomp(pageSentence, "...");
+    pageSentence = StringUtils.chomp(pageSentence, " ....");
+    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")
+        .replace("(.)", "");
+
+    pageSentence = pageSentence.trim();
+    // make single spaces everywhere
+    pageSentence = pageSentence.replaceAll("\\s+", " ");
+
+    // remove the shorter part of the sentence after a pipe: with exactly two
+    // parts, of which the first is more than three times as long, keep only
+    // the first. Taking the part itself avoids cutting by position, which
+    // lost a character when no space preceded the pipe and failed when the
+    // sentence started with a pipe.
+    String[] pipes = StringUtils.split(pageSentence, '|');
+    if (pipes.length == 2
+        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {
+      pageSentence = pipes[0].trim();
+    }
+
+    if (!StringUtils.contains(pageSentence, '.')
+        && !StringUtils.contains(pageSentence, '?')
+        && !StringUtils.contains(pageSentence, '!')) {
+      pageSentence = pageSentence + ". ";
+    }
+
+    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
+    if (!pageSentence.endsWith(".")) {
+      pageSentence += ". ";
+    }
+    return pageSentence;
+  }
+
+  /**
+   * Manual smoke run: looks up related opinions for a sample sentence and
+   * concatenates the longer fragment texts. The result is not printed.
+   */
+  public static void main(String[] args) {
+    RelatedSentenceFinder f = new RelatedSentenceFinder();
+    try {
+      List<HitBase> hits = f
+          .findRelatedOpinionsForSentence(
+              "Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",
+              Arrays
+                  .asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));
+      StringBuffer buf = new StringBuffer();
+
+      for (HitBase h : hits) {
+        List<Fragment> frags = h.getFragments();
+        for (Fragment fr : frags) {
+          if (fr.getResultText() != null && fr.getResultText().length() > 3) {
+            buf.append(fr.getResultText());
+          }
+        }
+      }
+
+    } catch (Exception e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+    }
+
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.zvents.ce.common.util.StringDistanceMeasurer;
+
+/**
+ * A single search hit: URLs, title, abstract and date, plus the page
+ * content, original sentences and fragments attached to it.
+ */
+public class HitBase {
+  private static final Logger LOG = LoggerFactory.getLogger(HitBase.class);
+
+  private String abstractText;
+
+  private String clickUrl;
+
+  private String displayUrl;
+
+  private String url;
+
+  private String date;
+
+  private String title;
+
+  private Double generWithQueryScore;
+
+  private String source;
+
+  private List<String> originalSentences;
+
+  private String pageContent;
+
+  private List<Fragment> fragments;
+
+  /** Creates a hit with all properties unset. */
+  public HitBase() {
+    super();
+  }
+
+  public String getPageContent() {
+    return pageContent;
+  }
+
+  /**
+   * Creates a hit from one original sentence and the sentences generated for
+   * it. Every generated sentence becomes a {@link Fragment} with score 0.0;
+   * all other properties stay null.
+   *
+   * @param orig the original sentence
+   * @param generateds the generated sentences; must not be null
+   */
+  public HitBase(String orig, String[] generateds) {
+    originalSentences = new ArrayList<String>();
+    originalSentences.add(orig);
+
+    fragments = new ArrayList<Fragment>();
+    for (String sent : generateds) {
+      Fragment f = new Fragment(sent, 0.0);
+      fragments.add(f);
+    }
+    // the rest of params are null
+  }
+
+  public void setPageContent(String pageContent) {
+    this.pageContent = pageContent;
+  }
+
+  public List<Fragment> getFragments() {
+    return fragments;
+  }
+
+  public void setFragments(List<Fragment> fragments) {
+    this.fragments = fragments;
+  }
+
+  public String getSource() {
+    return source;
+  }
+
+  public void setSource(String source) {
+    this.source = source;
+  }
+
+  public List<String> getOriginalSentences() {
+    return originalSentences;
+  }
+
+  public void setOriginalSentences(List<String> originalSentences) {
+    this.originalSentences = originalSentences;
+  }
+
+  public String getTitle() {
+    return title;
+  }
+
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+  public String getAbstractText() {
+    return abstractText;
+  }
+
+  public void setAbstractText(String abstractText) {
+    this.abstractText = abstractText;
+  }
+
+  public String getClickUrl() {
+    return clickUrl;
+  }
+
+  public void setClickUrl(String clickUrl) {
+    this.clickUrl = clickUrl;
+  }
+
+  public String getDisplayUrl() {
+    return displayUrl;
+  }
+
+  public void setDisplayUrl(String displayUrl) {
+    this.displayUrl = displayUrl;
+  }
+
+  public String getUrl() {
+    return url;
+  }
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  public String getDate() {
+    return date;
+  }
+
+  public void setDate(String date) {
+    this.date = date;
+  }
+
+  public Double getGenerWithQueryScore() {
+    return generWithQueryScore;
+  }
+
+  public void setGenerWithQueryScore(Double generWithQueryScore) {
+    this.generWithQueryScore = generWithQueryScore;
+  }
+
+  /**
+   * Returns the string form of the fragment list when there is at least one
+   * fragment, otherwise the title. The title, and therefore the result, may
+   * be null.
+   */
+  @Override
+  public String toString() {
+    if (this.getFragments() != null && this.getFragments().size() > 0) {
+      return this.getFragments().toString();
+    } else {
+      return this.title;
+    }
+  }
+
+  /**
+   * Joins the string forms of the given hits with " | ", starting a new line
+   * after every second entry. Hits whose string form is null or not longer
+   * than 15 characters are left out.
+   *
+   * @param hits hits to render
+   * @return the joined text, empty if no hit qualifies
+   */
+  public static String toString(List<HitBase> hits) {
+    StringBuilder buf = new StringBuilder();
+    boolean pBreak = true;
+    for (HitBase hit : hits) {
+      String fragm = hit.toString();
+      // a hit without fragments and without a title has no string form
+      if (fragm == null || fragm.length() <= 15) {
+        continue;
+      }
+      buf.append(fragm).append(pBreak ? " | " : " | \n");
+      // switch to opposite
+      pBreak = !pBreak;
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Returns the hits without those whose title is more than 0.8 similar to
+   * the title of an earlier hit. Hits with an empty title are never compared
+   * and are therefore always kept.
+   * <p>
+   * If anything fails during the comparison the problem is logged and the
+   * hits collected so far are returned.
+   *
+   * @param hits hits to filter; the list itself is not modified
+   * @return a new list with the retained hits in their original order
+   */
+  public static List<HitBase> removeDuplicates(List<HitBase> hits) {
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+    double imageDupeThresh = 0.8; // if more similar, then considered dupes
+    List<Integer> idsToRemove = new ArrayList<Integer>();
+    List<HitBase> hitsDedup = new ArrayList<HitBase>();
+    try {
+      for (int i = 0; i < hits.size(); i++) {
+        for (int j = i + 1; j < hits.size(); j++) {
+          String title1 = hits.get(i).getTitle();
+          String title2 = hits.get(j).getTitle();
+          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) {
+            continue;
+          }
+          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
+            // dupes found, later list member to be deleted
+            idsToRemove.add(j);
+          }
+        }
+      }
+      for (int i = 0; i < hits.size(); i++) {
+        if (!idsToRemove.contains(i)) {
+          hitsDedup.add(hits.get(i));
+        }
+      }
+      if (hitsDedup.size() < hits.size()) {
+        LOG.debug("Removed duplicates from relevant search results, including "
+            + hits.get(idsToRemove.get(0)).getTitle());
+      }
+    } catch (Exception e) {
+      LOG.error("Problem removing duplicates from relevant images", e);
+    }
+
+    return hitsDedup;
+
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,581 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+import opennlp.tools.similarity.apps.utils.Utils;
+import opennlp.tools.textsimilarity.LemmaPair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.SyntMatcher;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.LoggerFactory;
+
+public class RelatedSentenceFinder {
+
+  // Downloads the web page behind a search hit so that full sentences can be
+  // recovered for truncated snippet fragments.
+  // TODO outsource the timeout value
+  PageFetcher pFetcher = new PageFetcher();
+
+  // scores the list of parse tree chunks produced by a syntactic match
+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+  // only used for rendering match results as text in diagnostic output
+  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+
+  private static final org.slf4j.Logger LOG = LoggerFactory
+      .getLogger(RelatedSentenceFinder.class);
+
+  // string similarity between the seed sentence and a mined sentence
+  static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
+
+  // used to indicate that a sentence is an opinion, so more appropriate
+  // NOTE(review): "appeal", "ask" and "believe" are listed twice; the list is
+  // only scanned for a first substring hit, so the repeats have no effect.
+  static List<String> MENTAL_VERBS = new ArrayList<String>(
+      Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
+          "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
+          "check", "confirm", "convince", "deny", "disagree", "explain",
+          "ignore", "inform", "remind", "request", "suggest", "suppose",
+          "think", "threaten", "try", "understand" }));
+
+  // NOTE(review): not referenced anywhere in this class.
+  private static final int MAX_FRAGMENT_SENTS = 10;
+
+  /** Creates a finder with default page fetcher and scorers. */
+  public RelatedSentenceFinder() {
+
+  }
+
+  /**
+   * Runs one web search for {@code word} and hands back the raw hits, without
+   * mining sentences or checking relevance.
+   *
+   * @param word the search query
+   * @param sents ignored by this method
+   * @return whatever the query runner returns for {@code word}
+   * @throws Exception if the search fails
+   */
+  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
+      List<String> sents) throws Exception {
+    return new YahooQueryRunner().runSearch(word);
+  }
+
+  /**
+   * Builds search queries from the seed sentence, runs each of them and
+   * augments every usable hit with sentences mined from its snippet/page.
+   * Hits without abstract text and hits whose URL contains ".pdf" (not at
+   * position 0) are skipped. Progress is printed to standard output.
+   *
+   * @param sentence the seed sentence
+   * @param sents other sentences of the seed text, used as alternative match
+   *        targets during relevance verification
+   * @return the augmented hits with near-duplicates removed
+   * @throws Exception if query building or searching fails
+   */
+  public List<HitBase> findRelatedOpinionsForSentence(String sentence,
+      List<String> sents) throws Exception {
+    List<HitBase> collected = new ArrayList<HitBase>();
+    System.out.println(" \n\n=== Sentence  = " + sentence);
+    List<String> queries = buildSearchEngineQueryFromSentence(sentence);
+
+    YahooQueryRunner searcher = new YahooQueryRunner();
+    for (String query : queries) {
+      System.out.println("\nquery = " + query);
+      List<HitBase> found = searcher.runSearch(query);
+      if (found == null) {
+        continue;
+      }
+      for (HitBase hit : found) {
+        // need some snippet text to mine from
+        if (hit.getAbstractText() == null) {
+          continue;
+        }
+        // exclude pdf
+        if (hit.getUrl().indexOf(".pdf") > 0) {
+          continue;
+        }
+        HitBase augmented = augmentWithMinedSentencesAndVerifyRelevance(hit,
+            sentence, sents);
+        collected.add(augmented);
+      }
+    }
+
+    return removeDuplicatesFromResultantHits(collected);
+  }
+
+  /**
+   * Searches the web for details about an entity (e.g. an event or performer
+   * name) by combining its name with groups of frequently used "performing"
+   * verbs, one search per verb group, and augments every usable hit with
+   * sentences mined from its snippet/page. Hits without abstract text and hits
+   * whose URL contains ".pdf" (not at position 0) are skipped.
+   *
+   * @param sentence the entity name to write about
+   * @param sents other sentences of the seed text, used as alternative match
+   *        targets during relevance verification
+   * @return the augmented hits with near-duplicates removed
+   * @throws Exception if searching fails
+   */
+  public List<HitBase> findActivityDetailsForEventGroupName(String sentence,
+      List<String> sents) throws Exception {
+    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
+    System.out.println(" \n\n=== Entity to write about = " + sentence);
+    // each element is one group of verbs appended to the entity name
+    String[] frequentPerformingVerbs = {
+        " release announce celebrate discover", "introduce enjoy follow",
+        "open present show", "meet enjoy follow create",
+        "discover continue produce" };
+
+    // The former "sentence + frequentPerformingVerbs" concatenated the array's
+    // identity string into a query list that was never read; it is removed.
+
+    YahooQueryRunner yrunner = new YahooQueryRunner();
+    for (String verbAddition : frequentPerformingVerbs) {
+      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+          + verbAddition);
+      if (searchResult != null) {
+        for (HitBase item : searchResult) { // got some text from .html
+          if (item.getAbstractText() != null
+              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
+            opinionSentencesToAdd
+                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
+                    sentence, sents));
+          }
+        }
+      }
+    }
+
+    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
+    return opinionSentencesToAdd;
+  }
+
+  /**
+   * Derives web search queries from a text. The text is split into sentences
+   * and parsed; from every chunk of the first chunk group the lemmas tagged as
+   * noun ("N...") or adjective ("J...") are joined into a query in which each
+   * term is prefixed with " +".
+   * <p>
+   * First pass: only chunks yielding 2 to 5 terms are used, and chunks with
+   * fewer than 4 terms must consist solely of words containing an upper-case
+   * character (expected to be a person name, title or geo location). If that
+   * produces nothing, a second pass accepts every chunk with at least 2 terms.
+   * Near-duplicate queries are removed and the original text is appended as
+   * the last query.
+   *
+   * @param sentence the seed text
+   * @return the generated queries followed by {@code sentence} itself
+   */
+  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
+    ParseTreeChunk matcher = new ParseTreeChunk();
+    SyntMatcher pos = SyntMatcher.getInstance();
+
+    // collect phrase pairs from the first parse of every detected sentence
+    List<LemmaPair> origChunks1 = new ArrayList<LemmaPair>();
+    String[] sents1 = pos.getSentenceDetectorME().sentDetect(sentence);
+    for (String s1 : sents1) {
+      Parse[] parses1 = pos.parseLine(s1, pos.getParser(), 1);
+      origChunks1.addAll(pos.getAllPhrasesTWPairs(parses1[0]));
+    }
+
+    List<ParseTreeChunk> chunk1List = matcher.buildChunks(origChunks1);
+    List<List<ParseTreeChunk>> sent1GrpLst = matcher
+        .groupChunksAsParses(chunk1List);
+
+    // only the first chunk group is used as a source of query phrases
+    List<ParseTreeChunk> nPhrases = sent1GrpLst.get(0);
+    List<String> queryArrayStr = new ArrayList<String>();
+    for (ParseTreeChunk ch : nPhrases) {
+      String terms = joinNounAndAdjectiveLemmas(ch);
+      int len = terms.split(" ").length;
+      if (len < 2 || len > 5)
+        continue;
+      // short phrases are accepted only if every word has an upper-case char
+      if (len < 4 && !everyWordHasUpperCase(terms))
+        continue;
+      queryArrayStr.add(toRequiredTermsQuery(terms));
+    }
+    if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
+                                    // keywords
+      for (ParseTreeChunk ch : nPhrases) {
+        String terms = joinNounAndAdjectiveLemmas(ch);
+        if (terms.split(" ").length < 2)
+          continue;
+        queryArrayStr.add(toRequiredTermsQuery(terms));
+      }
+    }
+
+    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
+    queryArrayStr.add(sentence);
+
+    return queryArrayStr;
+
+  }
+
+  // Joins, separated by single spaces, the lemmas of a chunk whose POS tag
+  // starts with "N" or "J"; the result is trimmed.
+  private static String joinNounAndAdjectiveLemmas(ParseTreeChunk ch) {
+    StringBuilder terms = new StringBuilder();
+    int size = ch.getLemmas().size();
+    for (int i = 0; i < size; i++) {
+      if (ch.getPOSs().get(i).startsWith("N")
+          || ch.getPOSs().get(i).startsWith("J")) {
+        terms.append(ch.getLemmas().get(i)).append(" ");
+      }
+    }
+    return terms.toString().trim();
+  }
+
+  // Tells whether no space-separated word equals its own lower-cased form.
+  private static boolean everyWordHasUpperCase(String terms) {
+    for (String w : terms.split(" ")) {
+      if (w.toLowerCase().equals(w))
+        return false;
+    }
+    return true;
+  }
+
+  // Turns "a b c" into " +a +b +c" so that every term is marked as required.
+  private static String toRequiredTermsQuery(String terms) {
+    return " +" + terms.trim().replace(" ", " +");
+  }
+
+  /**
+   * Removes near-duplicate queries to ease duplicate cleaning and to avoid
+   * repetitive searches afterwards. Every pair of queries is compared; when
+   * the measured similarity exceeds 0.8 the later one of the pair is dropped.
+   * Empty or null queries are never compared and are therefore always kept.
+   *
+   * @param hits the queries to clean
+   * @return a new list with the surviving queries in their original order; if
+   *         an exception occurs it is logged and the list built so far
+   *         (usually empty) is returned
+   */
+  public static List<String> removeDuplicatesFromQueries(List<String> hits) {
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+    double dupeThresh = 0.8; // if more similar, then considered dupes; was 0.7
+    List<String> hitsDedup = new ArrayList<String>();
+    try {
+      int total = hits.size();
+      // isDupe[k] is set when element k duplicates some earlier element
+      boolean[] isDupe = new boolean[total];
+      int firstDupe = -1; // first duplicate detected, reported in the log
+      for (int i = 0; i < total; i++)
+        for (int j = i + 1; j < total; j++) {
+          String query1 = hits.get(i);
+          String query2 = hits.get(j);
+          if (StringUtils.isEmpty(query1) || StringUtils.isEmpty(query2))
+            continue;
+          if (meas.measureStringDistance(query1, query2) > dupeThresh) {
+            isDupe[j] = true; // dupes found, later list member to be deleted
+            if (firstDupe < 0)
+              firstDupe = j;
+          }
+        }
+
+      for (int i = 0; i < total; i++)
+        if (!isDupe[i])
+          hitsDedup.add(hits.get(i));
+
+      if (firstDupe >= 0) {
+        LOG.debug("Removed duplicates from formed query, including "
+            + hits.get(firstDupe));
+      }
+
+    } catch (Exception e) {
+      // keep the cause so that failures can be diagnosed
+      LOG.error("Problem removing duplicates from query list", e);
+    }
+
+    return hitsDedup;
+
+  }
+
+  /**
+   * Removes near-duplicate search hits. Every pair of hits is compared by the
+   * text returned from {@code toString()}; when the measured similarity
+   * exceeds 0.8 the later one of the pair is dropped. Hits whose text is empty
+   * are never compared and are therefore always kept.
+   *
+   * @param hits the hits to clean
+   * @return a new list with the surviving hits in their original order; if an
+   *         exception occurs it is logged and the list built so far (usually
+   *         empty) is returned
+   */
+  public static List<HitBase> removeDuplicatesFromResultantHits(
+      List<HitBase> hits) {
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+    double dupeThresh = 0.8; // if more similar, then considered dupes; was 0.7
+    List<HitBase> hitsDedup = new ArrayList<HitBase>();
+    try {
+      int total = hits.size();
+      // isDupe[k] is set when element k duplicates some earlier element
+      boolean[] isDupe = new boolean[total];
+      int firstDupe = -1; // first duplicate detected, reported in the log
+      for (int i = 0; i < total; i++)
+        for (int j = i + 1; j < total; j++) {
+          String text1 = hits.get(i).toString();
+          String text2 = hits.get(j).toString();
+          if (StringUtils.isEmpty(text1) || StringUtils.isEmpty(text2))
+            continue;
+          if (meas.measureStringDistance(text1, text2) > dupeThresh) {
+            isDupe[j] = true; // dupes found, later list member to be deleted
+            if (firstDupe < 0)
+              firstDupe = j;
+          }
+        }
+
+      for (int i = 0; i < total; i++)
+        if (!isDupe[i])
+          hitsDedup.add(hits.get(i));
+
+      if (firstDupe >= 0) {
+        LOG.debug("Removed duplicates from resultant hits, including "
+            + hits.get(firstDupe));
+      }
+
+    } catch (Exception e) {
+      // keep the cause so that failures can be diagnosed
+      LOG.error("Problem removing duplicates from resultant hits", e);
+    }
+
+    return hitsDedup;
+
+  }
+
+  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
+      String originalSentence, List<String> sentsAll) {
+    // put orig sentence in structure
+    List<String> origs = new ArrayList<String>();
+    origs.add(originalSentence);
+    item.setOriginalSentences(origs);
+    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
+        .replace("  ", " ").replace("  ", " ");
+    // generation results for this sentence
+    List<Fragment> result = new ArrayList<Fragment>();
+    // form plain text from snippet
+    String snapshot = item.getAbstractText().replace("<b>", " ")
+        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
+
+    SyntMatcher sm = SyntMatcher.getInstance();
+    // fix a template expression which can be substituted by original if
+    // relevant
+    String snapshotMarked = snapshot.replace("...", " _should_find_orig_ .");
+    String[] fragments = sm.getSentenceDetectorME().sentDetect(snapshotMarked);
+    List<String> allFragms = new ArrayList<String>();
+    allFragms.addAll(Arrays.asList(fragments));
+
+    String[] sents = null;
+    try {
+      if (snapshotMarked.length() != snapshot.length()) {
+        String downloadedPage = pFetcher.fetchPage(item.getUrl());
+        if (downloadedPage != null && downloadedPage.length() > 100) {
+          item.setPageContent(downloadedPage);
+          String pageContent = Utils.fullStripHTML(item.getPageContent());
+          pageContent = pageContent.trim().replace("  ", ". ")
+              .replace("..", ".").replace(". . .", " ").trim(); // sometimes
+                                                                // html breaks
+                                                                // are converted
+                                                                // into ' ' (two
+                                                                // spaces), so
+                                                                // we need to
+                                                                // put '.'
+          sents = sm.getSentenceDetectorME().sentDetect(pageContent);
+          sents = cleanListOfSents(sents);
+        }
+      }
+    } catch (Exception e) {
+      // TODO Auto-generated catch block
+      // e.printStackTrace();
+      System.err
+          .println("Problem downloading  the page and splitting into sentences");
+    }
+
+    for (String fragment : allFragms) {
+      String followSent = null;
+      if (fragment.length() < 50)
+        continue;
+      String pageSentence = "";
+      // try to find original sentence from webpage
+      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
+          && sents.length > 0)
+        try {
+          String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
+              fragment.replace("_should_find_orig_", ""), sents);
+          pageSentence = mainAndFollowSent[0];
+          followSent = mainAndFollowSent[1];
+
+        } catch (Exception e) {
+          // TODO Auto-generated catch block
+          e.printStackTrace();
+        }
+      else
+        // or get original snippet
+        pageSentence = fragment;
+      if (pageSentence != null)
+        pageSentence.replace("_should_find_orig_", "");
+
+      // resultant sentence SHOULD NOT be longer than twice the size of
+      // snippet fragment
+      if (pageSentence != null
+          && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
+                                                                                // 2.0,
+                                                                                // but
+                                                                                // since
+                                                                                // snippet
+                                                                                // sentences
+                                                                                // are
+                                                                                // rather
+                                                                                // short
+                                                                                // now...
+        try { // get score from syntactic match between sentence in
+              // original text and mined sentence
+          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
+
+          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+              + " " + title, originalSentence);
+          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
+            System.out
+                .println("Rejected Sentence : No verb OR Yes imperative verb :"
+                    + pageSentence);
+            continue;
+          }
+
+          syntScore = parseTreeChunkListScorer
+              .getParseTreeChunkListScore(match);
+          System.out.println(parseTreeChunk.listToString(match) + " "
+              + syntScore + "\n pre-processed sent = '" + pageSentence);
+
+          if (syntScore < 1.5) { // trying other sents
+            for (String currSent : sentsAll) {
+              if (currSent.startsWith(originalSentence))
+                continue;
+              match = sm.matchOrigSentencesCache(currSent, pageSentence);
+              double syntScoreCurr = parseTreeChunkListScorer
+                  .getParseTreeChunkListScore(match);
+              if (syntScoreCurr > syntScore) {
+                syntScore = syntScoreCurr;
+              }
+            }
+            if (syntScore > 1.5) {
+              System.out.println("Got match with other sent: "
+                  + parseTreeChunk.listToString(match) + " " + syntScore);
+            }
+          }
+
+          measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
+              originalSentence, pageSentence);
+
+          // now possibly increase score by finding mental verbs
+          // indicating opinions
+          for (String s : MENTAL_VERBS) {
+            if (pageSentence.indexOf(s) > -1) {
+              mentalScore += 0.3;
+              break;
+            }
+          }
+
+          if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
+              && measScore < 0.8 && pageSentence.length() > 40) {
+            String pageSentenceProc = GeneratedSentenceProcessor
+                .acceptableMinedSentence(pageSentence);
+            if (pageSentenceProc != null) {
+              pageSentenceProc = GeneratedSentenceProcessor
+                  .processSentence(pageSentenceProc);
+              if (followSent != null) {
+                pageSentenceProc += " "
+                    + GeneratedSentenceProcessor.processSentence(followSent);
+              }
+
+              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
+              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+                  + mentalScore + (double) pageSentenceProc.length()
+                  / (double) 50);
+              f.setSourceURL(item.getUrl());
+              f.fragment = fragment;
+              result.add(f);
+              System.out.println("Accepted sentence: " + pageSentenceProc
+                  + "| with title= " + title);
+              System.out.println("For fragment = " + fragment);
+            } else
+              System.out
+                  .println("Rejected sentence due to wrong area at webpage: "
+                      + pageSentence);
+          } else
+            System.out.println("Rejected sentence due to low score: "
+                + pageSentence);
+          // }
+        } catch (Throwable t) {
+          System.out.println("exception " + t);
+        }
+      }
+    }
+    item.setFragments(result);
+    return item;
+  }
+
+  public static String[] cleanListOfSents(String[] sents) {
+    List<String> sentsClean = new ArrayList<String>();
+    for (String s : sents) {
+      if (s == null || s.trim().length() < 30 || s.length() < 20)
+        continue;
+      sentsClean.add(s);
+    }
+    return (String[]) sentsClean.toArray(new String[0]);
+  }
+
+  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
+      String fragment, String[] sents) {
+    if (fragment.trim().length() < 15)
+      return null;
+
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+    Double dist = 0.0;
+    String result = null, followSent = null;
+    for (int i = 0; i < sents.length; i++) {
+      String s = sents[i];
+      if (s == null || s.length() < 30)
+        continue;
+      Double distCurr = meas.measureStringDistance(s, fragment);
+      if (distCurr > dist && distCurr > 0.4) {
+        result = s;
+        dist = distCurr;
+        if (i < sents.length - 1 && sents[i + 1].length() > 60) {
+          followSent = sents[i + 1];
+        }
+
+      }
+    }
+    return new String[] { result, followSent };
+  }
+
+  /**
+   * Demo driver. Runs the mined-sentence acceptance filter on a sample
+   * sentence (the result is not used), then generates content for a hard-coded
+   * event name and prints the resulting hits twice: as is, and with "[", "]"
+   * and " | " removed. Exceptions are printed as stack traces.
+   *
+   * @param args not used
+   */
+  public static void main(String[] args) {
+    RelatedSentenceFinder f = new RelatedSentenceFinder();
+    // NOTE(review): b is never read afterwards
+    String b = GeneratedSentenceProcessor
+        .acceptableMinedSentence("Earlier this year it was also Quidam that broke international ground "
+            + " by becoming the first Cirque du Soleil show to be seen in the Middle "
+            + " East - in Dubai, United Arab Emirates - now currently playing to sold "
+            + " out audiences. ");
+    /*
+     * System.setProperty("resourcesDirectory",
+     * "C:/workspace/ZSearch/resources_external");
+     * System.setProperty("dragonDirectory", "DRAGON_PATH");
+     * System.setProperty("StanfordNE_resources", "STANFORD_NE_RESOURCES");
+     * System.setProperty("vcb_resources", "VCB_RESOURCES");
+     * System.setProperty("bing_app_id", "BING_APP_ID");
+     * System.setProperty("yahoo_app_id",
+     * "lyzaGJDV34EsJbugCymf7_oosEfMtbSwUBOhDQ8abqgy_Sl2roPjyg72T5.k1NIoyQ--");
+     */
+    List<HitBase> hits = null;
+    try {
+      /*
+       * // uncomment the sentence you would like to serve as a seed sentence
+       * for content generation for an event description hits =
+       * f.findRelatedOpinionsForSentence( //
+       * "Did anyone expect there to be any sort of real change? The system is a whole lot bigger than one person these days"
+       * , //
+       * "Reflection of Neocolonial Aggression and Destruction in Libya. Emergence of New Era of Western Democratic Dictatorship"
+       * , //
+       * "It has been reported by Columbia Broadcasting Systems of the American - Israeli Military Industrial Complex that the International Human Rights Organization has called upon the civilized nations of the Globe to place under arrest George Bush"
+       * , //
+       * "Washington bailed out the crooks who created this disaster, then they carry on with business as usual"
+       * , //
+       * "my comment about Rall's taking out his frustrations in writing because he can't make any money drawing never made it onto this board"
+       * ,
+       * "I like the app, I use it to find fun things to do in my area, it's amazing how much cool stuff it finds for me"
+       * , //
+       * "Cyclo-cross  is a form of bicycle racing. Races typically take place in the autumn and winter laps of a short course featuring pavement, wooded trails, grass, steep hills"
+       * , //
+       * "celebrate mama. Pampering for Mom including free make overs and wine. Free Gift Bags with Goodies and Samples to the first 250 Moms"
+       * , //
+       * "Washington Congress is taking a cautious approach to the massive street protests sweeping Egypt encouraging cries for reform, but wary that a more radical regime in Cairo could damage US interests, including the survival of Israel."
+       * , // "College football Virginia Tech", //
+       * " automatic ways and refereneces if they existed to produce content that increase the ratings bias opinions, counter-attach and compensate the former attacks in SEM and Opinion Mining in an automated way instead of having thousands of grantholders that manually verify the honesty of sites"
+       * , //
+       * "Virginia Tech quarterback Tyrod Taylor enjoyed a magnificent night, throwing three touchdown pas,ses and rushing for another in leading Virginia Tech to the ACC championship with a 44-33 victory over Florida State in the league's title game played in front of 72,379 fans at Bank of America Stadium on Saturday night"
+       * , //
+       * "US banking giant Citigroup has taken over the ownership of EMI, the record label where it was the major creditor"
+       * , //
+       * "Egypt's army vows it will not use force against demonstrators, as the government says it is preparing to open talks with the opposition."
+       * ,
+       * 
+       * //
+       * "If you aren't enrolled in Paperless Statements and think you've received this message in error, please call our Customer Support team immediately, using the phone number on the Contact Us page on Chase Online"
+       * , // "Summer camp fair at french american international school",//
+       * "russian composers paganini", Arrays.asList(new String[] {
+       * 
+       * })); System.out.println(HitBase.toString(hits));
+       * System.out.println(HitBase.toString(hits).replace("[", "").replace("]",
+       * "").replace(" | ", ""));
+       */
+      // uncomment the sentence you would like to serve as a seed sentence for
+      // content generation for an event description
+      hits = f.findActivityDetailsForEventGroupName(
+          "Britney Spears - The Femme Fatale Tour",
+          // "Rush Time Machine",
+          // "amazon webservices summit",
+          // "Blue Man Group" ,
+          // "Belly Dance With Zaharah",
+          // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+          // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+          // "Cyclo-cross  is a form of bicycle racing. Races typically take place in the autumn and winter laps of a short course featuring pavement, wooded trails, grass, steep hills",
+          Arrays.asList(new String[] {})); // no additional seed sentences
+      System.out.println(HitBase.toString(hits));
+      // same output once more with list punctuation stripped
+      System.out.println(HitBase.toString(hits).replace("[", "")
+          .replace("]", "").replace(" | ", ""));
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+/**
+ * A search hit that additionally carries its original rank in the result list
+ * and a taxonomy-based score. The natural ordering puts hits with a higher
+ * taxonomy score first.
+ * <p>
+ * NOTE(review): the superclass is com.zvents.bing.HitBase rather than the
+ * HitBase of this package - confirm that this is intended.
+ */
+public class YahooHit extends com.zvents.bing.HitBase implements
+    Comparable<YahooHit> {
+
+  // position of the hit in the original result list; -1 until set
+  int originalRank = -1;
+
+  // taxonomy-based score used for ordering; 0 until set
+  int taxoScore = 0;
+
+  public YahooHit() {
+  }
+
+  public YahooHit(String orig, String[] generateds) {
+    super(orig, generateds);
+  }
+
+  public int getOriginalRank() {
+    return originalRank;
+  }
+
+  public void setOriginalRank(int originalRank) {
+    this.originalRank = originalRank;
+  }
+
+  public int getTaxoScore() {
+    return taxoScore;
+  }
+
+  public void setTaxoScore(int taxoScore) {
+    this.taxoScore = taxoScore;
+  }
+
+  /**
+   * Strips highlighting markup, ellipses and some punctuation from a search
+   * snippet. The replacements are applied one after another in a fixed order.
+   *
+   * @param snapshot the raw snippet text
+   * @return the cleaned snippet
+   */
+  public String processSnapshotForMatching(String snapshot) {
+    String cleaned = snapshot.replace("<b>...</b>", ". ");
+    cleaned = cleaned.replace("<b>", "");
+    cleaned = cleaned.replace("</b>", "");
+    cleaned = cleaned.replace(". . ", " ");
+    cleaned = cleaned.replace(" . . . ", " ");
+    cleaned = cleaned.replace("...", " ");
+    cleaned = cleaned.replace(",..", " ");
+    cleaned = cleaned.replace("&amp;", " ");
+    cleaned = cleaned.replace("  ", " ");
+    // apostrophes and hyphens become blanks
+    cleaned = cleaned.replace('\'', ' ');
+    cleaned = cleaned.replace('-', ' ');
+    return cleaned;
+  }
+
+  /** Orders hits by descending taxonomy score. */
+  @Override
+  public int compareTo(YahooHit obj) {
+    if (this.taxoScore == obj.taxoScore) {
+      return 0;
+    }
+    return this.taxoScore > obj.taxoScore ? -1 : 1;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHit.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.Comparator;
+
+/**
+ * Orders {@link YahooHit}s by descending score as returned by
+ * {@code getGenerWithQueryScore()}.
+ */
+public class YahooHitComparable implements Comparator<YahooHit> {
+  /**
+   * Compares two hits by score, higher score first.
+   *
+   * @return -1 if {@code o1} scores higher, 1 if it scores lower and 0 if both
+   *         scores are equal
+   */
+  @Override
+  public int compare(YahooHit o1, YahooHit o2) {
+    // Equal scores must yield 0 in both argument orders; comparing object
+    // identity instead made compare(a, b) and compare(b, a) both return 1,
+    // which breaks the Comparator contract relied upon by sorting.
+    if (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore()) {
+      return -1;
+    }
+    if (o1.getGenerWithQueryScore() < o2.getGenerWithQueryScore()) {
+      return 1;
+    }
+    return 0;
+  }
+}
\ No newline at end of file

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooHitComparable.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain