You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC

svn commit: r1181845 [2/5] - in /incubator/opennlp/sandbox/opennlp-similarity: ./ src/main/java/opennlp/tools/similarity/ src/main/java/opennlp/tools/similarity/apps/ src/main/java/opennlp/tools/similarity/apps/utils/ src/main/java/opennlp/tools/textsi...

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class is the superclass of all classes which are using yahoo-websearch
+ * API with JSON.
+ * 
+ */
+public class YahooQueryRunner {
+  protected static final String APP_ID = "XXX";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(YahooQueryRunner.class);
+
+  /**
+   * To run a query on Yahoo, one needs the
+   * 
+   * @param query
+   *          it can be a row text with some pre-processing
+   * @param domainWeb
+   *          some sub-domain if necessary (default "")
+   * @param lang
+   *          language settings
+   * @param numbOfHits
+   * @return
+   * @throws Exception
+   */
+  protected String constructBossUrl(String query, String domainWeb,
+      String lang, int numbOfHits) throws Exception {
+    String _lang = "en";
+
+    String codedQuery = URLEncoder.encode(query, "UTF-8");
+    String yahooRequest = "http://boss.yahooapis.com/ysearch/web" + "/v1/"
+        + codedQuery + "?appid=" + APP_ID + "&count=" + numbOfHits
+        + "&format=json&sites=" + domainWeb + "&lang=" + _lang;
+    return yahooRequest;
+  }
+
+  /**
+   * 
+   * @param query
+   * @param domainWeb
+   * @param numbOfHits
+   *          For more details
+   *          http://developer.yahoo.com/search/image/V1/imageSearch.html
+   * @return
+   * @throws Exception
+   */
+  protected String constructBossImageSearchUrl(String query, String domainWeb,
+      int numbOfHits) throws Exception {
+    String codedQuery = URLEncoder.encode(query, "UTF-8");
+    String yahooRequest = "http://boss.yahooapis.com/ysearch/images/v1/"
+        + codedQuery + "?appid=" + APP_ID + "&count=" + numbOfHits
+        + "&format=json&sites=" + domainWeb;
+    return yahooRequest;
+  }
+
+  public ArrayList<String> search(String query, String domainWeb, String lang,
+      int numbOfHits) throws Exception {
+    URL url = new URL(constructBossUrl(query, domainWeb, lang, numbOfHits));
+    URLConnection connection = url.openConnection();
+
+    String line;
+    ArrayList<String> result = new ArrayList<String>();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        connection.getInputStream()));
+    int count = 0;
+    while ((line = reader.readLine()) != null) {
+      result.add(line);
+      count++;
+    }
+    return result;
+  }
+
+  public ArrayList<String> searchImage(String query, String domainWeb,
+      int numbOfHits) throws Exception {
+    URL url = new URL(constructBossImageSearchUrl(query, domainWeb, numbOfHits));
+    URLConnection connection = url.openConnection();
+
+    String line;
+    ArrayList<String> result = new ArrayList<String>();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        connection.getInputStream()));
+    int count = 0;
+    while ((line = reader.readLine()) != null) {
+      result.add(line);
+      count++;
+    }
+    return result;
+  }
+
+  public YahooResponse populateYahooHit(String response) throws Exception {
+    YahooResponse resp = new YahooResponse();
+    JSONObject rootObject = new JSONObject(response);
+    // each response is object that under the key of "ysearchresponse"
+    JSONObject responseObject = rootObject.getJSONObject("ysearchresponse");
+    try {
+      resp.setResponseCode(responseObject.getInt("responsecode"));
+      resp.setNextPageUrl(responseObject.getString("nextpage"));
+      resp.setTotalHits(responseObject.getInt("totalhits"));
+      resp.setDeepHits(responseObject.getInt("deephits"));
+      resp.setStartIndex(responseObject.getInt("start"));
+      resp.setPageSize(responseObject.getInt("count"));
+    } catch (Exception e) {
+      LOG.error("Reduced number of original results");
+    }
+
+    // the search result is in an array under the name of "resultset_web"
+    JSONArray resultSet = null;
+    try {
+      resultSet = responseObject.getJSONArray("resultset_web");
+    } catch (Exception e) {
+      System.err.print("\n!!!!!");
+      LOG.error("\nNo search results", e);
+      resultSet = null;
+    }
+    if (resultSet != null) {
+      for (int i = 0; i < resultSet.length(); i++) {
+        HitBase hit = new HitBase();
+        JSONObject singleResult = resultSet.getJSONObject(i);
+        hit.setAbstractText(singleResult.getString("abstract"));
+        hit.setClickUrl(singleResult.getString("clickurl"));
+        hit.setDisplayUrl(singleResult.getString("dispurl"));
+        hit.setUrl(singleResult.getString("url"));
+        hit.setDate(singleResult.getString("date"));
+        hit.setTitle(singleResult.getString("title"));
+
+        resp.appendHits(hit);
+      }
+    } else {
+      return null;
+    }
+    return resp;
+  }
+
+  protected void printSearchResult(String response) throws Exception {
+    JSONObject rootObject = new JSONObject(response);
+    // each response is object that under the key of "ysearchresponse"
+    JSONObject responseObject = rootObject.getJSONObject("ysearchresponse");
+    // printResponseAttributes(responseObject);
+
+    // the search result is in an array under the name of "resultset_web"
+    JSONArray resultSet = responseObject.getJSONArray("resultset_web");
+    System.out.println("Search Result:");
+    System.out.println("---------------------------");
+    for (int i = 0; i < resultSet.length(); i++) {
+      printSingleSearchResult(resultSet.getJSONObject(i));
+    }
+  }
+
+  protected void printResponseAttributes(JSONObject responseObject)
+      throws Exception {
+    // the response object has a few top level attributes
+    int responseCode = responseObject.getInt("responsecode");
+    String nextPageUrl = responseObject.getString("nextpage");
+    int totalHits = responseObject.getInt("totalhits");
+    int deepHits = responseObject.getInt("deephits");
+    int startIndex = responseObject.getInt("start");
+    int pageSize = responseObject.getInt("count");
+
+    System.out.println("responseCode = " + responseCode + ", totalHits = "
+        + totalHits + ", deepHits = " + deepHits + ", startIndex = "
+        + startIndex + ", pageSize = " + pageSize);
+    System.out.println("nextPageUrl = " + nextPageUrl);
+  }
+
+  protected void printSingleSearchResult(JSONObject singleResult)
+      throws Exception {
+    // each single search result has a few attributes
+    String abstractText = singleResult.getString("abstract");
+    String clickUrl = singleResult.getString("clickurl");
+    String displayUrl = singleResult.getString("dispurl");
+    String url = singleResult.getString("url");
+    String date = singleResult.getString("date");
+
+    // System.out.println("URL = " + url + ", date = " + date);
+    System.out.println("Abstract = " + abstractText);
+    // System.out.println("Display URL = " + displayUrl);
+    // System.out.println("Click URL = " + clickUrl);
+    System.out.println("---------------------------");
+  }
+
+  public List<HitBase> runSearch(String query) {
+    YahooResponse resp = null;
+    try {
+
+      List<String> resultList = search(query, "", "en", 30);
+      LOG.info(query);
+      if (resultList.size() != 0) {
+        resp = populateYahooHit(resultList.get(0));
+      } else {
+        LOG.info("Fikamika " + query);
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    if (resp != null) {
+      List<HitBase> hits = new ArrayList<HitBase>();
+      for (HitBase h : resp.getHits())
+        hits.add((HitBase) h);
+
+      hits = HitBase.removeDuplicates(hits);
+      return hits;
+    } else {
+      return null;
+    }
+
+  }
+
+  public List<HitBase> runSearchInDomain(String domain) {
+    YahooResponse resp = null;
+    try {
+
+      List<String> resultList = search("the", domain, "en", 30);
+
+      if (resultList.size() != 0) {
+        resp = populateYahooHit(resultList.get(0));
+      } else {
+        LOG.info("No search results in domain " + domain);
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    if (resp != null) {
+      List<HitBase> hits = new ArrayList<HitBase>();
+      for (HitBase h : resp.getHits())
+        hits.add((HitBase) h);
+
+      hits = HitBase.removeDuplicates(hits);
+      return hits;
+    } else {
+      return null;
+    }
+
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Search response that carries, in addition to the header attributes of
+ * {@link YahooResponseBase}, the list of hits it contains.
+ */
+public class YahooResponse extends YahooResponseBase {
+  /** Hits in the order they were appended; starts out empty. */
+  private List<HitBase> hits = new ArrayList<HitBase>();
+
+  /** Creates a response without any hits. */
+  public YahooResponse() {
+  }
+
+  /** Adds {@code hit} at the end of the hit list. */
+  public void appendHits(HitBase hit) {
+    this.hits.add(hit);
+  }
+
+  /** Replaces the hit list with {@code hits}; the list is stored as given. */
+  public void setHits(List<HitBase> hits) {
+    this.hits = hits;
+  }
+
+  /** Returns the internal hit list itself, not a copy. */
+  public List<HitBase> getHits() {
+    return this.hits;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+/**
+ * Mutable holder for the header attributes of a search response. All numeric
+ * attributes start at 0 and the next-page URL starts as {@code null}.
+ */
+public class YahooResponseBase {
+  private int responseCode;
+  private String nextPageUrl;
+  private int totalHits;
+  private int deepHits;
+  private int startIndex;
+  private int pageSize;
+
+  /** @return the stored response code */
+  public int getResponseCode() {
+    return this.responseCode;
+  }
+
+  /** @param responseCode the response code to store */
+  public void setResponseCode(int responseCode) {
+    this.responseCode = responseCode;
+  }
+
+  /** @return the stored next-page URL, possibly {@code null} */
+  public String getNextPageUrl() {
+    return this.nextPageUrl;
+  }
+
+  /** @param nextPageUrl the next-page URL to store */
+  public void setNextPageUrl(String nextPageUrl) {
+    this.nextPageUrl = nextPageUrl;
+  }
+
+  /** @return the stored total hit count */
+  public int getTotalHits() {
+    return this.totalHits;
+  }
+
+  /** @param totalHits the total hit count to store */
+  public void setTotalHits(int totalHits) {
+    this.totalHits = totalHits;
+  }
+
+  /** @return the stored deep hit count */
+  public int getDeepHits() {
+    return this.deepHits;
+  }
+
+  /** @param deepHits the deep hit count to store */
+  public void setDeepHits(int deepHits) {
+    this.deepHits = deepHits;
+  }
+
+  /** @return the stored start index */
+  public int getStartIndex() {
+    return this.startIndex;
+  }
+
+  /** @param startIndex the start index to store */
+  public void setStartIndex(int startIndex) {
+    this.startIndex = startIndex;
+  }
+
+  /** @return the stored page size */
+  public int getPageSize() {
+    return this.pageSize;
+  }
+
+  /** @param pageSize the page size to store */
+  public void setPageSize(int pageSize) {
+    this.pageSize = pageSize;
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class CountItemsList<E> extends ArrayList<E> {
+
+  /**
+	 * 
+	 */
+  private static final long serialVersionUID = 1L;
+
+  // This is private. It is not visible from outside.
+  private Map<E, Integer> count = new HashMap<E, Integer>();
+
+  // There are several entry points to this class
+  // this is just to show one of them.
+  public boolean add(E element) {
+    if (!count.containsKey(element)) {
+      count.put(element, 1);
+    } else {
+      count.put(element, count.get(element) + 1);
+    }
+    return super.add(element);
+  }
+
+  // This method belongs to CountItemList interface ( or class )
+  // to used you have to cast.
+  public int getCount(E element) {
+    if (!count.containsKey(element)) {
+      return 0;
+    }
+    return count.get(element);
+  }
+
+  public List<E> getFrequentTags() {
+    Map<E, Integer> sortedMap = ValueSortMap.sortMapByValue(count, false);
+    List<E> vals = new ArrayList<E>(sortedMap.keySet());
+    if (vals.size() > 3) {
+      vals = vals.subList(0, 3);
+    }
+    return vals;
+  }
+
+}
\ No newline at end of file

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+/**
+ * Static helpers that compare strings by weighted Levenshtein (edit)
+ * distance, where edits touching digits may cost more than edits touching
+ * other characters. None of the methods accept {@code null} arguments.
+ */
+public class LevensteinDistanceFinder {
+
+  /**
+   * Returns a length-normalised distance between two strings.
+   *
+   * @return 0 when one string is a prefix of the other (including equality
+   *         and the empty string); otherwise the weighted distance (letter
+   *         edits cost 1, digit edits cost 10) divided by
+   *         {@code str1.length() + 1 + str2.length()}
+   */
+  public static double matchLevensteinDistance(String str1, String str2) {
+    if (str2.startsWith(str1) || str1.startsWith(str2)) {
+      return 0;
+    }
+
+    return levensteinDistance(str1, str2, 1, 10, 1, 10)
+        / (str1.length() + 1 + str2.length());
+  }
+
+  /**
+   * Computes the weighted Levenshtein distance between two strings with the
+   * classic dynamic-programming recurrence, keeping only two rows of the
+   * distance matrix. Time complexity - O(length1 * length2), memory -
+   * O(length2).
+   *
+   * @param letterInsDelCost
+   *          cost of inserting or deleting a non-digit character
+   * @param digitInsDelCost
+   *          cost of inserting or deleting a digit
+   * @param letterReplaceCost
+   *          cost of replacing one non-digit character by another
+   * @param digitReplaceCost
+   *          cost of a replacement in which either character is a digit
+   * @return the minimal total cost of turning {@code str1} into {@code str2}
+   */
+  public static double levensteinDistance(String str1, String str2,
+      int letterInsDelCost, int digitInsDelCost, int letterReplaceCost,
+      int digitReplaceCost) {
+    int length1 = str1.length();
+    int length2 = str2.length();
+
+    // previous[j]: cost of turning the first i-1 chars of str1 into the first
+    // j chars of str2; current[j]: the same for the first i chars of str1.
+    int[] previous = new int[length2 + 1];
+    int[] current = new int[length2 + 1];
+
+    // Row 0: build each prefix of str2 from the empty string by insertions.
+    for (int j = 1; j <= length2; j++) {
+      previous[j] = previous[j - 1]
+          + insDelCost(str2.charAt(j - 1), letterInsDelCost, digitInsDelCost);
+    }
+
+    for (int i = 1; i <= length1; i++) {
+      char c1 = str1.charAt(i - 1);
+      int deleteCost = insDelCost(c1, letterInsDelCost, digitInsDelCost);
+      // Column 0: erase the first i chars of str1.
+      current[0] = previous[0] + deleteCost;
+
+      for (int j = 1; j <= length2; j++) {
+        char c2 = str2.charAt(j - 1);
+
+        int replace = previous[j - 1];
+        if (c1 != c2) {
+          boolean digitInvolved = Character.isDigit(c1)
+              || Character.isDigit(c2);
+          replace += digitInvolved ? digitReplaceCost : letterReplaceCost;
+        }
+        // deleting consumes a char of str1, inserting produces a char of str2
+        int delete = previous[j] + deleteCost;
+        int insert = current[j - 1]
+            + insDelCost(c2, letterInsDelCost, digitInsDelCost);
+
+        current[j] = Math.min(replace, Math.min(delete, insert));
+      }
+
+      // the row just filled becomes the "previous" row of the next iteration
+      int[] swap = previous;
+      previous = current;
+      current = swap;
+    }
+    return previous[length2];
+  }
+
+  /**
+   * Compares two space-separated token sequences. Tokens are paired greedily,
+   * each token being used at most once, and the share of unpaired tokens is
+   * scaled into the result.
+   *
+   * NOTE(review): a pair is accepted when its unit-cost edit distance divided
+   * by the total number of tokens (not by the word lengths) is below 0.2;
+   * this is kept from the original - confirm the normalisation is intended.
+   *
+   * @return -1 if either line splits into no tokens (e.g. it consists only of
+   *         spaces); otherwise the unpaired share divided by 10 when all
+   *         tokens of at least one line were paired, else divided by 4
+   */
+  public static double distanceBetweenStringArraysAsSpaceSepar(String line1,
+      String line2) {
+    String[] strings1 = line1.split(" ");
+    String[] strings2 = line2.split(" ");
+    if (strings1.length == 0 || strings2.length == 0) {
+      return -1;
+    }
+    int totalTokens = strings1.length + strings2.length;
+    boolean[] selected2 = new boolean[strings2.length];
+    int intersectNum = 0;
+    for (int i = 0; i < strings1.length; i++) {
+      for (int j = 0; j < strings2.length; j++) {
+        if (selected2[j]) {
+          continue;
+        }
+        if (levensteinDistance(strings1[i], strings2[j], 1, 1, 1, 1)
+            / totalTokens < 0.2) {
+          intersectNum++;
+          selected2[j] = true;
+          // strings1[i] is paired now; move on to the next token of line1
+          break;
+        }
+      }
+    }
+    double unmatchedShare = ((double) (totalTokens - 2 * intersectNum))
+        / totalTokens;
+    if (strings1.length == intersectNum || strings2.length == intersectNum) {
+      return unmatchedShare / 10;
+    } else {
+      return unmatchedShare / 4;
+    }
+  }
+
+  /** Cost of inserting or deleting {@code c}: the digit cost for digits. */
+  private static int insDelCost(char c, int letterInsDelCost,
+      int digitInsDelCost) {
+    return Character.isDigit(c) ? digitInsDelCost : letterInsDelCost;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Component;
+
+@Component
+public class PageFetcher {
+  private static final Logger LOG = LoggerFactory.getLogger(PageFetcher.class);
+
+  private static int DEFAULT_TIMEOUT = 15000;
+
+  public String fetchPage(final String url) {
+    return fetchPage(url, DEFAULT_TIMEOUT);
+  }
+
+  public String fetchPage(final String url, final int timeout) {
+    String fetchURL = addHttp(url);
+
+    LOG.info("fetch url " + fetchURL);
+
+    String pageContent = null;
+    URLConnection connection;
+    try {
+      connection = new URL(url).openConnection();
+      connection.setReadTimeout(DEFAULT_TIMEOUT);
+      Tika tika = new Tika();
+      pageContent = tika.parseToString(connection.getInputStream())
+          .replace('\n', ' ').replace('\t', ' ');
+    } catch (MalformedURLException e) {
+      LOG.error(e.getMessage(), e);
+    } catch (IOException e) {
+      LOG.error(e.getMessage(), e);
+    } catch (TikaException e) {
+      LOG.error(e.getMessage(), e);
+    }
+    return pageContent;
+  }
+
+  private String addHttp(final String url) {
+    if (!url.startsWith("http://")) {
+      return "http://" + url;
+    }
+    return url;
+  }
+
+  public String fetchOrigHTML(String url) {
+    System.out.println("fetch url " + url);
+    String pageContent = null;
+    StringBuffer buf = new StringBuffer();
+    try {
+      URLConnection connection = new URL(url).openConnection();
+      connection.setReadTimeout(10000);
+      connection
+          .setRequestProperty(
+              "User-Agent",
+              "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
+      String line;
+      BufferedReader reader = null;
+      try {
+        reader = new BufferedReader(new InputStreamReader(
+            connection.getInputStream()));
+      } catch (Exception e) {
+        // we dont need to log trial web pages if access fails
+        // LOG.error(e.getMessage(), e);
+      }
+
+      while ((line = reader.readLine()) != null) {
+        buf.append(line);
+      }
+
+    }
+    // normal case when a hypothetical page does not exist
+    catch (Exception e) {
+
+      // LOG.error(e.getMessage(), e);
+      // System.err.println("error fetching url " + url);
+    }
+    try {
+      Thread.sleep(50); // do nothing 4 sec
+    } catch (InterruptedException e) {
+      e.printStackTrace();
+    }
+    return buf.toString();
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+/**
+ * Mutable generic container for two values of independent types, typically
+ * used to return two results from one method.
+ * 
+ * @author Albert-Jan de Vries
+ * 
+ * @param <T1>
+ *          type of the first value
+ * @param <T2>
+ *          type of the second value
+ */
+public class Pair<T1, T2> {
+  private T1 first;
+  private T2 second;
+
+  /** Creates a pair whose two values are both {@code null}. */
+  public Pair() {
+  }
+
+  /** Creates a pair holding the given two values. */
+  public Pair(T1 first, T2 second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  /** @return the first value, possibly {@code null} */
+  public T1 getFirst() {
+    return this.first;
+  }
+
+  /** @return the second value, possibly {@code null} */
+  public T2 getSecond() {
+    return this.second;
+  }
+
+  /** @param first the new first value */
+  public void setFirst(T1 first) {
+    this.first = first;
+  }
+
+  /** @param second the new second value */
+  public void setSecond(T2 second) {
+    this.second = second;
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+/**
+ * Porter suffix-stripping stemmer for lower-case English words. The measure
+ * never counts the letter 'y' as a vowel while
+ * {@link #containsVowel(String)} always does, which simplifies the published
+ * definition.
+ */
+public class PorterStemmer {
+
+  /** Step 2 rules as {suffix, replacement}; applied when the stem has m > 0. */
+  private static final String[][] STEP2_RULES = {
+      { "ational", "ate" }, { "tional", "tion" }, { "enci", "ence" },
+      { "anci", "ance" }, { "izer", "ize" }, { "abli", "able" },
+      { "alli", "al" }, { "entli", "ent" }, { "eli", "e" },
+      { "ousli", "ous" }, { "ization", "ize" }, { "ation", "ate" },
+      { "ator", "ate" }, { "alism", "al" }, { "iveness", "ive" },
+      { "fulness", "ful" }, { "ousness", "ous" }, { "aliti", "al" },
+      { "iviti", "ive" }, { "biliti", "ble" } };
+
+  /** Step 3 rules as {suffix, replacement}; applied when the stem has m > 0. */
+  private static final String[][] STEP3_RULES = {
+      { "icate", "ic" }, { "ative", "" }, { "alize", "al" }, { "iciti", "ic" },
+      { "ical", "ic" }, { "ful", "" }, { "ness", "" } };
+
+  /**
+   * Step 4 suffixes, removed when the stem has m > 1. Longer suffixes precede
+   * the shorter ones they contain ("ement", "ment", "ent").
+   */
+  private static final String[] STEP4_SUFFIXES = { "al", "ance", "ence", "er",
+      "ic", "able", "ible", "ant", "ement", "ment", "ent", "ion", "ou", "ism",
+      "ate", "iti", "ous", "ive", "ize" };
+
+  /**
+   * Reduces a word to its stem by applying steps 1a to 5b in order.
+   * 
+   * @param str word to stem; the steps only recognise lower-case suffixes
+   * @return the stem, or the marker text "No term entered" for an empty string
+   *         and "Invalid term" when any character is not a letter
+   */
+  public String stem(String str) {
+    if (str.length() == 0) {
+      return "No term entered";
+    }
+    for (int i = 0; i < str.length(); i++) {
+      if (!Character.isLetter(str.charAt(i))) {
+        return "Invalid term";
+      }
+    }
+    String word = step1c(step1b(step1a(str)));
+    word = step4(step3(step2(word)));
+    return step5b(step5a(word));
+  }
+
+  /** Step 1a: plural endings (SSES -> SS, IES -> I, SS -> SS, S -> ""). */
+  protected String step1a(String str) {
+    if (str.endsWith("sses") || str.endsWith("ies")) {
+      return str.substring(0, str.length() - 2);
+    }
+    if (str.endsWith("s") && !str.endsWith("ss")) {
+      return str.substring(0, str.length() - 1);
+    }
+    return str;
+  }
+
+  /**
+   * Step 1b: (m > 0) EED -> EE; otherwise strips ED or ING when the remaining
+   * stem contains a vowel and tidies the result with {@link #step1b2(String)}.
+   */
+  protected String step1b(String str) {
+    if (str.endsWith("eed")) {
+      // Longest match wins: an EED word is never treated as an ED word.
+      String stem = str.substring(0, str.length() - 3);
+      return stringMeasure(stem) > 0 ? str.substring(0, str.length() - 1) : str;
+    }
+    for (String suffix : new String[] { "ed", "ing" }) {
+      if (str.endsWith(suffix)) {
+        String stem = str.substring(0, str.length() - suffix.length());
+        if (containsVowel(stem)) {
+          return step1b2(stem);
+        }
+      }
+    }
+    return str;
+  }
+
+  /**
+   * Clean-up after step 1b: AT -> ATE, BL -> BLE, IZ -> IZE; a double consonant
+   * other than L, S or Z loses its last letter; a stem with m = 1 ending
+   * consonant-vowel-consonant gains an E.
+   */
+  protected String step1b2(String str) {
+    if (str.endsWith("at") || str.endsWith("bl") || str.endsWith("iz")) {
+      return str + "e";
+    }
+    if (endsWithDoubleConsonent(str)
+        && !(str.endsWith("l") || str.endsWith("s") || str.endsWith("z"))) {
+      return str.substring(0, str.length() - 1);
+    }
+    if (stringMeasure(str) == 1 && endsWithCVC(str)) {
+      return str + "e";
+    }
+    return str;
+  }
+
+  /** Step 1c: a final Y becomes I when the rest of the word has a vowel. */
+  protected String step1c(String str) {
+    if (str.endsWith("y")) {
+      String stem = str.substring(0, str.length() - 1);
+      if (containsVowel(stem)) {
+        return stem + "i";
+      }
+    }
+    return str;
+  }
+
+  /** Step 2: maps double suffixes to single ones, e.g. ATIONAL -> ATE. */
+  protected String step2(String str) {
+    return applyFirstMatchingRule(str, STEP2_RULES);
+  }
+
+  /** Step 3: simplifies or drops suffixes such as ICATE, ATIVE and NESS. */
+  protected String step3(String str) {
+    return applyFirstMatchingRule(str, STEP3_RULES);
+  }
+
+  /**
+   * Step 4: removes a residual suffix when the stem has m > 1. ION is removed
+   * only when the stem ends in S or T.
+   */
+  protected String step4(String str) {
+    for (String suffix : STEP4_SUFFIXES) {
+      if (str.endsWith(suffix)) {
+        String stem = str.substring(0, str.length() - suffix.length());
+        boolean allowed = !"ion".equals(suffix) || stem.endsWith("s")
+            || stem.endsWith("t");
+        // Only the first (longest) matching suffix is considered.
+        return allowed && stringMeasure(stem) > 1 ? stem : str;
+      }
+    }
+    return str;
+  }
+
+  /**
+   * Step 5a: drops a final E when the stem has m > 1, or m = 1 and does not
+   * end consonant-vowel-consonant.
+   */
+  protected String step5a(String str) {
+    // Test the suffix first so that an empty word is never sliced.
+    if (!str.endsWith("e")) {
+      return str;
+    }
+    String stem = str.substring(0, str.length() - 1);
+    int measure = stringMeasure(stem);
+    if (measure > 1 || (measure == 1 && !endsWithCVC(stem))) {
+      return stem;
+    }
+    return str;
+  }
+
+  /** Step 5b: reduces a final double L to a single L when m > 1. */
+  protected String step5b(String str) {
+    if (str.endsWith("l") && endsWithDoubleConsonent(str)
+        && stringMeasure(str.substring(0, str.length() - 1)) > 1) {
+      return str.substring(0, str.length() - 1);
+    }
+    return str;
+  }
+
+  /**
+   * Applies the first rule whose suffix ends the word. The replacement is made
+   * only if the stem in front of the suffix has m > 0; once a suffix has
+   * matched no further rule is tried.
+   */
+  private String applyFirstMatchingRule(String str, String[][] rules) {
+    for (String[] rule : rules) {
+      if (str.endsWith(rule[0])) {
+        String stem = str.substring(0, str.length() - rule[0].length());
+        return stringMeasure(stem) > 0 ? stem + rule[1] : str;
+      }
+    }
+    return str;
+  }
+
+  /** Tells whether the string ends with 's'. */
+  protected boolean endsWithS(String str) {
+    return str.endsWith("s");
+  }
+
+  /** Tells whether the string contains a, e, i, o, u or y. */
+  protected boolean containsVowel(String str) {
+    for (int i = 0; i < str.length(); i++) {
+      if (isVowel(str.charAt(i))) {
+        return true;
+      }
+    }
+    // no aeiou: any 'y' is accepted as a vowel instead
+    return str.indexOf('y') > -1;
+  }
+
+  /** Tells whether the character is one of a, e, i, o, u (lower case). */
+  public boolean isVowel(char c) {
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+
+  /**
+   * Tells whether the string ends with the same consonant twice. Strings
+   * shorter than two characters never do.
+   */
+  protected boolean endsWithDoubleConsonent(String str) {
+    int length = str.length();
+    if (length < 2 || str.charAt(length - 1) != str.charAt(length - 2)) {
+      return false;
+    }
+    return !containsVowel(str.substring(length - 2));
+  }
+
+  /**
+   * Returns the measure m of the string: the number of times a run of vowels
+   * (a, e, i, o, u) is followed by another character.
+   */
+  protected int stringMeasure(String str) {
+    int count = 0;
+    boolean vowelSeen = false;
+    for (int i = 0; i < str.length(); i++) {
+      if (isVowel(str.charAt(i))) {
+        vowelSeen = true;
+      } else if (vowelSeen) {
+        count++;
+        vowelSeen = false;
+      }
+    }
+    return count;
+  }
+
+  /**
+   * Tells whether the string ends consonant-vowel-consonant where the final
+   * consonant is not w, x or y.
+   */
+  protected boolean endsWithCVC(String str) {
+    int length = str.length();
+    if (length < 3) {
+      return false;
+    }
+    char last = str.charAt(length - 1);
+    if (last == 'w' || last == 'x' || last == 'y' || isVowel(last)) {
+      return false;
+    }
+    return isVowel(str.charAt(length - 2)) && !isVowel(str.charAt(length - 3));
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+public class StringCleaner {
+
+  /** Literal replacements applied in order: {text to find, replacement}. */
+  private static final String[][] SNAPSHOT_REPLACEMENTS = {
+      { "<b>...</b>", ". " }, { "<b>", "" }, { "</b>", "" }, { ". . ", " " },
+      { " . . . ", " " }, { "...", " " }, { ",..", " " }, { "&amp;", " " },
+      { "\"", " " }, { "  ", " " }, { "'", " " }, { "-", " " } };
+
+  /**
+   * Removes bold tags, ellipses, the ampersand entity, quotes, apostrophes and
+   * hyphens from a text snapshot before it is matched.
+   * 
+   * @param snapshot text to clean
+   * @return the text after every replacement has been applied once, in order
+   */
+  public static String processSnapshotForMatching(String snapshot) {
+    String cleaned = snapshot;
+    for (String[] replacement : SNAPSHOT_REPLACEMENTS) {
+      cleaned = cleaned.replace(replacement[0], replacement[1]);
+    }
+    return cleaned;
+  }
+}
\ No newline at end of file

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class StringDistanceMeasurer {
+  // external tools
+  private PorterStemmer ps; // stemmer
+
+  private static final int MIN_STRING_LENGTH_FOR_WORD = 4;
+
+  protected int MIN_STRING_LENGTH_FOR_DISTORTED_WORD = 6;
+
+  protected static final int ACCEPTABLE_DEVIATION_IN_CHAR = 2;
+
+  // threshold for the linguistic match that is still to be developed;
+  // currently unused
+  private static final double MIN_SCORE_FOR_LING = 100; // 0.7;
+
+  public StringDistanceMeasurer() {
+    ps = new PorterStemmer();
+  }
+
+  /**
+   * Selects and stems the words worth comparing. Non-integer words shorter
+   * than MIN_STRING_LENGTH_FOR_WORD are skipped; the rest are lower-cased and
+   * stemmed, and results starting with "Invalid" (the stemmer's marker for a
+   * term it rejects) are skipped as well.
+   */
+  protected List<String> filterWordArray(String[] strWords) {
+    List<String> strList = new ArrayList<String>();
+    for (String w : strWords) {
+      if (isShortNonInteger(w)) {
+        continue;
+      }
+      try {
+        w = ps.stem(w.toLowerCase());
+      } catch (RuntimeException ignored) {
+        // best effort: keep the original term when the stemmer fails
+      }
+      if (w.startsWith("Invalid")) {
+        continue;
+      }
+      strList.add(w);
+    }
+    return strList;
+  }
+
+  /**
+   * Skips short non-integer words like {@link #filterWordArray(String[])} and
+   * lower-cases the remaining words without stemming them.
+   */
+  protected List<String> filterWordArrayNoStem(String[] strWords) {
+    List<String> strList = new ArrayList<String>();
+    for (String w : strWords) {
+      if (!isShortNonInteger(w)) {
+        strList.add(w.toLowerCase());
+      }
+    }
+    return strList;
+  }
+
+  // true for a word that is shorter than MIN_STRING_LENGTH_FOR_WORD and is
+  // not an integer, e.g. a preposition
+  private static boolean isShortNonInteger(String w) {
+    if (w.length() >= MIN_STRING_LENGTH_FOR_WORD) {
+      return false;
+    }
+    try {
+      Integer.parseInt(w);
+      return false;
+    } catch (NumberFormatException e) {
+      return true;
+    }
+  }
+
+  /**
+   * Main entry point: scores how similar two texts are from the stemmed words
+   * they share.
+   * 
+   * @return a score from 0 to 1 (1.0 when the cleaned texts are equal), or
+   *         -1.0 when the comparison fails with an exception
+   */
+  public double measureStringDistance(String str1, String str2) {
+    return scoreSimilarity(str1, str2, true);
+  }
+
+  /**
+   * Variant of {@link #measureStringDistance(String, String)} that compares
+   * lower-cased words without stemming them.
+   */
+  public double measureStringDistanceNoStemming(String str1, String str2) {
+    return scoreSimilarity(str1, str2, false);
+  }
+
+  /**
+   * Shared implementation of both measures. The score is
+   * (overlap^2 / (l1 * l2))^0.4 capped at 1, where overlap counts the words
+   * both texts share plus near matches among long or capitalised words.
+   * 
+   * TODO: the planned linguistic match would blend in a parse tree score as
+   * result * 0.7 + linguisticScore / 6.0 * 0.3 once result exceeds
+   * MIN_SCORE_FOR_LING.
+   */
+  private double scoreSimilarity(String str1, String str2, boolean stem) {
+    try {
+      String text1 = StringCleaner.processSnapshotForMatching(str1);
+      String text2 = StringCleaner.processSnapshotForMatching(str2);
+      if (text1.equals(text2)) {
+        return 1.0;
+      }
+
+      String[] words1 = text1.split(" ");
+      String[] words2 = text2.split(" ");
+      List<String> list1 = stem ? filterWordArray(words1)
+          : filterWordArrayNoStem(words1);
+      List<String> list2 = stem ? filterWordArray(words2)
+          : filterWordArrayNoStem(words2);
+
+      // use the raw word count when filtering leaves fewer than two words
+      int l1 = list1.size() < 2 ? words1.length : list1.size();
+      int l2 = list2.size() < 2 ? words2.length : list2.size();
+      if (l1 == 0 || l2 == 0) {
+        // a text without any word shares nothing; avoids dividing by zero
+        return 0.0;
+      }
+
+      List<String> overlap = new ArrayList<String>(list1);
+      overlap.retainAll(list2);
+      int lOverlap = 0;
+      for (String w : overlap) {
+        // a word that is not all lower case counts double
+        lOverlap += isLowerCase(w) ? 1 : 2;
+      }
+
+      // look for similar words among the rest which are long or upper case
+      list1.removeAll(overlap);
+      list2.removeAll(overlap);
+      for (String w1 : list1) {
+        if (!isDistinctive(w1)) {
+          continue;
+        }
+        for (String w2 : list2) {
+          if (isDistinctive(w2)
+              && LevensteinDistanceFinder.levensteinDistance(w1, w2, 1, 10, 1,
+                  10) <= ACCEPTABLE_DEVIATION_IN_CHAR) {
+            lOverlap++;
+          }
+        }
+      }
+
+      double score = Math.pow((double) (lOverlap * lOverlap) / l1 / l2, 0.4);
+      return score > 1 ? 1.0 : score;
+    } catch (Exception e) {
+      e.printStackTrace();
+      return -1.0;
+    }
+  }
+
+  private static boolean isLowerCase(String w) {
+    return w.toLowerCase().equals(w);
+  }
+
+  // long or not entirely lower-case words are worth a fuzzy comparison
+  private boolean isDistinctive(String w) {
+    return w.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD
+        || !isLowerCase(w);
+  }
+
+  public static void main(String[] args) {
+    StringDistanceMeasurer meas = new StringDistanceMeasurer();
+
+    // reduction of announcement
+    String sent2a = "Tomarow come check us out if your in the area show starts at 6:00pm "
+        + "2404 E. La Palma Anaheim, California 92806 Cost:$3";
+    String sent2b = "Tomorrow you can check us if you area will show start at 6 pm "
+        + "2404 East La Palma Anaheim, $3";
+    // common sub-chunk = [VBZ-starts IN-at NNP-* NNP-* ]
+
+    // original posting and its yahoo.com search snapshot
+    String sent4a = "Fliers may have to wait years for new liquid screening equipment";
+    String sent4b = "for screening checkpoints and equipment; improving ... "
+        + "Wait times are not just a problem at large airports";
+
+    // slang and search snapshot
+    String sent5a = "hell yea i stay in california. and hell no lol LA sucks hella bad, "
+        + "i lived there for a while and hated it sooo much, so boring! ";
+
+    String sent5b = "My life is so boring without Tree Hill and the OC. America is sooo "
+        + "racist I LOVE YOU SO MUCH. TO everyone that has hurt me...no one in the ..... Yeah sucks I know";
+
+    String sent6a = "I think its gonna be in the east coast as well. California is pretty but way "
+        + "to close to LA and helicopters are gonna ruin it";
+    String sent6b = "could be in east coast as well. California is pretty but way "
+        + "to close to LA and choppers will ruin it";
+    // common sub-chunk = [JJ-east NN-coast ]
+
+    String sent7a = "Iran nuke document called 'alarming'. Their Program started in the 50s with our help!";
+    String sent7b = "nuke project of Iran is alarming' Program started in 1950s with our help";
+    // common sub-chunk = [VBD-started IN-in NNS-50s IN-with PRP$-our NN-help ]
+
+    // News title for the same event
+    String sent8a = "Pakistan slaps travel ban on defence minister";
+    String sent8b = "Pakistan corruption fall-out threatens stability";
+    String sent8c = "Pakistan defence minister 'barred from leaving country'";
+    String sent8d = "Pakistani defence minister banned from travel";
+    // same text as sent8d, to check the case of 1.0
+    String sent8dd = "Pakistani defence minister banned from travel";
+
+    // common sub-chunk = [NN-defence NN-minister ]
+
+    List<Double> matchRes = new ArrayList<Double>();
+    matchRes.add(meas.measureStringDistance(sent2a, sent2b));
+    matchRes.add(meas.measureStringDistance(sent4a, sent4b));
+    matchRes.add(meas.measureStringDistance(sent5a, sent5b));
+    matchRes.add(meas.measureStringDistance(sent6a, sent6b));
+    matchRes.add(meas.measureStringDistance(sent7a, sent7b));
+
+    System.out.println(matchRes);
+    // [0.8178702752867737, 0.21082473737065027, 0.27594593229224296,
+    // 0.7517586466500455, 0.9100766715907641]
+
+    matchRes = new ArrayList<Double>();
+    matchRes.add(meas.measureStringDistance(sent8a, sent8b));
+    matchRes.add(meas.measureStringDistance(sent8a, sent8c));
+    matchRes.add(meas.measureStringDistance(sent8a, sent8d));
+    matchRes.add(meas.measureStringDistance(sent8b, sent8c));
+    matchRes.add(meas.measureStringDistance(sent8b, sent8d));
+    matchRes.add(meas.measureStringDistance(sent8c, sent8d));
+
+    System.out.println(matchRes);
+    // [0.48044977359257246, 0.8365116420730185, 0.8365116420730185,
+    // 0.48044977359257246, 0.27594593229224296,
+    // 0.6391010941257969]
+
+    matchRes = new ArrayList<Double>();
+    // to verify that the same sentence gives 1
+    matchRes.add(meas.measureStringDistance(sent8dd, sent8d));
+    // to verify that totally different sentences give 0
+    matchRes.add(meas.measureStringDistance(sent2a, sent8d));
+
+    System.out.println("Now testing 1 and 0: \n" + matchRes);
+    // Now testing 1 and 0:
+    // [1.0, 0.0]
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain