You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC
svn commit: r1181845 [2/5] - in
/incubator/opennlp/sandbox/opennlp-similarity: ./
src/main/java/opennlp/tools/similarity/
src/main/java/opennlp/tools/similarity/apps/
src/main/java/opennlp/tools/similarity/apps/utils/
src/main/java/opennlp/tools/textsi...
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class is the superclass of all classes which are using yahoo-websearch
+ * API with JSON.
+ *
+ */
+public class YahooQueryRunner {
+ protected static final String APP_ID = "XXX";
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(YahooQueryRunner.class);
+
+ /**
+ * To run a query on Yahoo, one needs the
+ *
+ * @param query
+ * it can be a row text with some pre-processing
+ * @param domainWeb
+ * some sub-domain if necessary (default "")
+ * @param lang
+ * language settings
+ * @param numbOfHits
+ * @return
+ * @throws Exception
+ */
+ protected String constructBossUrl(String query, String domainWeb,
+ String lang, int numbOfHits) throws Exception {
+ String _lang = "en";
+
+ String codedQuery = URLEncoder.encode(query, "UTF-8");
+ String yahooRequest = "http://boss.yahooapis.com/ysearch/web" + "/v1/"
+ + codedQuery + "?appid=" + APP_ID + "&count=" + numbOfHits
+ + "&format=json&sites=" + domainWeb + "&lang=" + _lang;
+ return yahooRequest;
+ }
+
+ /**
+ *
+ * @param query
+ * @param domainWeb
+ * @param numbOfHits
+ * For more details
+ * http://developer.yahoo.com/search/image/V1/imageSearch.html
+ * @return
+ * @throws Exception
+ */
+ protected String constructBossImageSearchUrl(String query, String domainWeb,
+ int numbOfHits) throws Exception {
+ String codedQuery = URLEncoder.encode(query, "UTF-8");
+ String yahooRequest = "http://boss.yahooapis.com/ysearch/images/v1/"
+ + codedQuery + "?appid=" + APP_ID + "&count=" + numbOfHits
+ + "&format=json&sites=" + domainWeb;
+ return yahooRequest;
+ }
+
+ public ArrayList<String> search(String query, String domainWeb, String lang,
+ int numbOfHits) throws Exception {
+ URL url = new URL(constructBossUrl(query, domainWeb, lang, numbOfHits));
+ URLConnection connection = url.openConnection();
+
+ String line;
+ ArrayList<String> result = new ArrayList<String>();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ int count = 0;
+ while ((line = reader.readLine()) != null) {
+ result.add(line);
+ count++;
+ }
+ return result;
+ }
+
+ public ArrayList<String> searchImage(String query, String domainWeb,
+ int numbOfHits) throws Exception {
+ URL url = new URL(constructBossImageSearchUrl(query, domainWeb, numbOfHits));
+ URLConnection connection = url.openConnection();
+
+ String line;
+ ArrayList<String> result = new ArrayList<String>();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ int count = 0;
+ while ((line = reader.readLine()) != null) {
+ result.add(line);
+ count++;
+ }
+ return result;
+ }
+
+ public YahooResponse populateYahooHit(String response) throws Exception {
+ YahooResponse resp = new YahooResponse();
+ JSONObject rootObject = new JSONObject(response);
+ // each response is object that under the key of "ysearchresponse"
+ JSONObject responseObject = rootObject.getJSONObject("ysearchresponse");
+ try {
+ resp.setResponseCode(responseObject.getInt("responsecode"));
+ resp.setNextPageUrl(responseObject.getString("nextpage"));
+ resp.setTotalHits(responseObject.getInt("totalhits"));
+ resp.setDeepHits(responseObject.getInt("deephits"));
+ resp.setStartIndex(responseObject.getInt("start"));
+ resp.setPageSize(responseObject.getInt("count"));
+ } catch (Exception e) {
+ LOG.error("Reduced number of original results");
+ }
+
+ // the search result is in an array under the name of "resultset_web"
+ JSONArray resultSet = null;
+ try {
+ resultSet = responseObject.getJSONArray("resultset_web");
+ } catch (Exception e) {
+ System.err.print("\n!!!!!");
+ LOG.error("\nNo search results", e);
+ resultSet = null;
+ }
+ if (resultSet != null) {
+ for (int i = 0; i < resultSet.length(); i++) {
+ HitBase hit = new HitBase();
+ JSONObject singleResult = resultSet.getJSONObject(i);
+ hit.setAbstractText(singleResult.getString("abstract"));
+ hit.setClickUrl(singleResult.getString("clickurl"));
+ hit.setDisplayUrl(singleResult.getString("dispurl"));
+ hit.setUrl(singleResult.getString("url"));
+ hit.setDate(singleResult.getString("date"));
+ hit.setTitle(singleResult.getString("title"));
+
+ resp.appendHits(hit);
+ }
+ } else {
+ return null;
+ }
+ return resp;
+ }
+
+ protected void printSearchResult(String response) throws Exception {
+ JSONObject rootObject = new JSONObject(response);
+ // each response is object that under the key of "ysearchresponse"
+ JSONObject responseObject = rootObject.getJSONObject("ysearchresponse");
+ // printResponseAttributes(responseObject);
+
+ // the search result is in an array under the name of "resultset_web"
+ JSONArray resultSet = responseObject.getJSONArray("resultset_web");
+ System.out.println("Search Result:");
+ System.out.println("---------------------------");
+ for (int i = 0; i < resultSet.length(); i++) {
+ printSingleSearchResult(resultSet.getJSONObject(i));
+ }
+ }
+
+ protected void printResponseAttributes(JSONObject responseObject)
+ throws Exception {
+ // the response object has a few top level attributes
+ int responseCode = responseObject.getInt("responsecode");
+ String nextPageUrl = responseObject.getString("nextpage");
+ int totalHits = responseObject.getInt("totalhits");
+ int deepHits = responseObject.getInt("deephits");
+ int startIndex = responseObject.getInt("start");
+ int pageSize = responseObject.getInt("count");
+
+ System.out.println("responseCode = " + responseCode + ", totalHits = "
+ + totalHits + ", deepHits = " + deepHits + ", startIndex = "
+ + startIndex + ", pageSize = " + pageSize);
+ System.out.println("nextPageUrl = " + nextPageUrl);
+ }
+
+ protected void printSingleSearchResult(JSONObject singleResult)
+ throws Exception {
+ // each single search result has a few attributes
+ String abstractText = singleResult.getString("abstract");
+ String clickUrl = singleResult.getString("clickurl");
+ String displayUrl = singleResult.getString("dispurl");
+ String url = singleResult.getString("url");
+ String date = singleResult.getString("date");
+
+ // System.out.println("URL = " + url + ", date = " + date);
+ System.out.println("Abstract = " + abstractText);
+ // System.out.println("Display URL = " + displayUrl);
+ // System.out.println("Click URL = " + clickUrl);
+ System.out.println("---------------------------");
+ }
+
+ public List<HitBase> runSearch(String query) {
+ YahooResponse resp = null;
+ try {
+
+ List<String> resultList = search(query, "", "en", 30);
+ LOG.info(query);
+ if (resultList.size() != 0) {
+ resp = populateYahooHit(resultList.get(0));
+ } else {
+ LOG.info("Fikamika " + query);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ if (resp != null) {
+ List<HitBase> hits = new ArrayList<HitBase>();
+ for (HitBase h : resp.getHits())
+ hits.add((HitBase) h);
+
+ hits = HitBase.removeDuplicates(hits);
+ return hits;
+ } else {
+ return null;
+ }
+
+ }
+
+ public List<HitBase> runSearchInDomain(String domain) {
+ YahooResponse resp = null;
+ try {
+
+ List<String> resultList = search("the", domain, "en", 30);
+
+ if (resultList.size() != 0) {
+ resp = populateYahooHit(resultList.get(0));
+ } else {
+ LOG.info("No search results in domain " + domain);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ if (resp != null) {
+ List<HitBase> hits = new ArrayList<HitBase>();
+ for (HitBase h : resp.getHits())
+ hits.add((HitBase) h);
+
+ hits = HitBase.removeDuplicates(hits);
+ return hits;
+ } else {
+ return null;
+ }
+
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooQueryRunner.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Search response that carries, in addition to the attributes inherited from
+ * {@link YahooResponseBase}, the list of hits it contains.
+ */
+public class YahooResponse extends YahooResponseBase {
+  /** Hits collected so far; starts out as an empty list. */
+  private List<HitBase> hits = new ArrayList<HitBase>();
+
+  /** Creates a response without any hits. */
+  public YahooResponse() {
+  }
+
+  /** @return the list currently held by this response (not a copy) */
+  public List<HitBase> getHits() {
+    return this.hits;
+  }
+
+  /** Replaces the held list with {@code hits}. */
+  public void setHits(List<HitBase> hits) {
+    this.hits = hits;
+  }
+
+  /** Adds {@code hit} to the end of the held list. */
+  public void appendHits(HitBase hit) {
+    this.hits.add(hit);
+  }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponse.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+/**
+ * Plain holder for the top-level attributes of a search response. Every
+ * attribute has a getter and a setter and no validation is performed.
+ */
+public class YahooResponseBase {
+  private int responseCode;
+  private String nextPageUrl;
+  private int totalHits;
+  private int deepHits;
+  private int startIndex;
+  private int pageSize;
+
+  /** @return the stored response code */
+  public int getResponseCode() {
+    return this.responseCode;
+  }
+
+  /** Stores the response code. */
+  public void setResponseCode(int responseCode) {
+    this.responseCode = responseCode;
+  }
+
+  /** @return the stored next-page URL, {@code null} until it is set */
+  public String getNextPageUrl() {
+    return this.nextPageUrl;
+  }
+
+  /** Stores the next-page URL. */
+  public void setNextPageUrl(String nextPageUrl) {
+    this.nextPageUrl = nextPageUrl;
+  }
+
+  /** @return the stored total hit count */
+  public int getTotalHits() {
+    return this.totalHits;
+  }
+
+  /** Stores the total hit count. */
+  public void setTotalHits(int totalHits) {
+    this.totalHits = totalHits;
+  }
+
+  /** @return the stored deep hit count */
+  public int getDeepHits() {
+    return this.deepHits;
+  }
+
+  /** Stores the deep hit count. */
+  public void setDeepHits(int deepHits) {
+    this.deepHits = deepHits;
+  }
+
+  /** @return the stored start index */
+  public int getStartIndex() {
+    return this.startIndex;
+  }
+
+  /** Stores the start index. */
+  public void setStartIndex(int startIndex) {
+    this.startIndex = startIndex;
+  }
+
+  /** @return the stored page size */
+  public int getPageSize() {
+    return this.pageSize;
+  }
+
+  /** Stores the page size. */
+  public void setPageSize(int pageSize) {
+    this.pageSize = pageSize;
+  }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooResponseBase.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An {@link ArrayList} that additionally counts how often each element has
+ * been passed to {@link #add(Object)}. Only that single entry point updates
+ * the counters; other mutators inherited from {@code ArrayList} do not.
+ */
+public class CountItemsList<E> extends ArrayList<E> {
+
+  private static final long serialVersionUID = 1L;
+
+  // occurrences per element, maintained by add(E) only
+  private Map<E, Integer> count = new HashMap<E, Integer>();
+
+  /**
+   * Appends {@code element} and increments its occurrence counter.
+   *
+   * @return the result of {@link ArrayList#add(Object)}
+   */
+  @Override
+  public boolean add(E element) {
+    Integer seen = count.get(element);
+    count.put(element, seen == null ? 1 : seen + 1);
+    return super.add(element);
+  }
+
+  /**
+   * @return how many times {@code element} was added through
+   *         {@link #add(Object)}, 0 if never
+   */
+  public int getCount(E element) {
+    Integer seen = count.get(element);
+    return seen == null ? 0 : seen;
+  }
+
+  /**
+   * Returns at most the first three keys of the counter map after it has been
+   * ordered by {@code ValueSortMap.sortMapByValue(count, false)}.
+   * NOTE(review): presumably {@code false} requests descending order, i.e.
+   * most frequent first; confirm against ValueSortMap.
+   */
+  public List<E> getFrequentTags() {
+    Map<E, Integer> byValue = ValueSortMap.sortMapByValue(count, false);
+    List<E> tags = new ArrayList<E>(byValue.keySet());
+    return tags.size() > 3 ? tags.subList(0, 3) : tags;
+  }
+
+}
\ No newline at end of file
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/CountItemsList.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+/**
+ * Static helpers computing weighted Levenshtein (edit) distances between
+ * strings and a word-overlap distance between space-separated lines.
+ */
+public class LevensteinDistanceFinder {
+
+  /**
+   * Normalised distance between two strings. Returns 0 when either string is
+   * a prefix of the other; otherwise the weighted edit distance (letters cost
+   * 1, digits cost 10 for every operation) divided by
+   * {@code str1.length() + 1 + str2.length()}.
+   */
+  public static double matchLevensteinDistance(String str1, String str2) {
+    if (str1.length() <= str2.length()) {
+      if (str2.startsWith(str1)) {
+        return 0;
+      }
+    } else if (str1.startsWith(str2)) {
+      return 0;
+    }
+
+    return levensteinDistance(str1, str2, 1, 10, 1, 10)
+        / (str1.length() + 1 + str2.length());
+  }
+
+  /**
+   * Computes the weighted Levenshtein distance between two strings with the
+   * usual dynamic-programming recurrence, keeping only two rows of the
+   * distance matrix. Time O(length1 * length2), memory O(length2).
+   *
+   * Deleting a character of {@code str1} or inserting a character of
+   * {@code str2} costs {@code digitInsDelCost} if that character is a digit
+   * and {@code letterInsDelCost} otherwise. Replacing one character by a
+   * different one costs {@code digitReplaceCost} if either of them is a digit
+   * and {@code letterReplaceCost} otherwise.
+   *
+   * The previous single-array implementation overwrote cells it still needed,
+   * e.g. it returned 2 for "a" vs "ab" with unit costs.
+   *
+   * @return distance between strings.
+   */
+  public static double levensteinDistance(String str1, String str2,
+      int letterInsDelCost, int digitInsDelCost, int letterReplaceCost,
+      int digitReplaceCost) {
+    int rows = str1.length();
+    int cols = str2.length();
+    // previous[j]: distance between str1[0..i-1) and str2[0..j)
+    int[] previous = new int[cols + 1];
+    int[] current = new int[cols + 1];
+
+    // first row: build str2's prefix from the empty string by insertions
+    for (int j = 1; j <= cols; j++) {
+      previous[j] = previous[j - 1]
+          + insDelCost(str2.charAt(j - 1), letterInsDelCost, digitInsDelCost);
+    }
+
+    for (int i = 1; i <= rows; i++) {
+      char c1 = str1.charAt(i - 1);
+      int deleteCost = insDelCost(c1, letterInsDelCost, digitInsDelCost);
+      current[0] = previous[0] + deleteCost;
+      for (int j = 1; j <= cols; j++) {
+        char c2 = str2.charAt(j - 1);
+        int substitution = previous[j - 1];
+        if (c1 != c2) {
+          boolean digitInvolved = Character.isDigit(c1)
+              || Character.isDigit(c2);
+          substitution += digitInvolved ? digitReplaceCost : letterReplaceCost;
+        }
+        int deletion = previous[j] + deleteCost;
+        int insertion = current[j - 1]
+            + insDelCost(c2, letterInsDelCost, digitInsDelCost);
+        current[j] = Math.min(substitution, Math.min(deletion, insertion));
+      }
+      // the row just filled becomes the reference row of the next iteration
+      int[] swap = previous;
+      previous = current;
+      current = swap;
+    }
+    return previous[cols];
+  }
+
+  /** Cost of inserting or deleting {@code c}: digits use the digit cost. */
+  private static int insDelCost(char c, int letterCost, int digitCost) {
+    return Character.isDigit(c) ? digitCost : letterCost;
+  }
+
+  /**
+   * Word-overlap distance between two lines split on single spaces. Words are
+   * paired greedily, each word at most once; a pair matches when its
+   * unit-cost edit distance divided by the total number of words of both
+   * lines is below 0.2. The share of unmatched words is then divided by 10
+   * when one line is matched completely and by 4 otherwise.
+   *
+   * @return the distance, or -1 when either line splits into no words
+   */
+  public static double distanceBetweenStringArraysAsSpaceSepar(String line1,
+      String line2) {
+    String[] strings1 = line1.split(" ");
+    String[] strings2 = line2.split(" ");
+    if (strings1.length == 0 || strings2.length == 0) {
+      return -1;
+    }
+    int totalWords = strings1.length + strings2.length;
+    boolean[] selected1 = new boolean[strings1.length];
+    boolean[] selected2 = new boolean[strings2.length];
+    int intersectNum = 0;
+    for (int i = 0; i < strings1.length; i++) {
+      for (int j = 0; j < strings2.length; j++) {
+        if (selected1[i]) {
+          break; // word i is already paired, no need to scan further
+        }
+        if (selected2[j]) {
+          continue;
+        }
+        if (levensteinDistance(strings1[i], strings2[j], 1, 1, 1, 1)
+            / totalWords < 0.2) {
+          intersectNum++;
+          selected1[i] = true;
+          selected2[j] = true;
+        }
+      }
+    }
+    double unmatchedShare = ((double) (totalWords - 2 * intersectNum))
+        / totalWords;
+    boolean oneLineFullyMatched = strings1.length == intersectNum
+        || strings2.length == intersectNum;
+    return unmatchedShare / (oneLineFullyMatched ? 10 : 4);
+  }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/LevensteinDistanceFinder.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Component;
+
+/**
+ * Downloads web pages, either as text extracted by Tika
+ * ({@link #fetchPage(String)}) or as the raw markup
+ * ({@link #fetchOrigHTML(String)}).
+ */
+@Component
+public class PageFetcher {
+  private static final Logger LOG = LoggerFactory.getLogger(PageFetcher.class);
+
+  /** Read timeout in milliseconds used when the caller gives none. */
+  private static final int DEFAULT_TIMEOUT = 15000;
+
+  /** Read timeout in milliseconds for raw HTML downloads. */
+  private static final int ORIG_HTML_TIMEOUT = 10000;
+
+  /** Pause in milliseconds after every raw HTML download. */
+  private static final long PAUSE_AFTER_FETCH_MS = 50;
+
+  private static final String USER_AGENT =
+      "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3";
+
+  /**
+   * Fetches a page with the default read timeout.
+   *
+   * @see #fetchPage(String, int)
+   */
+  public String fetchPage(final String url) {
+    return fetchPage(url, DEFAULT_TIMEOUT);
+  }
+
+  /**
+   * Fetches a page and returns the text Tika extracts from it, with newlines
+   * and tabs replaced by spaces.
+   *
+   * @param url
+   *          page address; "http://" is prepended when no http/https scheme
+   *          is present
+   * @param timeout
+   *          read timeout in milliseconds
+   * @return the extracted text, or {@code null} when fetching or parsing
+   *         failed (the failure is logged)
+   */
+  public String fetchPage(final String url, final int timeout) {
+    String fetchURL = addHttp(url);
+
+    LOG.info("fetch url " + fetchURL);
+
+    String pageContent = null;
+    try {
+      // Use the normalised URL and the caller's timeout; both were computed
+      // or accepted but then ignored before.
+      URLConnection connection = new URL(fetchURL).openConnection();
+      connection.setReadTimeout(timeout);
+      Tika tika = new Tika();
+      pageContent = tika.parseToString(connection.getInputStream())
+          .replace('\n', ' ').replace('\t', ' ');
+    } catch (MalformedURLException e) {
+      LOG.error(e.getMessage(), e);
+    } catch (IOException e) {
+      LOG.error(e.getMessage(), e);
+    } catch (TikaException e) {
+      LOG.error(e.getMessage(), e);
+    }
+    return pageContent;
+  }
+
+  /** Prepends "http://" unless the URL already has an http or https scheme. */
+  private String addHttp(final String url) {
+    if (url.startsWith("http://") || url.startsWith("https://")) {
+      return url;
+    }
+    return "http://" + url;
+  }
+
+  /**
+   * Downloads the raw content of {@code url} with a browser-like User-Agent
+   * and returns all lines concatenated without line separators. Failures are
+   * expected here (the URL may be a guess), so they are only logged at debug
+   * level and whatever was read so far is returned.
+   *
+   * @return the downloaded content, "" when nothing could be read
+   */
+  public String fetchOrigHTML(String url) {
+    LOG.info("fetch url " + url);
+    StringBuilder buf = new StringBuilder();
+    BufferedReader reader = null;
+    try {
+      URLConnection connection = new URL(url).openConnection();
+      connection.setReadTimeout(ORIG_HTML_TIMEOUT);
+      connection.setRequestProperty("User-Agent", USER_AGENT);
+      reader = new BufferedReader(new InputStreamReader(
+          connection.getInputStream()));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        buf.append(line);
+      }
+    } catch (Exception e) {
+      // normal case when a hypothetical page does not exist
+      LOG.debug("could not fetch " + url, e);
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException ignored) {
+          // nothing useful can be done if closing fails
+        }
+      }
+    }
+    try {
+      Thread.sleep(PAUSE_AFTER_FETCH_MS); // short pause between requests
+    } catch (InterruptedException e) {
+      // keep the interrupt visible to the caller instead of swallowing it
+      Thread.currentThread().interrupt();
+    }
+    return buf.toString();
+  }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+/**
+ * Mutable generic container for two values, typically used to return two
+ * objects from one method.
+ *
+ * @author Albert-Jan de Vries
+ *
+ * @param <T1>
+ *          type of the first value
+ * @param <T2>
+ *          type of the second value
+ */
+public class Pair<T1, T2> {
+  private T1 first;
+  private T2 second;
+
+  /** Creates a pair whose two values are {@code null}. */
+  public Pair() {
+    this(null, null);
+  }
+
+  /** Creates a pair holding the two given values. */
+  public Pair(T1 first, T2 second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  /** @return the first value */
+  public T1 getFirst() {
+    return this.first;
+  }
+
+  /** @return the second value */
+  public T2 getSecond() {
+    return this.second;
+  }
+
+  /** Replaces the first value. */
+  public void setFirst(T1 first) {
+    this.first = first;
+  }
+
+  /** Replaces the second value. */
+  public void setSecond(T2 second) {
+    this.second = second;
+  }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+public class PorterStemmer {
+ public String stem(String str) {
+ // check for zero length
+ if (str.length() > 0) {
+ // all characters must be letters
+ char[] c = str.toCharArray();
+ for (int i = 0; i < c.length; i++) {
+ if (!Character.isLetter(c[i]))
+ return "Invalid term";
+ }
+ } else {
+ return "No term entered";
+ }
+ str = step1a(str);
+ str = step1b(str);
+ str = step1c(str);
+ str = step2(str);
+ str = step3(str);
+ str = step4(str);
+ str = step5a(str);
+ str = step5b(str);
+ return str;
+ } // end stem
+
+ protected String step1a(String str) {
+ // SSES -> SS
+ if (str.endsWith("sses")) {
+ return str.substring(0, str.length() - 2);
+ // IES -> I
+ } else if (str.endsWith("ies")) {
+ return str.substring(0, str.length() - 2);
+ // SS -> S
+ } else if (str.endsWith("ss")) {
+ return str;
+ // S ->
+ } else if (str.endsWith("s")) {
+ return str.substring(0, str.length() - 1);
+ } else {
+ return str;
+ }
+ } // end step1a
+
+ protected String step1b(String str) {
+ // (m > 0) EED -> EE
+ if (str.endsWith("eed")) {
+ if (stringMeasure(str.substring(0, str.length() - 3)) > 0)
+ return str.substring(0, str.length() - 1);
+ else
+ return str;
+ // (*v*) ED ->
+ } else if ((str.endsWith("ed"))
+ && (containsVowel(str.substring(0, str.length() - 2)))) {
+ return step1b2(str.substring(0, str.length() - 2));
+ // (*v*) ING ->
+ } else if ((str.endsWith("ing"))
+ && (containsVowel(str.substring(0, str.length() - 3)))) {
+ return step1b2(str.substring(0, str.length() - 3));
+ } // end if
+ return str;
+ } // end step1b
+
+ protected String step1b2(String str) {
+ // AT -> ATE
+ if (str.endsWith("at") || str.endsWith("bl") || str.endsWith("iz")) {
+ return str + "e";
+ } else if ((endsWithDoubleConsonent(str))
+ && (!(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))) {
+ return str.substring(0, str.length() - 1);
+ } else if ((stringMeasure(str) == 1) && (endsWithCVC(str))) {
+ return str + "e";
+ } else {
+ return str;
+ }
+ } // end step1b2
+
+ protected String step1c(String str) {
+ // (*v*) Y -> I
+ if (str.endsWith("y")) {
+ if (containsVowel(str.substring(0, str.length() - 1)))
+ return str.substring(0, str.length() - 1) + "i";
+ } // end if
+ return str;
+ } // end step1c
+
+ protected String step2(String str) {
+ // (m > 0) ATIONAL -> ATE
+ if ((str.endsWith("ational"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+ return str.substring(0, str.length() - 5) + "e";
+ // (m > 0) TIONAL -> TION
+ } else if ((str.endsWith("tional"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) ENCI -> ENCE
+ } else if ((str.endsWith("enci"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) ANCI -> ANCE
+ } else if ((str.endsWith("anci"))
+ && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+ return str.substring(0, str.length() - 1) + "e";
+ // (m > 0) IZER -> IZE
+ } else if ((str.endsWith("izer"))
+ && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+ return str.substring(0, str.length() - 1);
+ // (m > 0) ABLI -> ABLE
+ } else if ((str.endsWith("abli"))
+ && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+ return str.substring(0, str.length() - 1) + "e";
+ // (m > 0) ENTLI -> ENT
+ } else if ((str.endsWith("alli"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) ELI -> E
+ } else if ((str.endsWith("entli"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) OUSLI -> OUS
+ } else if ((str.endsWith("eli"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) IZATION -> IZE
+ } else if ((str.endsWith("ousli"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) IZATION -> IZE
+ } else if ((str.endsWith("ization"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+ return str.substring(0, str.length() - 5) + "e";
+ // (m > 0) ATION -> ATE
+ } else if ((str.endsWith("ation"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3) + "e";
+ // (m > 0) ATOR -> ATE
+ } else if ((str.endsWith("ator"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2) + "e";
+ // (m > 0) ALISM -> AL
+ } else if ((str.endsWith("alism"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) IVENESS -> IVE
+ } else if ((str.endsWith("iveness"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 0) FULNESS -> FUL
+ } else if ((str.endsWith("fulness"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 0) OUSNESS -> OUS
+ } else if ((str.endsWith("ousness"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 0) ALITII -> AL
+ } else if ((str.endsWith("aliti"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) IVITI -> IVE
+ } else if ((str.endsWith("iviti"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3) + "e";
+ // (m > 0) BILITI -> BLE
+ } else if ((str.endsWith("biliti"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+ return str.substring(0, str.length() - 5) + "le";
+ } // end if
+ return str;
+ } // end step2
+
+ protected String step3(String str) {
+ // (m > 0) ICATE -> IC
+ if ((str.endsWith("icate"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) ATIVE ->
+ } else if ((str.endsWith("ative"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+ return str.substring(0, str.length() - 5);
+ // (m > 0) ALIZE -> AL
+ } else if ((str.endsWith("alize"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) ICITI -> IC
+ } else if ((str.endsWith("iciti"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) ICAL -> IC
+ } else if ((str.endsWith("ical"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 0) FUL ->
+ } else if ((str.endsWith("ful"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 0) NESS ->
+ } else if ((str.endsWith("ness"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+ return str.substring(0, str.length() - 4);
+ } // end if
+ return str;
+ } // end step3
+
+ protected String step4(String str) {
+ if ((str.endsWith("al"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ANCE ->
+ } else if ((str.endsWith("ance"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ENCE ->
+ } else if ((str.endsWith("ence"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ER ->
+ } else if ((str.endsWith("er"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) IC ->
+ } else if ((str.endsWith("ic"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ABLE ->
+ } else if ((str.endsWith("able"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) IBLE ->
+ } else if ((str.endsWith("ible"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ANT ->
+ } else if ((str.endsWith("ant"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) EMENT ->
+ } else if ((str.endsWith("ement"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 1)) {
+ return str.substring(0, str.length() - 5);
+ // (m > 1) MENT ->
+ } else if ((str.endsWith("ment"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ENT ->
+ } else if ((str.endsWith("ent"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) and (*S or *T) ION ->
+ } else if ((str.endsWith("sion") || str.endsWith("tion"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) OU ->
+ } else if ((str.endsWith("ou"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ISM ->
+ } else if ((str.endsWith("ism"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) ATE ->
+ } else if ((str.endsWith("ate"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) ITI ->
+ } else if ((str.endsWith("iti"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) OUS ->
+ } else if ((str.endsWith("ous"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) IVE ->
+ } else if ((str.endsWith("ive"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) IZE ->
+ } else if ((str.endsWith("ize"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ } // end if
+ return str;
+ } // end step4
+
+ protected String step5a(String str) {
+ // (m > 1) E ->
+ if ((stringMeasure(str.substring(0, str.length() - 1)) > 1)
+ && str.endsWith("e"))
+ return str.substring(0, str.length() - 1);
+ // (m = 1 and not *0) E ->
+ else if ((stringMeasure(str.substring(0, str.length() - 1)) == 1)
+ && (!endsWithCVC(str.substring(0, str.length() - 1)))
+ && (str.endsWith("e")))
+ return str.substring(0, str.length() - 1);
+ else
+ return str;
+ } // end step5a
+
+ protected String step5b(String str) {
+ // (m > 1 and *d and *L) ->
+ if (str.endsWith("l") && endsWithDoubleConsonent(str)
+ && (stringMeasure(str.substring(0, str.length() - 1)) > 1)) {
+ return str.substring(0, str.length() - 1);
+ } else {
+ return str;
+ }
+ } // end step5b
+
+ /*
+ * ------------------------------------------------------- The following are
+ * functions to help compute steps 1 - 5
+ * -------------------------------------------------------
+ */
+
+ // does string end with 's'?
+ protected boolean endsWithS(String str) {
+ return str.endsWith("s");
+ } // end function
+
+ // does string contain a vowel?
+ protected boolean containsVowel(String str) {
+ char[] strchars = str.toCharArray();
+ for (int i = 0; i < strchars.length; i++) {
+ if (isVowel(strchars[i]))
+ return true;
+ }
+ // no aeiou but there is y
+ if (str.indexOf('y') > -1)
+ return true;
+ else
+ return false;
+ } // end function
+
+ // is char a vowel?
+ public boolean isVowel(char c) {
+ if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u'))
+ return true;
+ else
+ return false;
+ } // end function
+
+ // does string end with a double consonent?
+ protected boolean endsWithDoubleConsonent(String str) {
+ char c = str.charAt(str.length() - 1);
+ if (c == str.charAt(str.length() - 2))
+ if (!containsVowel(str.substring(str.length() - 2))) {
+ return true;
+ }
+ return false;
+ } // end function
+
+ // returns a CVC measure for the string
+ protected int stringMeasure(String str) {
+ int count = 0;
+ boolean vowelSeen = false;
+ char[] strchars = str.toCharArray();
+
+ for (int i = 0; i < strchars.length; i++) {
+ if (isVowel(strchars[i])) {
+ vowelSeen = true;
+ } else if (vowelSeen) {
+ count++;
+ vowelSeen = false;
+ }
+ } // end for
+ return count;
+ } // end function
+
+ // does stem end with CVC?
+ protected boolean endsWithCVC(String str) {
+ char c, v, c2 = ' ';
+ if (str.length() >= 3) {
+ c = str.charAt(str.length() - 1);
+ v = str.charAt(str.length() - 2);
+ c2 = str.charAt(str.length() - 3);
+ } else {
+ return false;
+ }
+
+ if ((c == 'w') || (c == 'x') || (c == 'y')) {
+ return false;
+ } else if (isVowel(c)) {
+ return false;
+ } else if (!isVowel(v)) {
+ return false;
+ } else if (isVowel(c2)) {
+ return false;
+ } else {
+ return true;
+ }
+ } // end function
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PorterStemmer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+public class StringCleaner {
+ public static String processSnapshotForMatching(String snapshot) {
+ snapshot = snapshot.replace("<b>...</b>", ". ").replace("<b>", "")
+ .replace("</b>", "").replace(". . ", " ").replace(" . . . ", " ")
+ .replace("...", " ").replace(",..", " ").replace("&", " ")
+ .replace('\"', ' ').replace(" ", " ");
+ snapshot = snapshot.replace('\'', ' ').replace('-', ' ');
+
+ return snapshot;
+ }
+}
\ No newline at end of file
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringCleaner.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,331 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class StringDistanceMeasurer {
+ // external tools
+ private PorterStemmer ps; // stemmer
+
+ private static final int MIN_STRING_LENGTH_FOR_WORD = 4;
+
+ protected int MIN_STRING_LENGTH_FOR_DISTORTED_WORD = 6;
+
+ protected static final int ACCEPTABLE_DEVIATION_IN_CHAR = 2;
+
+ private static final double MIN_SCORE_FOR_LING = 100; // 0.7;
+
+ public StringDistanceMeasurer() {
+ // first get stemmer
+ ps = new PorterStemmer();
+ if (MIN_SCORE_FOR_LING > 1.0)
+ return;
+
+ }
+
+ // gets string array and process numbers, applies stemming and forms a list
+ protected List<String> filterWordArray(String[] strWords) {
+ List<String> strList = new ArrayList<String>();
+ for (String w : strWords) {
+ Boolean bInteger = true;
+ try {
+ Integer.parseInt(w);
+ } catch (Exception e) {
+ bInteger = false;
+ }
+ if (w.length() < MIN_STRING_LENGTH_FOR_WORD && !bInteger) // only
+ // non-integer
+ // short
+ // string like preposition is uninteresting
+ continue;
+ try {
+ w = ps.stem(w.toLowerCase());
+ } catch (Exception e) {
+ // do nothing, just have original term
+ }
+ if (w.startsWith("Invalid"))
+ continue;
+ strList.add(w);
+ }
+ return strList;
+ }
+
+ protected List<String> filterWordArrayNoStem(String[] strWords) {
+ List<String> strList = new ArrayList<String>();
+ for (String w : strWords) {
+ Boolean bInteger = true;
+ try {
+ Integer.parseInt(w);
+ } catch (Exception e) {
+ bInteger = false;
+ }
+ if (w.length() < MIN_STRING_LENGTH_FOR_WORD && !bInteger) // only
+ // non-integer
+ // short
+ // string like preposition is uninteresting
+ continue;
+ w = w.toLowerCase();
+
+ strList.add(w);
+ }
+ return strList;
+ }
+
+ // main entry point. Gets two strings and applies string match
+ // and also linguistic match if score > a threshold
+ public double measureStringDistance(String str1, String str2) {
+ double result = (double) -1.0;
+ try {
+ str1 = StringCleaner.processSnapshotForMatching(str1);
+ str2 = StringCleaner.processSnapshotForMatching(str2);
+ if (str1.equals(str2)) // || str1.endsWith(str2) || str2.endsWith(str1))
+ // bg 03-2011
+ return 1.0;
+
+ String[] str1Words = str1.split(" ");
+ String[] str2Words = str2.split(" ");
+ List<String> str1List = filterWordArray(str1Words), str2List = filterWordArray(str2Words);
+
+ int l1 = str1List.size(), l2 = str2List.size();
+ if (l1 < 2)
+ l1 = str1Words.length;
+ if (l2 < 2)
+ l2 = str2Words.length;
+
+ int lOverlap = 0;
+ List<String> strListOverlap = new ArrayList<String>(str1List);
+ strListOverlap.retainAll(str2List);
+ for (String w : strListOverlap) {
+ if (w.toLowerCase().equals(w)) // no special interest word
+ lOverlap++;
+ else
+ lOverlap += 2; // if capitalized, or specific word => important so
+ // double score
+ }
+ result = Math.pow((double) (lOverlap * lOverlap) / (double) l1
+ / (double) l2, 0.4);
+
+ // now we try to find similar words which are long or Upper case
+ int countSimilar = 0;
+ str1List.removeAll(strListOverlap);
+ str2List.removeAll(strListOverlap);
+ for (String w1 : str1List) {
+ for (String w2 : str2List) {
+ if (w1.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD
+ || !w1.toLowerCase().equals(w1))
+ if (w2.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD
+ || !w2.toLowerCase().equals(w2))
+ if (LevensteinDistanceFinder.levensteinDistance(w1, w2, 1, 10, 1,
+ 10) <= ACCEPTABLE_DEVIATION_IN_CHAR)
+ countSimilar++;
+ }
+ }
+ lOverlap += countSimilar;
+ result = Math.pow((double) (lOverlap * lOverlap) / (double) l1
+ / (double) l2, 0.4);
+ if (result > 1)
+ result = (double) 1.0;
+
+ // double ld = LevensteinDistanceFinder. levensteinDistance(str1, str2, 1,
+ // 10, 1, 10);
+ // System.out.println(ld);
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ return (double) -1.0;
+ }
+
+ Double linguisticScore = (double) -1.0;
+ // to be developed - employs linguistic processor
+ /*
+ * if (result>MIN_SCORE_FOR_LING) { List<List<ParseTreeChunk>> matchResult =
+ * pos.matchOrigSentencesCache(str1, str2); linguisticScore =
+ * ParseTreeChunkListScorer.getParseTreeChunkListScore(matchResult);
+ * System.out.println(matchResult);
+ *
+ * // magic formula for 0.7 string match and 0.3 linguistic match result =
+ * result*0.7 + linguisticScore/6.0* 0.3; }
+ */
+ return result;
+ }
+
+ public double measureStringDistanceNoStemming(String str1, String str2) {
+ double result = (double) -1.0;
+ try {
+ str1 = StringCleaner.processSnapshotForMatching(str1);
+ str2 = StringCleaner.processSnapshotForMatching(str2);
+ if (str1.equals(str2)) // || str1.endsWith(str2) || str2.endsWith(str1))
+ // bg 03-2011
+ return 1.0;
+
+ String[] str1Words = str1.split(" ");
+ String[] str2Words = str2.split(" ");
+ List<String> str1List = filterWordArrayNoStem(str1Words), str2List = filterWordArrayNoStem(str2Words);
+
+ int l1 = str1List.size(), l2 = str2List.size();
+ if (l1 < 2)
+ l1 = str1Words.length;
+ if (l2 < 2)
+ l2 = str2Words.length;
+
+ int lOverlap = 0;
+ List<String> strListOverlap = new ArrayList<String>(str1List);
+ strListOverlap.retainAll(str2List);
+ for (String w : strListOverlap) {
+ if (w.toLowerCase().equals(w)) // no special interest word
+ lOverlap++;
+ else
+ lOverlap += 2; // if capitalized, or specific word => important so
+ // double score
+ }
+ result = Math.pow((double) (lOverlap * lOverlap) / (double) l1
+ / (double) l2, 0.4);
+
+ // now we try to find similar words which are long or Upper case
+ int countSimilar = 0;
+ str1List.removeAll(strListOverlap);
+ str2List.removeAll(strListOverlap);
+ for (String w1 : str1List) {
+ for (String w2 : str2List) {
+ if (w1.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD
+ || !w1.toLowerCase().equals(w1))
+ if (w2.length() > MIN_STRING_LENGTH_FOR_DISTORTED_WORD
+ || !w2.toLowerCase().equals(w2))
+ if (LevensteinDistanceFinder.levensteinDistance(w1, w2, 1, 10, 1,
+ 10) <= ACCEPTABLE_DEVIATION_IN_CHAR)
+ countSimilar++;
+ }
+ }
+ lOverlap += countSimilar;
+ result = Math.pow((double) (lOverlap * lOverlap) / (double) l1
+ / (double) l2, 0.4);
+ if (result > 1)
+ result = (double) 1.0;
+
+ // double ld = LevensteinDistanceFinder. levensteinDistance(str1, str2, 1,
+ // 10, 1, 10);
+ // System.out.println(ld);
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ return (double) -1.0;
+ }
+
+ Double linguisticScore = (double) -1.0;
+ // to be developed - employs linguistic processor
+ /*
+ * if (result>MIN_SCORE_FOR_LING) { List<List<ParseTreeChunk>> matchResult =
+ * pos.matchOrigSentencesCache(str1, str2); linguisticScore =
+ * ParseTreeChunkListScorer.getParseTreeChunkListScore(matchResult);
+ * System.out.println(matchResult);
+ *
+ * // magic formula for 0.7 string match and 0.3 linguistic match result =
+ * result*0.7 + linguisticScore/6.0* 0.3; }
+ */
+ return result;
+ }
+
+ public static void main(String[] args) {
+ StringDistanceMeasurer meas = new StringDistanceMeasurer();
+
+ // String sent1 =
+ // "estoy en LA,California y no encuentro tu album en NINGUNA parte!!! " +
+ // "NO MANCHES!!!lo tengo que comprar por internet!! " ;
+
+ // redunction of announcement
+ String sent2a = "Tomarow come check us out if your in the area show starts at 6:00pm "
+ + "2404 E. La Palma Anaheim, California 92806 Cost:$3";
+ String sent2b = "Tomorrow you can check us if you area will show start at 6 pm "
+ + "2404 East La Palma Anaheim, $3";
+ // common sub-chunk = [VBZ-starts IN-at NNP-* NNP-* ]
+
+ // original posting and its yahoo.com search snapshot
+ String sent4a = "Fliers may have to wait years for new liquid screening equipment";
+ String sent4b = "for screening checkpoints and equipment; improving ... "
+ + "Wait times are not just a problem at large airports";
+
+ // slang and search snapshot
+ String sent5a = "hell yea i stay in california. and hell no lol LA sucks hella bad, "
+ + "i lived there for a while and hated it sooo much, so boring! ";
+
+ String sent5b = "My life is so boring without Tree Hill and the OC. America is sooo "
+ + "racist I LOVE YOU SO MUCH. TO everyone that has hurt me...no one in the ..... Yeah sucks I know";
+
+ String sent6a = "I think its gonna be in the east coast as well. California is pretty but way "
+ + "to close to LA and helicopters are gonna ruin it";
+ String sent6b = "could be in east coast as well. California is pretty but way "
+ + "to close to LA and choppers will ruin it";
+ // common sub-chunk = [JJ-east NN-coast ]
+
+ String sent7a = "Iran nuke document called 'alarming'. Their Program started in the 50s with our help!";
+ String sent7b = "nuke project of Iran is alarming' Program started in 1950s with our help";
+ // common sub-chunk = [VBD-started IN-in NNS-50s IN-with PRP$-our NN-help ]
+
+ // News title for the same event
+ String sent8a = "Pakistan slaps travel ban on defence minister";
+ String sent8b = "Pakistan corruption fall-out threatens stability";
+ String sent8c = "Pakistan defence minister 'barred from leaving country'";
+ String sent8d = "Pakistani defence minister banned from travel";
+ String sent8dd = "Pakistani defence minister banned from travel"; // to
+ // check
+ // the
+ // case of
+ // 1.0
+
+ // common sub-chunk = [NN-defence NN-minister ]
+
+ List<Double> matchRes = new ArrayList<Double>();
+ matchRes.add(meas.measureStringDistance(sent2a, sent2b));
+ matchRes.add(meas.measureStringDistance(sent4a, sent4b));
+ matchRes.add(meas.measureStringDistance(sent5a, sent5b));
+ matchRes.add(meas.measureStringDistance(sent6a, sent6b));
+ matchRes.add(meas.measureStringDistance(sent7a, sent7b));
+
+ System.out.println(matchRes);
+ // [0.8178702752867737, 0.21082473737065027, 0.27594593229224296,
+ // 0.7517586466500455, 0.9100766715907641]
+
+ matchRes = new ArrayList<Double>();
+ matchRes.add(meas.measureStringDistance(sent8a, sent8b));
+ matchRes.add(meas.measureStringDistance(sent8a, sent8c));
+ matchRes.add(meas.measureStringDistance(sent8a, sent8d));
+ matchRes.add(meas.measureStringDistance(sent8b, sent8c));
+ matchRes.add(meas.measureStringDistance(sent8b, sent8d));
+ matchRes.add(meas.measureStringDistance(sent8c, sent8d));
+
+ System.out.println(matchRes);
+ // [0.48044977359257246, 0.8365116420730185, 0.8365116420730185,
+ // 0.48044977359257246, 0.27594593229224296,
+ // 0.6391010941257969]
+
+ matchRes = new ArrayList<Double>();
+ // to verify that the same sentence gives 1
+ matchRes.add(meas.measureStringDistance(sent8dd, sent8d));
+ // to verify that totally different sentences give 0
+ matchRes.add(meas.measureStringDistance(sent2a, sent8d));
+
+ System.out.println("Now testing 1 and 0: \n" + matchRes);
+ // Now testing 1 and 0:
+ // [1.0, 0.0]
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain