You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC
svn commit: r1181845 [3/5] - in
/incubator/opennlp/sandbox/opennlp-similarity: ./
src/main/java/opennlp/tools/similarity/
src/main/java/opennlp/tools/similarity/apps/
src/main/java/opennlp/tools/similarity/apps/utils/
src/main/java/opennlp/tools/textsi...
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,682 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.awt.Graphics2D;
+import java.awt.geom.AffineTransform;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.imageio.ImageIO;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Utils {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
+
+ protected static final ArrayList<String[]> characterMappings = new ArrayList<String[]>();
+
+ static {
+ characterMappings
+ .add(new String[] {
+ "[ÃÆàÃÆáÃÆâÃÆãÃÆäÃÆÃ¥ïÿý?ÃâÃâÃâââ¬Â¦Ã�ð]",
+ " " }); // was a
+ characterMappings
+ .add(new String[] {
+ "[ÃÆââ¬Ã¯Ã¿Ã½?ÃÆââ¬Å¡ÃÆÃâÃÆââ¬Å¾ÃÆââ¬Â¦Ãâââ¬Ãâââ¬Å¡Ãâââ¬Å¾Ã¯Ã¿Ã½?]",
+ "A" });
+ characterMappings.add(new String[] {
+ "[ÃÆçÃâââ¬Â¡Ãâââ¬Â°Ãâââ¬Â¹Ã¯Ã¿Ã½?]", "c" });
+ characterMappings.add(new String[] {
+ "[ÃÆââ¬Â¡Ãâââ¬Â ÃâÃâ ÃâÃ
 ÃâÃ
â]", "C" });
+ characterMappings.add(new String[] { "[ïÿý?Ãâââ¬Ë]", "d" });
+ characterMappings.add(new String[] {
+ "[ïÿý?ÃâÃ
½Ã¯Ã¿Ã½?]", "D" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆèÃÆéÃÆêÃÆëÃÆæÃâââ¢Ãâââ¬ÅÃâââ¬Â¢Ãâââ¬âÃâââ¢Ãâââ¬Âº]",
+ " " }); // was e
+ characterMappings
+ .add(new String[] {
+ "[ÃÆÃâ ÃÆââ¬Â°ÃÆÃ
 ÃÆââ¬Â¹ÃÆââ¬Â Ãâââ¬â¢Ãâââ¬ï¿½Ãâââ¬âÃâÃÅÃâÃ
¡]",
+ "'" }); // was E
+ characterMappings.add(new String[] {
+ "[ïÿý?ÃâÃ
¸ÃâáÃâã]", "g" });
+ characterMappings.add(new String[] {
+ "[ÃâÃ
âÃâÃ
¾ÃâàÃââÃâ ââ¬Å]", "G" });
+ characterMappings.add(new String[] { "[ÃâÃÂ¥Ãâç]", "h" });
+ characterMappings.add(new String[] { "[ÃâäÃâæ]", "H" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆìÃÆÃÂÃÆîÃÆïÃâéÃâëÃâÃÂÃâîÃâïÃâñÃâóÃâõ]",
+ "i" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆÃ
âïÿý?ÃÆÃ
½Ã¯Ã¿Ã½?ÃâèÃâêÃâìÃâðÃâòÃâôÃâõ]",
+ "I" });
+ characterMappings.add(new String[] { "[Ãâ÷Ãâø]", "k" });
+ characterMappings.add(new String[] { "[̦̉]", "K" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆøÃâ¦Ã¢â¬ËÃÆðÃÆòÃÆóÃÆôÃÆõÃÆöïÿý?ïÿý?Ãâ¦Ã¢â¬ËÃâ¦Ã¢â¬ÅÃâ á]",
+ "o" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆââ¬â¢ÃÆââ¬ÅÃÆââ¬ï¿½ÃÆââ¬Â¢ÃÆââ¬âÃÆÃÅÃâ¦Ã
âÃâ¦Ã
½Ã¯Ã¿Ã½?Ãâ¦Ã¢â¬â¢Ãâ à]",
+ "O" });
+ characterMappings.add(new String[] {
+ "[ÃÆñÃâ¦Ã¢â¬Å¾Ãâ¦Ã¢â¬Â Ãâ¦Ãâ Ãâ¦Ã¢â¬Â°Ãâ¦Ã¢â¬Â¹]",
+ "n" });
+ characterMappings.add(new String[] {
+ "[ÃÆââ¬ËÃâ¦ÃâÃâ¦Ã¢â¬Â¦Ãâ¦Ã¢â¬Â¡Ãâ¦Ã
 Ãâ¦Ã¢â¬Â¹]",
+ "N" });
+ characterMappings.add(new String[] {
+ "[ÃâúÃâüÃâþÃâ¦Ã¢â¬Ãâ¦Ã¢â¬Å¡]", "l" });
+ characterMappings.add(new String[] {
+ "[ÃâùÃâûÃâýÃâÿïÿý?]", "L" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆùÃÆúÃÆûÃÆüÃâ¦Ã©Ãâ¦Ã«Ãâ¦ÃÂÃâ¦Ã¯Ãâ¦Ã±Ãâ¦Ã³Ãâ ð]",
+ "u" });
+ characterMappings
+ .add(new String[] {
+ "[ÃÆââ¢ÃÆÃ
¡ÃÆââ¬ÂºÃÆÃ
âÃâ¦Ã¨Ãâ¦ÃªÃâ¦Ã¬Ãâ¦Ã®Ãâ¦Ã°Ãâ¦Ã²Ãâ ï]",
+ "U" });
+ characterMappings.add(new String[] { "[ÃÆýÃÆÿÃâ¦Ã·]", "y" });
+ characterMappings.add(new String[] { "[ïÿý?Ãâ¦Ã¶Ãâ¦Ã¸]",
+ "Y" });
+ characterMappings.add(new String[] {
+ "[Ãâ¦Ã¢â¬Â¢Ãâ¦Ã¢â¬âÃâ¦Ã¢â¢]", "r" });
+ characterMappings.add(new String[] {
+ "[Ãâ¦Ã¢â¬ï¿½Ãâ¦Ã¢â¬âÃâ¦ÃÅ]", "R" });
+ characterMappings
+ .add(new String[] {
+ "[Ãâ¦Ã¡Ãâ¦Ã¢â¬ÂºÃ¯Ã¿Ã½?Ãâ¦Ã
¸Ãâ¦Ã¡Ãâ¦Ã¿]",
+ "s" });
+ characterMappings.add(new String[] {
+ "[Ãâ¦Ã Ãâ¦Ã
¡Ãâ¦Ã
âÃâ¦Ã
¾Ãâ¦Ã Ãâ¦Ã¿]", "S" });
+ characterMappings.add(new String[] { "ÃÆÃ
¸", "ss" });
+ characterMappings.add(new String[] { "ÃÆÃ
¾", "th" });
+ characterMappings.add(new String[] { "ÃÆþ", "Th" });
+ characterMappings
+ .add(new String[] { "[Ãâ¦Ã£Ãâ¦ÃÂ¥Ãâ¦Ã§]", "t" });
+ characterMappings
+ .add(new String[] { "[Ãâ¦Ã¢Ãâ¦Ã¤Ãâ¦Ã¦]", "T" });
+ characterMappings.add(new String[] { "[Ãâ¦Ãµ]", "w" });
+ characterMappings.add(new String[] { "[Ãâ¦Ã´]", "W" });
+ characterMappings.add(new String[] {
+ "[Ãâ¦Ã¾Ãâ¦ÃºÃâ¦Ã¼Ãâ¦Ã¾Ãâ ö]", "z" });
+ characterMappings.add(new String[] {
+ "[Ãâ¦Ã½Ãâ¦Ã½Ãâ¦Ã¹Ãâ¦Ã»Ãâ¦Ã½Ãâ õ]", "Z" });
+ characterMappings.add(new String[] { "[âââ‰â¢]", "'" });
+ characterMappings.add(new String[] { "[âââ‰â¬Å]", "'" });
+ characterMappings.add(new String[] { "'", "'" });
+ characterMappings.add(new String[] { "Ãâe", "ë" });
+ characterMappings.add(new String[] { "'AG", "ââ¬Å" });
+ characterMappings.add(new String[] { "A�", " " });
+ characterMappings.add(new String[] { "&quot;", "\"" });
+ characterMappings.add(new String[] { "&amp;", "&" });
+ characterMappings.add(new String[] { "&nbsp;", " " });
+ characterMappings.add(new String[] { "îââ‰â¬", " " });
+ characterMappings.add(new String[] { "âââ¬Å¾Ã¢", " " });
+ characterMappings.add(new String[] { "âââ‰â¬ï¿½", "" });
+ characterMappings.add(new String[] { "â", "'" });
+ }
+
+ /**
+  * Drops every character whose numeric value is above 256, then trims the
+  * result and collapses each run of whitespace into a single space.
+  *
+  * @param s text to filter; {@code null} is treated as empty input
+  * @return the filtered, whitespace-normalized text (never {@code null})
+  */
+ public static String stripNonAsciiChars(String s) {
+   StringBuilder kept = new StringBuilder();
+   if (s != null) {
+     for (char ch : s.toCharArray()) {
+       // despite the method name, everything up to and including U+0100 is kept
+       if (ch <= 256) {
+         kept.append(ch);
+       }
+     }
+   }
+
+   String trimmed = kept.toString().trim();
+   return trimmed.replaceAll("\\s+", " ");
+ }
+
+ public static String convertToASCII(String s) {
+ s = s.replace("&", "");
+ s = s.replaceAll("â", "__apostrophe__");
+ String tmp = s;
+ if (tmp != null) {
+ for (String[] mapping : characterMappings) {
+ tmp = tmp.replaceAll(mapping[0], mapping[1]);
+ }
+ }
+ return stripNonAsciiChars(tmp.replaceAll("__apostrophe__", "'"));
+ }
+
+ /**
+  * Pairs an arbitrary key object with a float score.
+  */
+ public static class KeyValue {
+   /** The key object; any type is accepted. */
+   public Object key = null;
+
+   /** The score associated with {@link #key}. */
+   public float value = 0;
+
+   /**
+    * @param o the key
+    * @param i the score; it is unboxed, so {@code null} causes a
+    *          NullPointerException
+    */
+   public KeyValue(Object o, Float i) {
+     this.key = o;
+     this.value = i;
+   }
+
+   /**
+    * Orders {@link KeyValue} instances by descending {@link KeyValue#value}.
+    * Equal scores compare as 0, as the {@link Comparator} contract requires;
+    * a comparator that never returns 0 can make sorting fail.
+    */
+   public static class SortByValue implements Comparator {
+     // kept as a raw Comparator so existing callers compile unchanged
+     public int compare(Object obj1, Object obj2) {
+       float first = ((KeyValue) obj1).value;
+       float second = ((KeyValue) obj2).value;
+
+       // arguments swapped on purpose: larger scores sort first
+       return Float.compare(second, first);
+     }
+   }
+ }
+
+ /**
+  * Writes a scaled JPEG copy of an image file.
+  *
+  * @param originalImage path of the image to read
+  * @param newImage path of the JPEG file to write
+  * @param scaledWidth width of the copy in pixels
+  * @param scaledHeight height of the copy in pixels
+  * @return {@code true} when the copy was written; {@code false} when the
+  *         source could not be decoded, no JPEG writer accepted the image, or
+  *         any exception occurred (the failure is logged, never thrown)
+  */
+ public static boolean createResizedCopy(String originalImage,
+     String newImage, int scaledWidth, int scaledHeight) {
+   Graphics2D g = null;
+   try {
+     BufferedImage bsrc = ImageIO.read(new File(originalImage));
+     if (bsrc == null) {
+       // ImageIO.read returns null when no registered reader can decode the file
+       LOG.error("Failed creating thumbnail for image: " + originalImage
+           + " (no image reader could decode it)");
+       return false;
+     }
+     BufferedImage bdest = new BufferedImage(scaledWidth, scaledHeight,
+         BufferedImage.TYPE_INT_RGB);
+
+     g = bdest.createGraphics();
+     AffineTransform at = AffineTransform.getScaleInstance(
+         (double) scaledWidth / bsrc.getWidth(),
+         (double) scaledHeight / bsrc.getHeight());
+     g.drawRenderedImage(bsrc, at);
+
+     // ImageIO.write returns false when no appropriate writer is found
+     if (!ImageIO.write(bdest, "jpeg", new File(newImage))) {
+       LOG.error("Failed creating thumbnail for image: " + originalImage
+           + " (no jpeg writer available)");
+       return false;
+     }
+     return true;
+   } catch (Exception e) {
+     // best effort: report the failure to the caller instead of propagating
+     LOG.error("Failed creating thumbnail for image: " + originalImage, e);
+     return false;
+   } finally {
+     if (g != null) {
+       g.dispose(); // release the graphics context's native resources
+     }
+   }
+ }
+
+ /** Returns the smallest of the three arguments. */
+ private static int minimum(int a, int b, int c) {
+   return Math.min(a, Math.min(b, c));
+ }
+
+ /**
+  * Computes the edit distance between two strings with the classic full
+  * matrix dynamic program, where an insertion, a deletion and a
+  * substitution each cost 1.
+  *
+  * @param s first string (must not be {@code null})
+  * @param t second string (must not be {@code null})
+  * @return the number of single-character edits needed to turn {@code s}
+  *         into {@code t}
+  */
+ public static int computeEditDistance(String s, String t) {
+   final int rows = s.length();
+   final int cols = t.length();
+   if (rows == 0) {
+     return cols;
+   }
+   if (cols == 0) {
+     return rows;
+   }
+
+   int[][] dist = new int[rows + 1][cols + 1];
+   // first column/row: distance from a prefix to the empty string
+   for (int r = 0; r <= rows; r++) {
+     dist[r][0] = r;
+   }
+   for (int c = 0; c <= cols; c++) {
+     dist[0][c] = c;
+   }
+
+   for (int r = 1; r <= rows; r++) {
+     char fromS = s.charAt(r - 1);
+     for (int c = 1; c <= cols; c++) {
+       int substitution = dist[r - 1][c - 1] + (fromS == t.charAt(c - 1) ? 0 : 1);
+       dist[r][c] = minimum(dist[r - 1][c] + 1, dist[r][c - 1] + 1, substitution);
+     }
+   }
+   return dist[rows][cols];
+ }
+
+ /**
+  * Converts a map into a list of {@link KeyValue} pairs sorted with
+  * {@link KeyValue.SortByValue}.
+  *
+  * @param h map of keys to scores; values must not be {@code null} because
+  *          they are unboxed
+  * @return a new list holding one pair per map entry, sorted
+  */
+ public static ArrayList<KeyValue> sortByValue(HashMap<Object, Float> h) {
+   ArrayList<KeyValue> pairs = new ArrayList<KeyValue>();
+   for (java.util.Map.Entry<Object, Float> entry : h.entrySet()) {
+     pairs.add(new KeyValue(entry.getKey(), entry.getValue()));
+   }
+
+   Collections.sort(pairs, new KeyValue.SortByValue());
+   return pairs;
+ }
+
+ /**
+  * Renders each pair as {@code key-value,} and concatenates the results, so
+  * a non-empty list always ends with a trailing comma.
+  *
+  * @param l the pairs to render
+  * @return the concatenated text; empty for an empty list
+  */
+ public static String convertKeyValueToString(ArrayList<KeyValue> l) {
+   StringBuilder rendered = new StringBuilder();
+   for (KeyValue pair : l) {
+     rendered.append(pair.key).append("-").append(pair.value).append(",");
+   }
+   return rendered.toString();
+ }
+
+ /**
+  * Concatenates the strings, appending {@code ", "} after every element
+  * (including the last one).
+  *
+  * @param l the strings to join
+  * @return the concatenated text; empty for an empty list
+  */
+ public static String convertStringArrayToString(ArrayList<String> l) {
+   StringBuilder joined = new StringBuilder();
+   for (String item : l) {
+     joined.append(item).append(", ");
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Concatenates the strings, appending a single space after every element
+  * (including the last one).
+  *
+  * @param l the strings to join
+  * @return the concatenated text; empty for an empty list
+  */
+ public static String convertStringArrayToPlainString(ArrayList<String> l) {
+   StringBuilder joined = new StringBuilder();
+   for (String item : l) {
+     joined.append(item).append(" ");
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Tells whether a URL lacks an explicit scheme and host, i.e. whether it
+  * is empty or does not start with {@code http://} or {@code https://}.
+  *
+  * @param siteUrl unused; kept for signature compatibility
+  * @param url the URL to inspect; may be {@code null}
+  * @return {@code true} when {@code url} is null, empty, or not absolute
+  */
+ public static boolean noDomainInUrl(String siteUrl, String url) {
+   if (url == null || url.length() == 0) {
+     return true;
+   }
+   // https URLs are absolute too; treating them as relative mangles them downstream
+   return !(url.startsWith("http://") || url.startsWith("https://"));
+ }
+
+ /**
+  * Turns a relative URL into an absolute one using the page it was found on.
+  * <ul>
+  * <li>An {@code http://} or {@code https://} URL is returned unchanged.</li>
+  * <li>A URL starting with {@code /} is appended to the scheme and host of
+  * {@code siteUrl}.</li>
+  * <li>Any other URL replaces the part of {@code siteUrl} after its last
+  * {@code /}.</li>
+  * <li>When {@code siteUrl} has no {@code /} after its host, the URL is
+  * simply appended to it, separated by a single {@code /}.</li>
+  * </ul>
+  *
+  * @param siteUrl URL of the referring page
+  * @param url the URL to resolve; may be {@code null}
+  * @return the absolute URL, or {@code null} when {@code url} is null or empty
+  */
+ public static String addDomainToUrl(String siteUrl, String url) {
+   if (url == null || url.length() == 0) {
+     return null; // should we return siteUrl here ??
+   }
+   if (url.startsWith("http://") || url.startsWith("https://")) {
+     return url; // already absolute
+   }
+
+   String scheme = "http://";
+   String domain = hostAfterScheme(siteUrl, scheme);
+   if (domain == null) {
+     scheme = "https://";
+     domain = hostAfterScheme(siteUrl, scheme);
+   }
+
+   if (domain == null) {
+     // no path separator after the host: append directly to the site URL
+     return siteUrl + (url.startsWith("/") ? "" : "/") + url;
+   }
+   if (url.startsWith("/")) {
+     return scheme + domain + url;
+   }
+   return siteUrl.substring(0, siteUrl.lastIndexOf('/')) + "/" + url;
+ }
+
+ /**
+  * Returns the text between the first occurrence of {@code scheme} and the
+  * next {@code /}, or {@code null} when either is missing.
+  */
+ private static String hostAfterScheme(String siteUrl, String scheme) {
+   if (siteUrl == null) {
+     return null;
+   }
+   int schemeAt = siteUrl.indexOf(scheme);
+   if (schemeAt < 0) {
+     return null;
+   }
+   int hostStart = schemeAt + scheme.length();
+   int hostEnd = siteUrl.indexOf('/', hostStart);
+   return hostEnd < 0 ? null : siteUrl.substring(hostStart, hostEnd);
+ }
+
+ /**
+  * Sums all values of the table.
+  * <p>
+  * The sum is accumulated in floating point and truncated once at the end.
+  * Accumulating into an {@code int} would truncate after every addition and
+  * make the result depend on iteration order.
+  *
+  * @param b1 table whose values are summed
+  * @return the total, truncated toward zero
+  */
+ public static int countValues(Hashtable<String, Float> b1) {
+   double total = 0;
+   for (Float value : b1.values()) {
+     total += value;
+   }
+
+   return (int) total;
+ }
+
+ /**
+  * Sums all values of the map.
+  *
+  * @param b1 map whose values are summed; values must not be {@code null}
+  * @return the total of all values, 0 for an empty map
+  */
+ public static int countValues(HashMap<String, Integer> b1) {
+   int total = 0;
+   for (Integer count : b1.values()) {
+     total += count;
+   }
+   return total;
+ }
+
+ /**
+  * Renders each entry as {@code key-value,} in the map's iteration order and
+  * concatenates the results.
+  *
+  * @param m the map to render
+  * @return the concatenated text; empty for an empty map
+  */
+ public static String convertHashMapToString(HashMap<String, Integer> m) {
+   StringBuilder rendered = new StringBuilder();
+   for (java.util.Map.Entry<String, Integer> entry : m.entrySet()) {
+     rendered.append(entry.getKey()).append("-").append(entry.getValue()).append(",");
+   }
+   return rendered.toString();
+ }
+
+ /**
+  * Tells whether the token contains no letters at all.
+  *
+  * @param token the text to inspect (must not be {@code null})
+  * @return {@code false} as soon as a letter is found; {@code true}
+  *         otherwise, including for the empty string
+  */
+ public static boolean isTokenAllDigitOrPunc(String token) {
+   for (char ch : token.toCharArray()) {
+     if (Character.isLetter(ch)) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Tells whether the token contains at least one digit.
+  *
+  * @param token the text to inspect (must not be {@code null})
+  * @return {@code true} as soon as a digit is found; {@code false} otherwise
+  */
+ public static boolean containsDigit(String token) {
+   for (char ch : token.toCharArray()) {
+     if (Character.isDigit(ch)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Replaces every occurrence of one character with a space.
+  *
+  * @param txt the text to process (must not be {@code null})
+  * @param uValue numeric value of the character to replace
+  * @return a copy of {@code txt} with each matching character replaced by
+  *         a single space
+  */
+ public static String CleanCharacter(String txt, int uValue) {
+   StringBuilder cleaned = new StringBuilder(txt.length());
+   for (char ch : txt.toCharArray()) {
+     if (ch == uValue) {
+       cleaned.append(" ");
+     } else {
+       cleaned.append(ch);
+     }
+   }
+   return cleaned.toString();
+ }
+
+ public static String removeHTMLTagsFromStr(String inputStr) {
+ String[] removeTags = StringUtils.substringsBetween(inputStr, "<", ">");
+
+ if (removeTags != null && removeTags.length > 0) {
+ for (String tag : removeTags) {
+ inputStr = StringUtils.remove(inputStr, "<" + tag + ">");
+ }
+ }
+
+ return inputStr;
+ }
+
+ /**
+  * Delegates to commons-lang {@code StringEscapeUtils.unescapeHtml}.
+  *
+  * @param text the text to pass to {@code unescapeHtml}
+  * @return whatever {@code unescapeHtml} returns for {@code text}
+  */
+ public static String unescapeHTML(String text) {
+ return org.apache.commons.lang.StringEscapeUtils.unescapeHtml(text);
+ }
+
+ public static String stripHTML(String text) {
+ return text.replaceAll("\\<.*?>", "");
+ }
+
+ /** Matches a whole script element, ignoring case and spanning line breaks. */
+ private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
+     "\\<SCRIPT.*?</SCRIPT>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
+
+ /**
+  * Removes every script element, including its content.
+  *
+  * @param text the markup to clean (must not be {@code null})
+  * @return the text without script elements
+  */
+ public static String stripScriptTags(String text) {
+   return SCRIPT_ELEMENT.matcher(text).replaceAll("");
+ }
+
+ /** Matches a whole noscript element, ignoring case and spanning line breaks. */
+ private static final Pattern NOSCRIPT_ELEMENT = Pattern.compile(
+     "\\<NOSCRIPT.*?</NOSCRIPT>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
+
+ /**
+  * Removes every noscript element, including its content.
+  *
+  * @param text the markup to clean (must not be {@code null})
+  * @return the text without noscript elements
+  */
+ public static String stripNoScriptTags(String text) {
+   return NOSCRIPT_ELEMENT.matcher(text).replaceAll("");
+ }
+
+ /**
+  * Strips all tags from the text except a set of allowed ones.
+  * <p>
+  * Each allowed opening and closing tag is temporarily rewritten with the
+  * given escape markers, all remaining tags are removed, and the markers
+  * are turned back into angle brackets. Tag names and markers are matched
+  * literally, so regex metacharacters in them are safe.
+  *
+  * @param text the markup to clean; null or empty is returned unchanged
+  * @param allowedHtmlTags names of the tags to keep (attribute-free form
+  *          only, e.g. {@code b} keeps {@code <b>} and {@code </b>}); may
+  *          be {@code null}
+  * @param escGtCh temporary marker standing in for {@code >}
+  * @param escLtCh temporary marker standing in for {@code <}
+  * @return the text with only the allowed tags remaining
+  */
+ public static String stripHTMLMultiLine(String text,
+     HashSet<String> allowedHtmlTags, String escGtCh, String escLtCh) {
+   if (StringUtils.isEmpty(text)) {
+     return text;
+   }
+
+   boolean hadAllowedHtmlTags = false;
+   if (allowedHtmlTags != null) {
+     for (String htmlTag : allowedHtmlTags) {
+       // String.replace is literal; replaceAll would read these as regex
+       String tmp = text
+           .replace("<" + htmlTag + ">", escLtCh + htmlTag + escGtCh)
+           .replace("</" + htmlTag + ">", escLtCh + "/" + htmlTag + escGtCh);
+       if (!tmp.equals(text)) {
+         text = tmp;
+         hadAllowedHtmlTags = true;
+       }
+     }
+   }
+
+   text = stripHTMLMultiLine(text);
+
+   if (hadAllowedHtmlTags) {
+     // restore the protected tags
+     text = text.replace(escLtCh, "<").replace(escGtCh, ">");
+   }
+   return text;
+ }
+
+ /**
+  * Removes every {@code <...>} tag, including tags that span line breaks.
+  *
+  * @param text the markup to strip (must not be {@code null})
+  * @return the text without tags
+  */
+ public static String stripHTMLMultiLine(String text) {
+   Pattern anyTag = Pattern.compile("\\<.*?>", Pattern.DOTALL);
+   return anyTag.matcher(text).replaceAll("");
+ }
+
+ /**
+  * Removes every {@code <!-- ... -->} comment, including comments that span
+  * line breaks.
+  *
+  * @param text the markup to clean (must not be {@code null})
+  * @return the text without HTML comments
+  */
+ public static String stripHTMLCommentsMultiLine(String text) {
+   Pattern comment = Pattern.compile("\\<!--.*?-->", Pattern.DOTALL);
+   return comment.matcher(text).replaceAll("");
+ }
+
+ public static boolean isFlagSet(Integer flags, Integer flagToCheck) {
+ if (flags != null && flagToCheck != null) {
+ return ((flags & flagToCheck) == flagToCheck);
+ }
+ return false;
+ }
+
+ /**
+  * Sets or resets a flag depending on {@code shouldSet}.
+  *
+  * @param flags the current bit set
+  * @param flagToCheck the flag to set or reset
+  * @param shouldSet {@code true} to delegate to {@code setFlag},
+  *          {@code false} to delegate to {@code resetFlag}
+  * @return the result of the delegate call
+  */
+ public static Integer updateFlag(Integer flags, Integer flagToCheck,
+     boolean shouldSet) {
+   return shouldSet ? setFlag(flags, flagToCheck) : resetFlag(flags, flagToCheck);
+ }
+
+ public static Integer setFlag(Integer flags, Integer flagToCheck) {
+ if (flags == null) {
+ flags = new Integer(0);
+ }
+ if (!isFlagSet(flags, flagToCheck)) {
+ flags = flags + flagToCheck;
+ ;
+ }
+ return flags;
+ }
+
+ public static Integer resetFlag(Integer flags, Integer flagToCheck) {
+ if (flags == null) {
+ // nothing to reset
+ flags = new Integer(0);
+ return flags;
+ }
+
+ if (isFlagSet(flags, flagToCheck)) {
+ flags = flags - flagToCheck;
+ }
+ return flags;
+ }
+
+ public static String truncateOnSpace(String text, Integer length) {
+ String retVal = "";
+ if (text.length() <= length) {
+ retVal = text;
+ } else {
+ StringBuffer b = new StringBuffer();
+ for (int i = 0; i < text.length(); i++) {
+ if (b.length() >= length && Character.isWhitespace(text.charAt(i))) { // iterate
+ // until
+ // we
+ // hit
+ // whitespace
+ b.append("...");
+ break;
+ }
+ b.append(text.charAt(i));
+ }
+ retVal = b.toString();
+ }
+
+ return retVal.trim();
+ }
+
+ public static String sanitizeString(String text) {
+ text = Utils.stripHTMLCommentsMultiLine(text);
+ text = Utils.stripHTMLMultiLine(text);
+ text = Utils.unescapeHTML(text);
+ text = StringUtils.trimToEmpty(text);
+ text = text.replaceAll("\\s+", " ");
+ return text;
+ }
+
+ /**
+  * Keeps only the characters accepted by
+  * {@code StringUtils.isAlphanumericSpace}, collapses each whitespace run
+  * into a single space and passes the result through {@code convertToASCII}.
+  *
+  * @param text the text to clean (must not be {@code null})
+  * @return the cleaned text
+  */
+ public static String makeStringUrlSafe(String text) {
+   StringBuilder safe = new StringBuilder();
+   for (char ch : text.toCharArray()) {
+     if (StringUtils.isAlphanumericSpace(String.valueOf(ch))) {
+       safe.append(ch);
+     }
+   }
+   String collapsed = safe.toString().replaceAll("\\s+", " ");
+   return Utils.convertToASCII(collapsed);
+ }
+
+ /**
+  * Extracts the numeric id that follows {@code news/} in a URL.
+  *
+  * @param url the URL to scan (must not be {@code null})
+  * @return the digits after the last {@code news/} occurrence that is
+  *         followed by digits, or {@code null} when there is none
+  */
+ public static String getEventIdFromNewsUrl(String url) {
+   Matcher idMatcher = Pattern.compile("news/([0-9]+)").matcher(url);
+   String lastId = null;
+   while (idMatcher.find()) {
+     lastId = idMatcher.group(1); // later matches override earlier ones
+   }
+   return lastId;
+ }
+
+ /**
+  * Joins the string forms of the list elements with commas.
+  *
+  * @param ids the elements to join; may be {@code null}
+  * @return the comma-separated text, or {@code null} when the list is
+  *         {@code null} or empty
+  */
+ public static String buildCommaSeparatedIds(List ids) {
+   if (ids == null || ids.isEmpty()) {
+     return null;
+   }
+
+   StringBuilder joined = new StringBuilder();
+   String separator = "";
+   for (Object id : ids) {
+     joined.append(separator).append(id);
+     separator = ",";
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Computes a score from a list of existing scores and a 1-based position.
+  * <ul>
+  * <li>position 1: the first score plus 50000</li>
+  * <li>position equal to the list size: the last score minus 1</li>
+  * <li>any other position: the mean of the scores at positions
+  * {@code desiredRanking - 1} and {@code desiredRanking}</li>
+  * </ul>
+  *
+  * @param scores the existing scores; presumably ordered best first
+  *          (TODO confirm against callers)
+  * @param desiredRanking the 1-based target position
+  * @return the computed score
+  */
+ public static float computeScoreForRanking(List<Float> scores,
+     int desiredRanking) {
+   if (desiredRanking == 1) {
+     return scores.get(0) + 50000;
+   }
+
+   int count = scores.size();
+   if (desiredRanking == count) {
+     return scores.get(count - 1) - 1;
+   }
+
+   float above = scores.get(desiredRanking - 2);
+   float below = scores.get(desiredRanking - 1);
+   return (above + below) / 2;
+ }
+
+ public static String fullStripHTML(String text) {
+ text = Utils.stripScriptTags(text);
+ text = Utils.stripNoScriptTags(text);
+ text = Utils.stripStyleTags(text);
+ return text.replaceAll("\\<.*?>", "");
+ }
+
+ /** Matches a whole style element, ignoring case and spanning line breaks. */
+ private static final Pattern STYLE_ELEMENT = Pattern.compile(
+     "\\<STYLE.*?</STYLE>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
+
+ /**
+  * Removes every style element, including its content.
+  *
+  * @param text the markup to clean (must not be {@code null})
+  * @return the text without style elements
+  */
+ public static String stripStyleTags(String text) {
+   return STYLE_ELEMENT.matcher(text).replaceAll("");
+ }
+
+ /**
+  * Tells whether the word consists of ASCII characters only.
+  *
+  * @param word the text to inspect (must not be {@code null})
+  * @return {@code false} as soon as a character above U+007F is found;
+  *         {@code true} otherwise, including for the empty string
+  */
+ public static boolean isLatinWord(String word) {
+   for (int i = 0; i < word.length(); i++) {
+     // ASCII ends at 127; 128 (U+0080) is already outside the range
+     if (word.charAt(i) > 127) {
+       return false;
+     }
+   }
+
+   return true;
+ }
+
+ static public void main(String[] args) {
+ System.out.println(isLatinWord("Performing Arts Center (SPAC)"));
+ System.out.println(isLatinWord("âJazz Ageâ"));
+
+ System.out
+ .println(isLatinWord("ãÆâ¡Ã£Æ¼ãÆâãÆÆãÆâ°Ã£Æȋ"));
+ System.out.println(isLatinWord("é ñçøÃ¥ó"));
+ System.out.println(isLatinWord("ùìîä à øöé"));
+ System.out
+ .println(isLatinWord("éâ¢Â³Ã¦Â¸Â¯Ã§âŸ, éâ¢Ë港çâŸ"));
+
+ System.out
+ .println(convertToASCII("Irvine Bay Hotel & Golf Club on Sunday, May 01 duringÃÂ Jazz on the Beach,ÃÂ Tobago Jazz Experience alongsideÃÂ The Jazz Singer"));
+ System.out
+ .println(convertToASCII("This yearâs event, held again at the wonderful Saratoga Performing Arts Center (SPAC)"));
+ System.out
+ .println(convertToASCII("and the great saxophone playing of Sam Rogers Rush Hour Blues 2010 Â . "));
+ System.out
+ .println(convertToASCII(" Ron Carter is among the most original, prolific "));
+ System.out
+ .println(convertToASCII("Â . Â Â Â Â Â Â Â Â Â Â Â Ron Carter is among the most original, prolific. "));
+ // TODO deal with
+ // www.wmot.org/program-guide/program-listings/28th_annual_playboy_jazz_festiva_2006.htm
+ System.out
+ .println(convertToASCII("By the mid 1920âs, during the period referred to as the âJazz Ageâ, jazz music was heard in most major cities from the East Coast"));
+
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Utility for producing a copy of a {@link java.util.Map} whose entries are
+ * ordered by value. Null values and duplicate values are both supported.
+ */
+public class ValueSortMap {
+
+  private ValueSortMap() {
+  }
+
+  /**
+   * Returns a new LinkedHashMap whose entries are ordered by value using the
+   * given comparator. Entries with a null value are placed last. Entries with
+   * equal values are adjacent and keep the relative order in which the input
+   * map iterates them. The input map is left intact.
+   *
+   * @param inMap
+   *          map to be sorted
+   * @param comparator
+   *          ordering applied to the non-null values; {@code null} selects
+   *          the natural ordering of the values
+   * @return a new LinkedHashMap ordered by value
+   */
+  public static <K, V> LinkedHashMap<K, V> sortMapByValue(Map<K, V> inMap,
+      Comparator<V> comparator) {
+    return sortMapByValue(inMap, comparator, null);
+  }
+
+  /**
+   * Returns a new LinkedHashMap whose entries are ordered by the natural
+   * ordering of their values. Entries with a null value are placed last when
+   * {@code ascendingOrder} is true and first when it is false. Entries with
+   * equal values are adjacent and keep the relative order in which the input
+   * map iterates them. The input map is left intact.
+   *
+   * @param inMap
+   *          map to be sorted
+   * @param ascendingOrder
+   *          {@code true} for ascending values, {@code false} for descending
+   * @return a new LinkedHashMap ordered by value
+   */
+  public static <K, V> LinkedHashMap<K, V> sortMapByValue(Map<K, V> inMap,
+      boolean ascendingOrder) {
+    return sortMapByValue(inMap, null, ascendingOrder);
+  }
+
+  /**
+   * Returns a new LinkedHashMap whose entries are ordered by ascending
+   * natural ordering of their values. Entries with a null value are placed
+   * last. Entries with equal values are adjacent and keep the relative order
+   * in which the input map iterates them. The input map is left intact.
+   *
+   * @param inMap
+   *          map to be sorted
+   * @return a new LinkedHashMap ordered by value
+   */
+  public static <K, V> LinkedHashMap<K, V> sortMapByValue(Map<K, V> inMap) {
+    return sortMapByValue(inMap, null, null);
+  }
+
+  /**
+   * Shared implementation of the public overloads.
+   * <p>
+   * When {@code ascendingOrder} is null the values are ordered by
+   * {@code comparator} (natural ordering if that is null too) and null
+   * values go last. Otherwise the comparator is ignored, the natural
+   * ordering is used in the requested direction, and null values go last for
+   * ascending and first for descending order.
+   * <p>
+   * The keys are sorted by their mapped values with a stable sort, so
+   * duplicate values need no special handling.
+   *
+   * @param inMap
+   *          map to be sorted
+   * @param comparator
+   *          ordering of the values, used only when {@code ascendingOrder}
+   *          is null
+   * @param ascendingOrder
+   *          direction of the natural ordering, or null to use
+   *          {@code comparator}
+   * @return a new LinkedHashMap ordered by value
+   * @throws ClassCastException
+   *           if the natural ordering is needed and the values are not
+   *           mutually comparable
+   */
+  private static <K, V> LinkedHashMap<K, V> sortMapByValue(
+      final Map<K, V> inMap, final Comparator<V> comparator,
+      Boolean ascendingOrder) {
+    // Split the keys so that null values never reach the comparator
+    ArrayList<K> keysWithValue = new ArrayList<K>(inMap.size());
+    ArrayList<K> keysWithNull = new ArrayList<K>();
+    for (Map.Entry<K, V> entry : inMap.entrySet()) {
+      if (entry.getValue() == null) {
+        keysWithNull.add(entry.getKey());
+      } else {
+        keysWithValue.add(entry.getKey());
+      }
+    }
+
+    final boolean useComparator = ascendingOrder == null && comparator != null;
+    final boolean descending = ascendingOrder != null
+        && !ascendingOrder.booleanValue();
+
+    Collections.sort(keysWithValue, new Comparator<K>() {
+      @SuppressWarnings("unchecked")
+      public int compare(K firstKey, K secondKey) {
+        V first = inMap.get(firstKey);
+        V second = inMap.get(secondKey);
+        if (useComparator) {
+          return comparator.compare(first, second);
+        }
+        // swap the operands instead of negating, which could overflow
+        return descending ? ((Comparable<Object>) second).compareTo(first)
+            : ((Comparable<Object>) first).compareTo(second);
+      }
+    });
+
+    LinkedHashMap<K, V> sortedMap = new LinkedHashMap<K, V>(inMap.size());
+    if (descending) {
+      // descending order puts the null values first
+      for (K key : keysWithNull) {
+        sortedMap.put(key, null);
+      }
+    }
+    for (K key : keysWithValue) {
+      sortedMap.put(key, inMap.get(key));
+    }
+    if (!descending) {
+      // ascending or comparator order puts the null values last
+      for (K key : keysWithNull) {
+        sortedMap.put(key, null);
+      }
+    }
+    return sortedMap;
+  }
+
+  /** Small demonstration that sorts a sample map in descending order. */
+  public static void main(String[] args) {
+    HashMap<String, String> hmValue = new HashMap<String, String>();
+
+    hmValue.put("ZNU", "Zuki Ndulo");
+    hmValue.put("YSH", "Yogesh Sharma");
+    hmValue.put("HHU", "Hiram Hugesh");
+    hmValue.put("MLE", "Marry Lee");
+    hmValue.put("FST", "Faran Stott");
+    hmValue.put("HET", null);
+    hmValue.put("SID", null);
+    hmValue.put("AFR", "Alice Fryer");
+    hmValue.put("KIQ", null);
+    hmValue.put("JBE", "Jim Bell");
+    hmValue.put("MAU", null);
+    hmValue.put("KAE", null);
+    hmValue.put("JBA", "Jim Bader");
+    hmValue.put("RAN", "Robert Anthony");
+    hmValue.put("CLE", "Carole Lee");
+    hmValue.put("JMD", "Jim Bader");
+    hmValue.put("ALI", null);
+    hmValue.put("GMI", "Gracia Millan");
+    hmValue.put("MAL", "Marry Lee");
+    hmValue.put("CLE", "Carole Lee");
+    hmValue.put("APE", "Annin Peck");
+    hmValue.put("HUA", null);
+
+    System.out.println("============ Before Sorting ===============");
+    System.out.println(hmValue);
+
+    // Sort the hmValue map by its values, largest first
+    Map<String, String> sortedMap = sortMapByValue(hmValue, false);
+
+    System.out.println("============ After Sorting ===============");
+    System.out.println(sortedMap);
+  }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/ValueSortMap.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class GeneralizationListReducer {
+ public List<ParseTreeChunk> applyFilteringBySubsumption_OLD(
+ List<ParseTreeChunk> result) {
+ List<ParseTreeChunk> resultDupl = new ArrayList<ParseTreeChunk>();
+ resultDupl.addAll(new HashSet<ParseTreeChunk>(result));
+ result = resultDupl;
+ if (result.size() < 2)
+ return result; // nothing to reduce
+ List<ParseTreeChunk> resultReduced = new ArrayList<ParseTreeChunk>();
+ int size = result.size();
+ for (int i = 0; i < size; i++) {
+ Boolean bSubChunk = false;
+ for (int j = 0; j < size; j++) {
+ if (i == j) {
+ continue;
+ }
+ if (result.get(j).isASubChunk(result.get(i))) {
+ bSubChunk = true;
+ }
+ }
+ if (!bSubChunk)
+ resultReduced.add(result.get(i));
+ }
+
+ if (resultReduced.size() < 1) {
+ System.err.println("Wrong subsumption reduction");
+ }
+
+ if (resultReduced.size() > 1) {
+ int z = 0;
+ z++;
+ }
+ return resultReduced;
+
+ }
+
+ public List<ParseTreeChunk> applyFilteringBySubsumptionOLD(
+ List<ParseTreeChunk> result) {
+ List<ParseTreeChunk> resultDupl = null;
+ if (result.size() < 2)
+ return result; // nothing to reduce
+ List<ParseTreeChunk> resultReduced = new ArrayList<ParseTreeChunk>();
+ int size = result.size();
+ resultDupl = new ArrayList<ParseTreeChunk>(result);
+ for (int s = 0; s < size; s++) {
+ for (int i = 0; i < resultDupl.size(); i++) {
+ Boolean bStop = false;
+ for (int j = 0; j < resultDupl.size(); j++) {
+ if (i == j) {
+ continue;
+ }
+ if (result.get(j).isASubChunk(result.get(i))
+ && !result.get(i).isASubChunk(result.get(j))) {
+ resultDupl.remove(i);
+ bStop = true;
+ break;
+ }
+ }
+ if (bStop) {
+ break;
+ }
+ }
+ }
+ resultReduced = resultDupl;
+ if (resultReduced.size() < 1) {
+ System.err.println("Wrong subsumption reduction");
+ }
+
+ if (resultReduced.size() > 1) {
+ int z = 0;
+ z++;
+ }
+ return resultReduced;
+
+ }
+
+ public List<ParseTreeChunk> applyFilteringBySubsumption(
+ List<ParseTreeChunk> result) {
+ List<Integer> resultDuplIndex = new ArrayList<Integer>();
+ List<ParseTreeChunk> resultReduced = new ArrayList<ParseTreeChunk>();
+
+ if (result.size() < 2) {
+ return result; // nothing to reduce
+ }
+ // remove empty
+ for (ParseTreeChunk ch : result) {
+ if (ch.getLemmas().size() > 0) {
+ resultReduced.add(ch);
+ }
+ }
+ result = resultReduced;
+
+ for (int i = 0; i < result.size(); i++) {
+ for (int j = i + 1; j < result.size(); j++) {
+ if (i == j) {
+ continue;
+ }
+ if (result.get(j).isASubChunk(result.get(i))) {
+ resultDuplIndex.add(i);
+ } else if (result.get(i).isASubChunk(result.get(j))) {
+ resultDuplIndex.add(j);
+ }
+ }
+
+ }
+ resultReduced = new ArrayList<ParseTreeChunk>();
+ for (int i = 0; i < result.size(); i++) {
+ if (!resultDuplIndex.contains(i)) {
+ resultReduced.add(result.get(i));
+ }
+ }
+
+ if (resultReduced.size() < 1) {
+ System.err.println("Wrong subsumption reduction");
+ resultReduced = result;
+ }
+
+ return resultReduced;
+
+ }
+
+ // testing sub-chunk functionality and
+ // elimination more general according to subsumption relation
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class LemmaFormManager {
+
+ public String matchLemmas(PorterStemmer ps, String lemma1, String lemma2,
+ String POS) {
+ if (POS == null) {
+ return null;
+ }
+ lemma1 = lemma1.toLowerCase();
+ lemma2 = lemma2.toLowerCase();
+ // numbers have to be exact
+ if (POS.equals("CD")) {
+ if (lemma1.equals(lemma2)) {
+ return lemma1;
+ } else {
+ return null;
+ }
+ }
+
+ // 'must' occurrence of word - if not equal then 'fail'
+ if (lemma1.endsWith("_xyz") || lemma2.endsWith("_xyz")) {
+ lemma1 = lemma1.replace("_xyz", "");
+ lemma2 = lemma2.replace("_xyz", "");
+ if (lemma1.equals(lemma2)) {
+ return lemma1;
+ } else { // trying to check if nouns and different plural/single form
+ if (POS.equals("NN") || POS.equals("NP")) {
+ if ((lemma1.equals(lemma2 + "s") || lemma2.equals(lemma1 + "s"))
+ || lemma1.endsWith(lemma2) || lemma2.endsWith(lemma1)
+ || lemma1.startsWith(lemma2) || lemma2.startsWith(lemma1))
+ return lemma1;
+ }
+ return "fail";
+ }
+ }
+
+ if (lemma1.equals(lemma2)) {
+ return lemma1;
+ }
+
+ if (POS.equals("NN") || POS.equals("NP")) {
+ if ((lemma1.equals(lemma2 + "s") || lemma2.equals(lemma1 + "s"))
+ || lemma1.endsWith(lemma2) || lemma2.endsWith(lemma1)
+ || lemma1.startsWith(lemma2) || lemma2.startsWith(lemma1)) {
+ return lemma1;
+ }
+ }
+ try {
+ if (ps != null) {
+ if (ps.stem(lemma1).equalsIgnoreCase(ps.stem(lemma2))) {
+ return lemma1;
+ }
+ }
+ } catch (Exception e) {
+ System.err.println("Problem processing " + lemma1 + " " + lemma2);
+ return null;
+ }
+
+ return null;
+ }
+
+ public boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {
+ if (sim == null) {
+ return false;
+ }
+
+ if (lemmaMatch != null && !lemmaMatch.equals("fail")) {
+ return false;
+ }
+ // even if lemmaMatch==null
+ return true;
+ // if (sim!=null && (lemmaMatch!=null && !lemmaMatch.equals("fail"))){
+
+ }
+
+ // all lemmas ending with # in ch1 and/or ch2 SHOULD occur in chunkToAdd
+ public boolean mustOccurVerifier(ParseTreeChunk ch1, ParseTreeChunk ch2,
+ ParseTreeChunk chunkToAdd) {
+ List<String> lemmasWithMustOccur = ch1.getLemmas();
+ lemmasWithMustOccur.addAll(ch2.getLemmas());
+ List<String> res = chunkToAdd.getLemmas();
+ for (String lem : lemmasWithMustOccur) {
+ if (lem.endsWith("_xyz")) {
+ String pureLem = lem.replace("_xyz", "");
+ if (!res.contains(pureLem)) { // should occur but does not
+ return false;
+ }// failed the test
+ }
+ }
+ return true;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+/**
+ *
+ * @author Albert-Jan de Vries
+ *
+ */
+public class LemmaPair {
+ private String POS;
+
+ private String lemma;
+
+ private int startPos;
+
+ int endPos;
+
+ public LemmaPair(String POS, String lemma, int startPos) {
+
+ this.POS = POS;
+ this.lemma = lemma;
+ this.startPos = startPos;
+ }
+
+ public LemmaPair(String POS, String lemma) {
+ this.POS = POS;
+ this.lemma = lemma;
+ }
+
+ public String getPOS() {
+ return POS;
+ }
+
+ public void setPOS(String pOS) {
+ POS = pOS;
+ }
+
+ public String getLemma() {
+ return lemma;
+ }
+
+ public void setLemma(String lemma) {
+ this.lemma = lemma;
+ }
+
+ public int getStartPos() {
+ return startPos;
+ }
+
+ public void setStartPos(int startPos) {
+ this.startPos = startPos;
+ }
+
+ public int getEndPos() {
+ return endPos;
+ }
+
+ public void setEndPos(int endPos) {
+ this.endPos = endPos;
+ }
+
+ public String toString() {
+ return this.getStartPos() + "(" + POS + "-" + lemma + ")";
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaPair.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class POSManager {
+ public POSManager() {
+
+ }
+
+ public String similarPOS(String pos1, String pos2) {
+ if ((pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN")
+ && pos1.equals("NP"))) {
+ return "NN";
+ }
+ if ((pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("VBG")
+ && pos1.equals("NN"))) {
+ return "NN";
+ }
+
+ if ((pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN")
+ && pos1.equals("ADJP"))) {
+ return "NN";
+ }
+ if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO")
+ && pos2.equals("IN"))) {
+ return "IN";
+ }
+ // VBx vs VBx = VB (does not matter which form for verb)
+ if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
+ return "VB";
+ }
+
+ // ABx vs ABy always gives AB
+ if (pos1.equalsIgnoreCase(pos2)) {
+ return pos1;
+ }
+ if (pos1.length() > 2) {
+ pos1 = pos1.substring(0, 2);
+ }
+
+ if (pos2.length() > 2) {
+ pos2 = pos2.substring(0, 2);
+ }
+ if (pos1.equalsIgnoreCase(pos2)) {
+ return pos1 + "*";
+ }
+ return null;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/POSManager.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import com.zvents.ce.common.util.ValueSortMap;
+
+ /**
+  * Matches the sentences of a paragraph against sentences representing a
+  * class and uses the resulting scores to decide which classes the
+  * paragraph belongs to.
+  */
+ @Component
+ public class ParagraphClassifier {
+   /** Start value of a "best score so far"; any real score must exceed it. */
+   private static final double NO_SCORE = -1.0;
+
+   /** A best match is printed to System.out only above this score. */
+   private static final double REPORT_THRESHOLD = 1.6;
+
+   /** A paragraph is assigned to a class only above this score. */
+   private static final double CLASS_THRESHOLD = 1.79;
+
+   @Autowired
+   private SyntMatcher processor;
+
+   @Autowired
+   private ParseTreeChunkListScorer parseTreeChunkListScorer;
+
+   @Autowired
+   private ParseTreeChunk parseTreeChunk;
+
+   @Autowired
+   private ParseTreeChunkFactory parseTreeChunkFactory;
+
+   public ParagraphClassifier() {
+   }
+
+   /**
+    * Gets two paragraphs, one to be classified and one holding the class
+    * representatives, and finds for every sentence of the first the best
+    * scoring sentence of the second.
+    *
+    * @param para1
+    *          input paragraph of sentences
+    * @param classStr
+    *          training dataset of sentences - class representatives
+    * @return map from the string form of a sentence's phrase list
+    *         ({@code List.toString()}) to the phrase list of its best
+    *         matching class sentence; sentences for which nothing scored
+    *         above the start value have no entry
+    */
+   public Map<String, List<LemmaPair>> findMappingBetweenSentencesOfAParagraphAndAClassReps(
+       String para1, String classStr) {
+     ParseTreeChunk matcher = parseTreeChunkFactory.getParseTreeChunk();
+
+     // splitting into sentences, then parsing each sentence
+     List<List<LemmaPair>> parseSentList = parseSentences(processor
+         .getSentenceDetectorME().sentDetect(para1));
+     List<List<LemmaPair>> parseClassList = parseSentences(processor
+         .getSentenceDetectorME().sentDetect(classStr));
+
+     Map<String, List<LemmaPair>> sentence_bestClassRep = new HashMap<String, List<LemmaPair>>();
+     for (List<LemmaPair> chunksSent : parseSentList) {
+       double maxScore = NO_SCORE;
+       for (List<LemmaPair> chunksClass : parseClassList) {
+         List<List<ParseTreeChunk>> matchResult = matcher
+             .matchTwoSentencesGivenPairLists(chunksSent, chunksClass);
+         double score = parseTreeChunkListScorer
+             .getParseTreeChunkListScore(matchResult);
+         if (score > maxScore) {
+           maxScore = score;
+           sentence_bestClassRep.put(chunksSent.toString(), chunksClass);
+         }
+       }
+     }
+
+     return sentence_bestClassRep;
+   }
+
+   /**
+    * Scores every sentence of the paragraph against every sentence of the
+    * training set and returns the highest score found. Best matches scoring
+    * above 1.6 are also printed to {@code System.out}.
+    *
+    * @param para1
+    *          paragraph to classify
+    * @param trainingSet
+    *          sentences representing the class
+    * @param className
+    *          class name, used only in the printed report
+    * @return the highest sentence-to-sentence score, or 0.0 when nothing
+    *         could be scored (for example when either text yields no
+    *         sentences); previously that case ended in an
+    *         IndexOutOfBoundsException
+    */
+   public Double findShortestDistanceToClass(String para1, String trainingSet,
+       String className) {
+     // Obtained from the factory, like in the mapping method above: an
+     // instance created with "new" would not have its @Autowired
+     // collaborators injected.
+     ParseTreeChunk matcher = parseTreeChunkFactory.getParseTreeChunk();
+
+     // splitting into sentences
+     String[] sents = processor.getSentenceDetectorME().sentDetect(para1);
+     String[] classSents = processor.getSentenceDetectorME().sentDetect(
+         trainingSet);
+
+     List<List<LemmaPair>> parseSentList = parseSentences(sents);
+     List<List<LemmaPair>> parseClassList = parseSentences(classSents);
+
+     double bestOverall = NO_SCORE;
+     for (int sN = 0; sN < parseSentList.size(); sN++) {
+       List<LemmaPair> chunksSent = parseSentList.get(sN);
+       double maxScore = NO_SCORE;
+       String bestClass = "";
+       List<List<ParseTreeChunk>> bestMatchResult = null;
+       for (int cN = 0; cN < parseClassList.size(); cN++) {
+         List<List<ParseTreeChunk>> matchResult = matcher
+             .matchTwoSentencesGivenPairLists(chunksSent,
+                 parseClassList.get(cN));
+         double score = parseTreeChunkListScorer
+             .getParseTreeChunkListScore(matchResult);
+         if (score > maxScore) {
+           maxScore = score;
+           bestClass = classSents[cN];
+           bestMatchResult = matchResult;
+         }
+       }
+       if (maxScore > REPORT_THRESHOLD) {
+         System.out.println("Best match:" + sents[sN] + " <x> " + bestClass
+             + " of class= " + className.toUpperCase() + " score =" + maxScore
+             + " " + parseTreeChunk.listToString(bestMatchResult));
+         System.out.println("");
+       }
+       if (maxScore > bestOverall) {
+         bestOverall = maxScore;
+       }
+     }
+
+     // The former code sorted all per-sentence scores, summed the top three
+     // and then returned only the highest one; the unused average is gone.
+     return bestOverall > NO_SCORE ? bestOverall : 0.0;
+   }
+
+   /**
+    * Finds all classes of {@code EpistemicStatesTrainingSet} whose score for
+    * the paragraph exceeds 1.79. The accepted classes and their scores are
+    * also printed, sorted by score, to {@code System.out}.
+    *
+    * @param para
+    *          paragraph to classify
+    * @return names of the accepted classes, in the iteration order of the
+    *         training-set keys (not sorted by score)
+    */
+   public List<String> findClassesForPara(String para) {
+     List<String> resultantClasses = new ArrayList<String>();
+     Map<String, Double> class_score = new HashMap<String, Double>();
+     List<String> classNames = new ArrayList<String>(
+         EpistemicStatesTrainingSet.class_setOfSentences.keySet());
+     for (String clName : classNames) {
+       String trainingSet = EpistemicStatesTrainingSet.class_setOfSentences
+           .get(clName);
+       if (trainingSet == null) {
+         System.err.println("Wrong EpistemicStatesTrainingSet for class = "
+             + clName);
+         continue; // nothing to match against; skip instead of passing null on
+       }
+       Double scoreAvg = findShortestDistanceToClass(para, trainingSet, clName);
+       if (scoreAvg > CLASS_THRESHOLD) {
+         resultantClasses.add(clName);
+         class_score.put(clName, scoreAvg);
+       }
+     }
+     // The boolean flag is defined by ValueSortMap (presumably the sort
+     // direction - TODO confirm).
+     Map sortedMap = ValueSortMap.sortMapByValue(class_score, false);
+
+     System.out.println(sortedMap);
+     return resultantClasses;
+   }
+
+   /**
+    * Parses each sentence with the shared processor and collects the phrase
+    * pairs of its first parse.
+    */
+   private List<List<LemmaPair>> parseSentences(String[] sentences) {
+     List<List<LemmaPair>> parsed = new ArrayList<List<LemmaPair>>();
+     for (String s : sentences) {
+       parsed.add(processor.getAllPhrasesTWPairs((processor.parseLine(s,
+           processor.getParser(), 1)[0])));
+     }
+     return parsed;
+   }
+
+ }
+
+/*
+ *
+ * I removed abberation by digital zoom increase by performance limitation of
+ * filters of my camera. =[[ [JJ-digital NN-zoom NN-* ], [PRP$-my NN-camera ]],
+ * [ [VBD-* JJ-digital NN-zoom NN-* IN-by NP-filters IN-* PRP$-my NN-camera ],
+ * [NP-filters TO-* PRP$-my NN-camera ]], [], [ [IN-by NP-filters TO-* PRP$-my
+ * NN-camera ], [TO-* PRP$-my NN-camera ]], [], [], [ [NP-I VBD-* JJ-digital
+ * NN-zoom NN-* IN-by NP-filters IN-* PRP$-my NN-camera .-. ], [NP-filters TO-*
+ * PRP$-my NN-camera ]]],
+ *
+ *
+ * Animals run to the tiger zoo. =[[], [ [VB-run IN-* NP-tigers NP-zoo ]], [], [
+ * [IN-* NP-zoo ], [IN-* NP-tigers ]], [], [], [ [NP-* VBP-* TO-to NP-tigers
+ * NP-zoo ], [VB-run IN-* NP-tigers NP-zoo ]]],
+ *
+ * In this digital camera you can turn your ldc screen away from the scene.= [[
+ * [NN-* ], [DT-* NN-* ], [DT-* JJ-* NN-* ], [PRP$-your NN-ldc NN-screen ],
+ * [DT-the NN-scene ]], [ [MD-can VB-turn PRP$-your NN-ldc NN-screen RB-away
+ * IN-from DT-the NN-scene ]], [], [ [IN-* DT-* NN-* ], [IN-from DT-the NN-scene
+ * ]], [], [], [ [NP-* MD-can VB-turn PRP$-your NN-ldc NN-screen RB-away IN-from
+ * DT-the NN-scene ], [IN-* DT-* JJ-* NN-* ], [NP-* VBZ-* NN-* ]]],
+ *
+ * I can easily connect this digital camera to my desktop computer to copy
+ * images. =[[ [DT-this JJ-digital NN-camera ], [PRP$-my NN-* NN-computer ]], [
+ * [VBD-* DT-this JJ-digital NN-camera TO-to PRP$-my NN-* NN-computer NN-* ]],
+ * [], [ [TO-to PRP$-my NN-* NN-computer ]], [], [], [ [NP-I VBD-* DT-this
+ * JJ-digital NN-camera TO-to PRP$-my NN-* NN-computer NN-* .-. ]]],
+ *
+ * I want this nice radio thing. =[[ [DT-this JJ-nice NN-thing ]], [ [VB-want
+ * DT-this JJ-nice NN-thing ]], [], [], [], [], [ [NP-* VB-want DT-this JJ-nice
+ * NN-thing .-. ]]],
+ *
+ * This digital camera nicely fits in my palm and the body is not heavy. = [[
+ * [DT-* JJ-digital NN-camera ], [PRP$-my NN-* ], [PRP$-my NN-palm ]], [ [VBZ-*
+ * TO-* PRP$-my NN-palm ], [VBZ-is NN-* ]], [], [ [IN-* PRP$-my NN-* ], [TO-*
+ * PRP$-my NN-palm ]], [], [], [ [DT-* JJ-digital NN-camera IN-* PRP$-my NN-*
+ * DT-* NN-* ADJP-* .-. ], [DT-* JJ-digital NN-camera VBZ-* IN-* PRP$-my NN-palm
+ * ], [DT-* NN-* VBZ-is NN-* ]]],
+ *
+ * I told my wife to film me at a speed while on a boat. =[[ [PRP$-* NN-wife ],
+ * [DT-a NN-speed ]], [ [VBD-* TO-to VB-film NP-me DT-a NN-speed ], [VB-film
+ * NP-me NP-* IN-* DT-a NN-* ], [VBD-* VBG-* IN-at DT-a NN-speed ], [TO-to
+ * VB-film NP-me NP-* IN-* DT-a NN-* ], [TO-to VB-film NP-me IN-at DT-a NN-speed
+ * ], [VBG-* IN-at DT-a NN-speed ]], [], [ [IN-at DT-a NN-speed ]], [], [], [
+ * [PRP$-* NN-wife TO-to VB-film NP-me IN-at DT-a NN-speed ], [TO-to VB-film
+ * NP-me NP-* IN-* DT-a NN-* ], [NP-* VBD-* VB-* IN-at DT-a NN-speed ], [VBG-*
+ * IN-at DT-a NN-speed ]]],
+ *
+ * I have to frequently change batteries in digital camera. = [[ [JJ-digital
+ * NN-* ], [NN-camera ]], [ [VBP-* TO-* VB-* NP-* IN-* NN-camera ], [VBG-* NP-*
+ * TO-* NN-camera ]], [], [ [IN-* NN-camera ], [TO-* NN-camera ]], [], [], [
+ * [NP-I VBP-* TO-* VB-* NP-* IN-* NN-camera .-. ], [VBG-* TO-to NN-camera ],
+ * [VBG-* NP-* TO-* NN-camera ]]],
+ *
+ * I enjoyed the digital zoom of this camera because I can quickly adjust for
+ * shots far away. =[[ [DT-* NN-* ], [DT-the NN-* IN-of DT-this NN-camera ],
+ * [DT-* JJ-digital NN-camera ], [JJ-* NN-* NNS-* ], [DT-the JJ-digital NN-* ]],
+ * [ [DT-the NN-* IN-of DT-this NN-camera IN-because NP-I MD-* VB-* NN-* ],
+ * [VB-* JJ-* NN-* NNS-* ], [VB-* IN-* NP-* ]], [], [IN-of DT-this NN-camera ]],
+ * [], [], [ [NP-I VBD-* DT-the NN-* IN-of DT-this NN-camera IN-because NP-I
+ * MD-* VB-* .-. ], [IN-because NP-I MD-* VB-* NN-* ], [NP-I VB-* JJ-* NN-*
+ * NNS-* ]]],
+ *
+ * Nice to hear interesting programs about animals. =[[ [JJ-* NNS-programs IN-*
+ * NP-animals ]], [ [TO-to VB-hear JJ-* NNS-programs IN-* NP-animals ]], [], [
+ * [IN-* NP-animals ]], [ [JJ-* TO-to VB-hear JJ-* NNS-programs IN-* NP-animals
+ * ]], [], [ [TO-to VB-hear JJ-* NNS-programs IN-* NP-animals ]]],
+ *
+ * I like it because radio is loud. = [[ [NN-* NN-* ]], [ [VBZ-is NN-* ]], [], [
+ * [IN-* NN-* ]], [], [], [ [NN-* IN-* NN-* IN-because NP-* VBZ-is NN-* .-. ],
+ * [NP-* VBZ-is NN-* ]]]}
+ */
\ No newline at end of file
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParagraphClassifier.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,410 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+@Component
+public class ParseTreeChunk {
+ private String mainPOS; // phrase-level tag of this chunk (compared against "np", "vp", ... in groupChunksAsParses)
+
+ private List<String> lemmas; // words of the phrase; index i pairs with POSs.get(i)
+
+ private List<String> POSs; // part-of-speech tag for the lemma at the same index
+
+ private int startPos; // assigned only by the (List, List, int, int) constructor
+
+ private int endPos; // assigned only by the (List, List, int, int) constructor
+
+ private int size; // NOTE(review): never assigned in this class, so always 0; read only by compareTo
+
+ @Autowired
+ private ParseTreeMatcher parseTreeMatcher; // produces pairwise generalizations in matchTwoSentencesGroupedChunks
+
+ @Autowired
+ private LemmaFormManager lemmaFormManager; // its mustOccurVerifier filters generalizations in matchTwoSentencesGroupedChunks
+
+ @Autowired
+ private GeneralizationListReducer generalizationListReducer; // applyFilteringBySubsumption is invoked in matchTwoSentencesGroupedChunks
+
+ public ParseTreeChunk() { // no-arg constructor: lemmas, POSs and mainPOS stay null until set through the setters
+ }
+
+ /**
+  * Creates a chunk from ready-made lemma and POS lists plus the character
+  * span of the phrase. The lists are stored by reference (no copy is made)
+  * and {@code mainPOS} is left unset.
+  *
+  * @param lemmas words of the phrase
+  * @param POSs part-of-speech tags, expected to be index-aligned with lemmas
+  * @param startPos start position of the phrase
+  * @param endPos end position of the phrase
+  */
+ public ParseTreeChunk(List<String> lemmas, List<String> POSs, int startPos,
+     int endPos) {
+   this.startPos = startPos;
+   this.endPos = endPos;
+   this.POSs = POSs;
+   this.lemmas = lemmas;
+ }
+
+ // constructor which takes lemmas and POS tags as arrays so that phrases can
+ // be conveniently specified inline.
+ // usage: stand-alone runs
+ /**
+  * Creates a chunk from plain arrays; each array is copied into a new list.
+  *
+  * @param mPOS phrase-level tag stored as the main POS
+  * @param lemmas words of the phrase
+  * @param POSss part-of-speech tags, one per word
+  */
+ public ParseTreeChunk(String mPOS, String[] lemmas, String[] POSss) {
+   this.mainPOS = mPOS;
+
+   this.lemmas = new ArrayList<String>();
+   for (int i = 0; i < lemmas.length; i++) {
+     this.lemmas.add(lemmas[i]);
+   }
+
+   this.POSs = new ArrayList<String>();
+   for (int i = 0; i < POSss.length; i++) {
+     this.POSs.add(POSss[i]);
+   }
+ }
+
+ // Before:
+ // [0(S-At home we like to eat great pizza deals), 0(PP-At home), 0(IN-At),
+ // 3(NP-home), 3(NN-home), 8(NP-we),
+ // 8(PRP-we), 11(VP-like to eat great pizza deals), 11(VBP-like), 16(S-to eat
+ // great pizza deals), 16(VP-to eat great
+ // pizza deals),
+ // 16(TO-to), 19(VP-eat great pizza deals), 19(VB-eat), 23(NP-great pizza
+ // deals), 23(JJ-great), 29(NN-pizza),
+ // 35(NNS-deals)]
+
+ // After:
+ // [S [IN-At NP-home NP-we VBP-like ], PP [IN-At NP-home ], IN [IN-At ], NP
+ // [NP-home ], NN [NP-home ], NP [NP-we ],
+ // PRP [NP-we ], VP [VBP-like TO-to VB-eat JJ-great ], VBP [VBP-like ], S
+ // [TO-to VB-eat JJ-great NN-pizza ], VP
+ // [TO-to VB-eat JJ-great NN-pizza ], TO [TO-to ], VP [VB-eat JJ-great
+ // NN-pizza NNS-deals ],
+ // VB [VB-eat ], NP [JJ-great NN-pizza NNS-deals ], JJ [JJ-great ], NN
+ // [NN-pizza ], NNS [NNS-deals ]]
+
+ /**
+  * Turns the flat parser output into multi-word chunks. For every entry the
+  * lemma text is split on single spaces; each word receives the POS of the
+  * first entry in {@code parseResults} whose lemma is exactly that word and
+  * whose span lies inside the entry's span. Entries of fewer than two words
+  * are dropped.
+  *
+  * NOTE(review): when no POS is found for a word, a message is printed to
+  * stderr but the chunk is still created, so its lemma and POS lists may
+  * differ in length.
+  *
+  * @param parseResults parser output, one entry per constituent
+  * @return chunks for all entries spanning at least two words
+  */
+ public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) {
+   List<ParseTreeChunk> built = new ArrayList<ParseTreeChunk>();
+   for (LemmaPair phrase : parseResults) {
+     String[] phraseWords = phrase.getLemma().split(" ");
+     List<String> tags = new ArrayList<String>();
+     List<String> words = new ArrayList<String>();
+     for (String word : phraseWords) {
+       words.add(word);
+       // look up the POS of this individual word
+       for (LemmaPair candidate : parseResults) {
+         if (!candidate.getLemma().equals(word)) {
+           continue;
+         }
+         // the word must sit inside the span of the phrase
+         if (candidate.getEndPos() > phrase.getEndPos()) {
+           continue;
+         }
+         if (candidate.getStartPos() < phrase.getStartPos()) {
+           continue;
+         }
+         tags.add(candidate.getPOS());
+         break;
+       }
+     }
+     if (words.size() != tags.size()) {
+       System.err.println("lems.size()!= poss.size()");
+     }
+     if (words.size() >= 2) { // single word phrases have nothing to match
+       ParseTreeChunk created = new ParseTreeChunk(words, tags,
+           phrase.getStartPos(), phrase.getEndPos());
+       created.setMainPOS(phrase.getPOS());
+       built.add(created);
+     }
+   }
+   return built;
+ }
+
+ /**
+  * Generalizes two sentences given as parser output: builds chunks for each,
+  * groups them by phrase type, prints both groupings to stdout and then
+  * matches the groups against each other.
+  *
+  * @param sent1Pairs parser output of the first sentence
+  * @param sent2Pairs parser output of the second sentence
+  * @return the result of matchTwoSentencesGroupedChunks on the two groupings
+  */
+ public List<List<ParseTreeChunk>> matchTwoSentencesGivenPairLists(
+     List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) {
+   List<ParseTreeChunk> firstChunks = buildChunks(sent1Pairs);
+   List<ParseTreeChunk> secondChunks = buildChunks(sent2Pairs);
+
+   List<List<ParseTreeChunk>> firstGrouped = groupChunksAsParses(firstChunks);
+   List<List<ParseTreeChunk>> secondGrouped = groupChunksAsParses(secondChunks);
+
+   System.out.println("=== Grouped chunks 1 " + firstGrouped);
+   System.out.println("=== Grouped chunks 2 " + secondGrouped);
+
+   return matchTwoSentencesGroupedChunks(firstGrouped, secondGrouped);
+ }
+
+ // groups noun phrases, verb phrases, prepositional phrases etc. for separate match
+
+ /**
+  * Buckets chunks by their lower-cased main POS. The returned list always
+  * has seven entries in this order: np, vp, prp, pp, adjp, whadvp, and
+  * everything else. Sentence-level chunks ("s") are skipped.
+  *
+  * NOTE(review): "sbar" chunks are collected in a local list that is never
+  * added to the result, so they are effectively dropped - confirm whether
+  * that is intended.
+  *
+  * @param parseResults chunks to group; each must have a non-null main POS
+  * @return the seven phrase-type buckets described above
+  */
+ public List<List<ParseTreeChunk>> groupChunksAsParses(
+     List<ParseTreeChunk> parseResults) {
+   List<ParseTreeChunk> nounPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> verbPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> pronounPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> sbarPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> prepPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> adjPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> whAdvPhrases = new ArrayList<ParseTreeChunk>();
+   List<ParseTreeChunk> otherPhrases = new ArrayList<ParseTreeChunk>();
+
+   for (ParseTreeChunk chunk : parseResults) {
+     String tag = chunk.getMainPOS().toLowerCase();
+     if ("s".equals(tag)) {
+       continue; // whole-sentence chunks are not grouped
+     }
+
+     List<ParseTreeChunk> bucket;
+     if ("np".equals(tag)) {
+       bucket = nounPhrases;
+     } else if ("vp".equals(tag)) {
+       bucket = verbPhrases;
+     } else if ("prp".equals(tag)) {
+       bucket = pronounPhrases;
+     } else if ("pp".equals(tag)) {
+       bucket = prepPhrases;
+     } else if ("adjp".equals(tag)) {
+       bucket = adjPhrases;
+     } else if ("whadvp".equals(tag)) {
+       bucket = whAdvPhrases;
+     } else if ("sbar".equals(tag)) {
+       bucket = sbarPhrases; // collected but not part of the returned list
+     } else {
+       bucket = otherPhrases;
+     }
+     bucket.add(chunk);
+   }
+
+   List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+   results.add(nounPhrases);
+   results.add(verbPhrases);
+   results.add(pronounPhrases);
+   results.add(prepPhrases);
+   results.add(adjPhrases);
+   results.add(whAdvPhrases);
+   results.add(otherPhrases);
+   return results;
+ }
+
+ // main function to generalize two expressions grouped by phrase types
+ // returns a list of generalizations for each phrase type with filtered
+ // sub-expressions
+ /**
+  * Generalizes the phrase groups of two sentences against each other. Only
+  * the first two groups (np and vp) are processed, and only as far as both
+  * inputs have them. Every chunk of a group in {@code sent1} is paired with
+  * every chunk of the same group in {@code sent2}; a generalization is kept
+  * when it passes the must-occur check and is not already covered by a
+  * previously kept one.
+  *
+  * NOTE(review): the list returned by applyFilteringBySubsumption is not
+  * used; the call is kept in case it has side effects - confirm.
+  *
+  * @param sent1 phrase groups of the first sentence
+  * @param sent2 phrase groups of the second sentence
+  * @return one list of generalizations per processed group
+  */
+ public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks(
+     List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) {
+   List<List<ParseTreeChunk>> generalizations = new ArrayList<List<ParseTreeChunk>>();
+   // first iterate through the phrase groups (just np & vp)
+   for (int group = 0; group < 2 && group < sent1.size()
+       && group < sent2.size(); group++) {
+     List<ParseTreeChunk> groupResult = new ArrayList<ParseTreeChunk>();
+     // then pair every phrase of the group in one sentence with every phrase
+     // of the same group in the other
+     for (ParseTreeChunk left : sent1.get(group)) {
+       for (ParseTreeChunk right : sent2.get(group)) {
+         ParseTreeChunk candidate = parseTreeMatcher
+             .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
+                 left, right);
+
+         // only continue when the words which have to stay did stay
+         if (lemmaFormManager.mustOccurVerifier(left, right, candidate)) {
+           boolean covered = false;
+           for (ParseTreeChunk kept : groupResult) {
+             // covered when identical to a kept chunk, or when generalizing
+             // it with a kept chunk gives the candidate back
+             if (kept.equalsTo(candidate)
+                 || parseTreeMatcher
+                     .generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+                         kept, candidate).equalsTo(candidate)) {
+               covered = true;
+               break;
+             }
+           }
+
+           if (!covered) {
+             groupResult.add(candidate);
+           }
+
+           generalizationListReducer.applyFilteringBySubsumption(groupResult);
+         }
+       }
+     }
+     generalizations.add(groupResult);
+   }
+
+   return generalizations;
+ }
+
+ public Boolean equals(ParseTreeChunk ch) {
+ List<String> lems = ch.getLemmas();
+ List<String> poss = ch.POSs;
+
+ if (this.lemmas.size() <= lems.size())
+ return false; // sub-chunk should be shorter than chunk
+
+ for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
+ if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
+ poss.get(i))))
+ return false;
+ }
+ return true;
+ }
+
+ // 'this' is super - chunk of ch, ch is sub-chunk of 'this'
+ public Boolean isASubChunk(ParseTreeChunk ch) {
+ List<String> lems = ch.getLemmas();
+ List<String> poss = ch.POSs;
+
+ if (this.lemmas.size() < lems.size())
+ return false; // sub-chunk should be shorter than chunk
+
+ for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
+ if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
+ poss.get(i))))
+ return false;
+ }
+ return true;
+ }
+
+ public Boolean equalsTo(ParseTreeChunk ch) {
+ List<String> lems = ch.getLemmas();
+ List<String> poss = ch.POSs;
+ if (this.lemmas.size() != lems.size() || this.POSs.size() != poss.size())
+ return false;
+
+ for (int i = 0; i < lems.size(); i++) {
+ if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
+ poss.get(i))))
+ return false;
+ }
+
+ return true;
+ }
+
+ public String toString() {
+ String buf = " [";
+ if (mainPOS != null)
+ buf = mainPOS + " [";
+ for (int i = 0; i < lemmas.size() && i < POSs.size() // && i<=3
+ ; i++) {
+ buf += POSs.get(i) + "-" + lemmas.get(i) + " ";
+ }
+ return buf + "]";
+ }
+
+ public int compareTo(ParseTreeChunk o) {
+ if (this.size > o.size)
+ return -1;
+ else
+ return 1;
+
+ }
+
+ public String listToString(List<List<ParseTreeChunk>> chunks) {
+ StringBuffer buf = new StringBuffer();
+ if (chunks.get(0).size() > 0) {
+ buf.append(" np " + chunks.get(0).toString());
+ }
+ if (chunks.get(1).size() > 0) {
+ buf.append(" vp " + chunks.get(1).toString());
+ }
+ if (chunks.size() < 3) {
+ return buf.toString();
+ }
+ if (chunks.get(2).size() > 0) {
+ buf.append(" prp " + chunks.get(2).toString());
+ }
+ if (chunks.get(3).size() > 0) {
+ buf.append(" pp " + chunks.get(3).toString());
+ }
+ if (chunks.get(4).size() > 0) {
+ buf.append(" adjp " + chunks.get(4).toString());
+ }
+ if (chunks.get(5).size() > 0) {
+ buf.append(" whadvp " + chunks.get(5).toString());
+ }
+ /*
+ * if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp"))
+ * vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if
+ * (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp"))
+ * adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch);
+ */
+ return buf.toString();
+ }
+
+ /**
+  * Parses the textual form of grouped chunks back into chunk lists. Expected
+  * input looks like
+  * {@code [[ [NN-visa IN-in NP-israel ], [NP-a IN-in ]], [ [VB-get NN-visa ]]]}:
+  * phrase-type groups are separated by {@code " ]], [ ["}, chunks inside a
+  * group by {@code "], ["}, and every token is {@code POS-lemma}.
+  *
+  * A token is split at its first '-' only, so hyphenated lemmas are kept
+  * whole. Tokens without a '-' or with nothing after it (such as the residue
+  * of an empty group {@code []}) are skipped instead of raising
+  * ArrayIndexOutOfBoundsException, and a chunk left without any token is not
+  * added. The parsed chunks carry no main POS. The result is also printed to
+  * stdout.
+  *
+  * @param toParse text to parse; null yields an empty result
+  * @return one list of chunks per phrase-type fragment found in the text
+  */
+ public List<List<ParseTreeChunk>> obtainParseTreeChunkListByParsingList(
+     String toParse) {
+   List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+   if (toParse == null) {
+     return results;
+   }
+   String[] phraseTypeFragments = toParse.replace(" ]], [ [", "&").trim()
+       .split("&");
+   for (String fragment : phraseTypeFragments) {
+     List<ParseTreeChunk> resultsPhraseType = new ArrayList<ParseTreeChunk>();
+     String[] indivChunks = fragment.replace("], [", "#").trim().split("#");
+     for (String expr : indivChunks) {
+       List<String> lems = new ArrayList<String>();
+       List<String> poss = new ArrayList<String>();
+       String[] pairs = expr.replace("[", "").replace(" ]", "").trim()
+           .split(" ");
+       for (String word : pairs) {
+         word = word.replace("]]", "").replace("]", "");
+         int dash = word.indexOf('-');
+         if (dash < 0 || dash == word.length() - 1) {
+           continue; // not a POS-lemma token (blank, stray bracket or comma)
+         }
+         poss.add(word.substring(0, dash).trim());
+         lems.add(word.substring(dash + 1).trim());
+       }
+       if (lems.isEmpty()) {
+         continue; // nothing parsable in this chunk expression
+       }
+       ParseTreeChunk ch = new ParseTreeChunk();
+       ch.setLemmas(lems);
+       ch.setPOSs(poss);
+       resultsPhraseType.add(ch);
+     }
+     results.add(resultsPhraseType);
+   }
+   System.out.println(results);
+   return results;
+ }
+
+ public void setMainPOS(String mainPOS) { // sets the phrase-level tag of this chunk
+ this.mainPOS = mainPOS;
+ }
+
+ public String getMainPOS() { // may be null: only the array constructor and setMainPOS assign it
+ return mainPOS;
+ }
+
+ public List<String> getLemmas() { // returns the internal list itself, not a copy
+ return lemmas;
+ }
+
+ public void setLemmas(List<String> lemmas) { // stores the given list by reference
+ this.lemmas = lemmas;
+ }
+
+ public List<String> getPOSs() { // returns the internal list itself, not a copy
+ return POSs;
+ }
+
+ public void setPOSs(List<String> pOSs) { // stores the given list by reference
+ POSs = pOSs;
+ }
+
+ public ParseTreeMatcher getParseTreeMatcher() { // exposes the autowired matcher field
+ return parseTreeMatcher;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.Comparator;
+
+public class ParseTreeChunkComparable implements Comparator<ParseTreeChunk> {
+ public int compare(ParseTreeChunk ch1, ParseTreeChunk ch2) {
+ for (int i = 0; i < ch1.getLemmas().size() && i < ch2.getLemmas().size(); i++) {
+ if (!(ch1.getLemmas().get(i).equals(ch2.getLemmas().get(i)) && ch1
+ .getPOSs().get(i).equals(ch2.getPOSs().get(i))))
+ return -1;
+ }
+ return 0;
+
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkComparable.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.context.annotation.Scope;
+
+@Configuration
+public class ParseTreeChunkFactory { // Spring Java-config class declaring a single bean method
+ @Bean
+ @Scope(value = "prototype") // prototype scope: the container creates a new instance for each request of this bean
+ public ParseTreeChunk getParseTreeChunk() { // NOTE(review): ParseTreeChunk is also annotated @Component - confirm two bean definitions of this type are intended
+ return new ParseTreeChunk(); // no-arg instance; lemmas, POSs and mainPOS are not set here
+ }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain