You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC
svn commit: r1181845 [4/5] - in /incubator/opennlp/sandbox/opennlp-similarity: ./ src/main/java/opennlp/tools/similarity/ src/main/java/opennlp/tools/similarity/apps/ src/main/java/opennlp/tools/similarity/apps/utils/ src/main/java/opennlp/tools/textsi...

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+/**
+ * Assigns numeric scores to {@link ParseTreeChunk}s and to lists of chunks
+ * grouped by phrase type. A chunk's score is the sum of fixed per-word weights
+ * chosen from the word's part-of-speech tag and from whether its lemma is the
+ * wildcard {@code "*"}.
+ */
+@Component
+public class ParseTreeChunkListScorer {
+
+  /**
+   * A phrase type contributes to the aggregated score only when its best chunk
+   * scores strictly above this value.
+   */
+  private static final double SUBSTANTIAL_SCORE_THRESHOLD = 0.5;
+
+  /**
+   * Finds the single expression with the highest score.
+   *
+   * @param matchResult chunks grouped by phrase type
+   * @return the highest {@link #getScore(ParseTreeChunk)} over all chunks, or
+   *         {@code 0.0} when no chunk scores above zero
+   */
+  public double getParseTreeChunkListScore(
+      List<List<ParseTreeChunk>> matchResult) {
+    double currScore = 0.0;
+    for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult) {
+      double best = maxScore(chunksGivenPhraseType);
+      if (best > currScore) {
+        currScore = best;
+      }
+    }
+    return currScore;
+  }
+
+  /**
+   * Takes the maximum score per phrase type and sums those maxima, ignoring
+   * phrase types whose maximum is not above
+   * {@link #SUBSTANTIAL_SCORE_THRESHOLD}.
+   *
+   * @param matchResult chunks grouped by phrase type
+   * @return the sum of the substantial per-phrase-type maxima
+   */
+  public double getParseTreeChunkListScoreAggregPhraseType(
+      List<List<ParseTreeChunk>> matchResult) {
+    double currScoreTotal = 0.0;
+    for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult) {
+      double currScorePT = maxScore(chunksGivenPhraseType);
+      // if substantial for given phrase type
+      if (currScorePT > SUBSTANTIAL_SCORE_THRESHOLD) {
+        currScoreTotal += currScorePT;
+      }
+    }
+    return currScoreTotal;
+  }
+
+  /**
+   * Scores one chunk by summing a weight for each of its words. The score is
+   * meaningful only for chunks which are results of generalization.
+   *
+   * @param chunk the chunk to score; the POS list is indexed in parallel with
+   *          the lemma list, so it must have at least as many entries
+   * @return the sum of the per-word weights, {@code 0.0} for a chunk without
+   *         lemmas
+   */
+  public double getScore(ParseTreeChunk chunk) {
+    // Fetch both lists once instead of calling getPOSs() for every word.
+    List<String> lemmas = chunk.getLemmas();
+    List<String> poss = chunk.getPOSs();
+    double score = 0.0;
+    int i = 0;
+    for (String lemma : lemmas) {
+      score += scoreWord(lemma, poss.get(i));
+      i++;
+    }
+    return score;
+  }
+
+  /** Returns the highest chunk score in the list, never less than 0.0. */
+  private double maxScore(List<ParseTreeChunk> chunks) {
+    double best = 0.0;
+    for (ParseTreeChunk chunk : chunks) {
+      // primitive double: the original boxed the score only to unbox it again
+      double score = getScore(chunk);
+      if (score > best) {
+        best = score;
+      }
+    }
+    return best;
+  }
+
+  /** Returns the weight of a single word given its lemma and POS tag. */
+  private static double scoreWord(String lemma, String pos) {
+    if (lemma.equals("*")) {
+      if (pos.startsWith("CD")) {
+        // number vs number gives high score although different numbers
+        return 0.7;
+      }
+      if (pos.endsWith("_high")) {
+        // if query modification adds 'high'
+        return 1.0;
+      }
+      return 0.1;
+    }
+    if (pos.startsWith("NN") || pos.startsWith("NP") || pos.startsWith("CD")
+        || pos.startsWith("RB")) {
+      return 1.0;
+    }
+    if (pos.startsWith("VB") || pos.startsWith("JJ")) {
+      // 'common' verbs are not that important
+      return lemma.equals("get") ? 0.3 : 0.5;
+    }
+    return 0.3;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+/**
+ * Generalizes pairs of {@link ParseTreeChunk}s: walks the part-of-speech
+ * sequences of two chunks and builds a new chunk from the positions where the
+ * POS tags are similar, keeping the matched lemma or the wildcard {@code "*"}.
+ * The randomized variants retry the walk several times, from the left and from
+ * the right, and keep the highest-scoring result.
+ *
+ * Originally created by boris, Feb 13, 2009.
+ */
+@Component
+public class ParseTreeMatcher {
+
+  /** Number of randomized alignment attempts made in each direction. */
+  private static final int NUMBER_OF_ITERATIONS = 2;
+
+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+  private POSManager posManager = new POSManager();
+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+  public ParseTreeMatcher() {
+
+  }
+
+  /**
+   * Older, non-randomized generalization. Walks both POS sequences from the
+   * left; where the tags are similar it records the POS and either the shared
+   * lemma or {@code "*"}. On a mismatch it advances only one of the two
+   * chunks, and which one it advances flips on every loop iteration.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return the generalization of the two chunks (possibly empty)
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesOLD(ParseTreeChunk chunk1,
+      ParseTreeChunk chunk2) {
+    List<String> pos1 = chunk1.getPOSs();
+    // Fixed: this used to read chunk1.getPOSs(), so chunk1 was aligned with
+    // itself and chunk2's tags were never consulted.
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> lemmas1 = chunk1.getLemmas();
+    List<String> lemmas2 = chunk2.getLemmas();
+
+    List<String> commonPOS = new ArrayList<String>();
+    List<String> commonLemmas = new ArrayList<String>();
+    int k1 = 0, k2 = 0;
+    boolean incrFirst = true;
+    while (k1 < pos1.size() && k2 < pos2.size()) {
+      // first check if the same POS
+      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+      if (sim != null) {
+        commonPOS.add(pos1.get(k1));
+        if (lemmas1.size() > k1 && lemmas2.size() > k2
+            && lemmas1.get(k1).equals(lemmas2.get(k2))) {
+          commonLemmas.add(lemmas1.get(k1));
+        } else {
+          commonLemmas.add("*");
+        }
+        k1++;
+        k2++;
+      } else if (incrFirst) {
+        k1++;
+      } else {
+        k2++;
+      }
+      incrFirst = !incrFirst;
+    }
+
+    return new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
+  }
+
+  /**
+   * Rewrites a noun phrase around its last preposition ("A for B" becomes
+   * "B have A"): {@code { A B C prep X Y }} becomes {@code { A B X Y C }}. The
+   * word just before the preposition moves to the end, the words after the
+   * preposition take its place, and the preposition itself is dropped. Should
+   * only be applied to a noun phrase.
+   *
+   * @param ch the chunk to transform
+   * @return a new transformed chunk, or {@code ch} itself when it has no
+   *         {@code IN} tag, when the last {@code IN} is at index 0 or 1, or
+   *         when the preposition is not one of to/on/in/of/with/by/from
+   */
+  public ParseTreeChunk prepositionalNNSTransform(ParseTreeChunk ch) {
+    List<String> poss = ch.getPOSs();
+    // lastIndexOf yields -1 when there is no "IN", which the guard also covers
+    int indexIN = poss.lastIndexOf("IN");
+    if (indexIN < 2) {
+      return ch;
+    }
+    List<String> lemmas = ch.getLemmas();
+    if (!isMovablePreposition(lemmas.get(indexIN))) {
+      return ch;
+    }
+
+    // indexIN >= 2 here, so the word before the preposition is never first
+    int beforeIN = indexIN - 1;
+
+    List<String> transfPOS = new ArrayList<String>();
+    transfPOS.addAll(poss.subList(0, beforeIN));
+    transfPOS.addAll(poss.subList(indexIN + 1, poss.size()));
+    transfPOS.add(poss.get(beforeIN));
+
+    List<String> transfLemmas = new ArrayList<String>();
+    transfLemmas.addAll(lemmas.subList(0, beforeIN));
+    transfLemmas.addAll(lemmas.subList(indexIN + 1, lemmas.size()));
+    transfLemmas.add(lemmas.get(beforeIN));
+
+    return new ParseTreeChunk(transfLemmas, transfPOS, 0, 0);
+  }
+
+  /**
+   * Generalizes the two chunks three ways (as given, with {@code chunk1}
+   * prepositionally transformed, and with {@code chunk2} transformed) and
+   * returns the highest-scoring generalization.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return the best of the three generalizations; ties fall towards the later
+   *         candidate
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
+      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+    ParseTreeChunk chRes1 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        chunk1, chunk2);
+    ParseTreeChunk chRes2 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        prepositionalNNSTransform(chunk1), chunk2);
+    ParseTreeChunk chRes3 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        prepositionalNNSTransform(chunk2), chunk1);
+
+    // score each candidate once instead of on every comparison
+    double score1 = parseTreeChunkListScorer.getScore(chRes1);
+    double score2 = parseTreeChunkListScorer.getScore(chRes2);
+    double score3 = parseTreeChunkListScorer.getScore(chRes3);
+
+    if (score1 > score2) {
+      return score1 > score3 ? chRes1 : chRes3;
+    }
+    return score2 > score3 ? chRes2 : chRes3;
+  }
+
+  /**
+   * Randomized generalization. Runs {@link #NUMBER_OF_ITERATIONS} left-to-right
+   * alignments followed by the same number of right-to-left alignments and
+   * returns the result with the highest score; an earlier result wins a tie.
+   * Because mismatches are resolved with {@link Math#random()}, repeated calls
+   * may return different results.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return the best generalization found
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+    double globalScore = -1.0;
+    ParseTreeChunk result = null;
+
+    // first all forward runs, then all backward runs
+    boolean[] directions = { true, false };
+    for (boolean forward : directions) {
+      for (int timesRun = 0; timesRun < NUMBER_OF_ITERATIONS; timesRun++) {
+        ParseTreeChunk currResult = alignOnce(chunk1, chunk2, forward);
+        double score = parseTreeChunkListScorer.getScore(currResult);
+        if (score > globalScore) {
+          result = currResult;
+          globalScore = score;
+        }
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Tells whether a POS similarity and a lemma match together allow a word
+   * pair into the generalization: the POS tags must be similar and the lemma
+   * match must not be the marker {@code "fail"}. A {@code null} lemma match is
+   * acceptable (the word becomes a wildcard).
+   *
+   * @param sim result of the POS comparison, {@code null} when not similar
+   * @param lemmaMatch result of the lemma comparison, may be {@code null}
+   * @return {@code true} when the word pair is acceptable
+   */
+  public Boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {
+    // Fixed: the test was inverted. It rejected every real lemma match and
+    // accepted "fail", the opposite of the condition the alignment loops use.
+    return isAcceptable(sim, lemmaMatch);
+  }
+
+  /** Shared acceptance rule: similar POS and lemma match other than "fail". */
+  private static boolean isAcceptable(String sim, String lemmaMatch) {
+    return sim != null && !"fail".equals(lemmaMatch);
+  }
+
+  /** Tells whether the word is a preposition the transform may pivot on. */
+  private static boolean isMovablePreposition(String word) {
+    return word.equals("to") || word.equals("on") || word.equals("in")
+        || word.equals("of") || word.equals("with") || word.equals("by")
+        || word.equals("from");
+  }
+
+  /**
+   * Performs one randomized alignment of the two chunks, from the left when
+   * {@code forward} is true and from the right otherwise. The result is always
+   * in left-to-right word order.
+   */
+  private ParseTreeChunk alignOnce(ParseTreeChunk chunk1,
+      ParseTreeChunk chunk2, boolean forward) {
+    List<String> pos1 = chunk1.getPOSs();
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> commonPOS = new ArrayList<String>();
+    List<String> commonLemmas = new ArrayList<String>();
+
+    int step = forward ? 1 : -1;
+    int k1 = forward ? 0 : pos1.size() - 1;
+    int k2 = forward ? 0 : pos2.size() - 1;
+    while (k1 >= 0 && k1 < pos1.size() && k2 >= 0 && k2 < pos2.size()) {
+      // first check if the same POS
+      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+      String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1
+          .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);
+      if (isAcceptable(sim, lemmaMatch)) {
+        commonPOS.add(pos1.get(k1));
+        // Both lemmas were just read at k1/k2, so the old size checks at this
+        // point could never fail; only the null test matters.
+        commonLemmas.add(lemmaMatch != null ? lemmaMatch : "*");
+        k1 += step;
+        k2 += step;
+      } else if (Math.random() > 0.5) {
+        k1 += step;
+      } else {
+        k2 += step;
+      }
+    }
+
+    if (!forward) {
+      // words were collected right-to-left
+      Collections.reverse(commonLemmas);
+      Collections.reverse(commonPOS);
+    }
+    return new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+/**
+ * Deterministic generalization of parse tree chunks. Words shared by two
+ * chunks (compared by stem) anchor the alignment; around each run of anchors
+ * the POS sequences are walked in parallel and the similar positions are
+ * collected into a generalized chunk.
+ */
+@Component
+public class ParseTreeMatcherDeterministic {
+
+  /**
+   * Stand-ins for lemmas that could not be stemmed. They keep the stem lists
+   * index-aligned with the lemma lists. They contain non-letters and differ
+   * per chunk, so they never count as a word common to both chunks.
+   */
+  private static final String UNSTEMMED_1 = "<unstemmed-1>";
+  private static final String UNSTEMMED_2 = "<unstemmed-2>";
+
+  private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
+
+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+  private POSManager posManager = new POSManager();
+
+  /**
+   * Generalizes two chunks using their common words as anchors.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return one generalized chunk per plausible run of common words, or
+   *         {@code null} when the chunks share no stemmed word
+   */
+  public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic(
+      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+    List<String> pos1 = chunk1.getPOSs();
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> lem1 = chunk1.getLemmas();
+    List<String> lem2 = chunk2.getLemmas();
+
+    // Positions found in the stem lists are used below as indices into
+    // pos1/lem1 and pos2/lem2, so every lemma must produce exactly one entry.
+    // Previously a failure skipped the word (chunk1) or dropped the rest of
+    // the list (chunk2), which shifted or truncated the indices.
+    PorterStemmer ps = new PorterStemmer();
+    List<String> lem1stem = new ArrayList<String>(lem1.size());
+    for (String word : lem1) {
+      String stem = stemOrNull(ps, word);
+      if (stem == null) {
+        if (word == null || word.length() > 2)
+          System.err.println("Unable to stem: " + word);
+        stem = UNSTEMMED_1;
+      }
+      lem1stem.add(stem);
+    }
+    List<String> lem2stem = new ArrayList<String>(lem2.size());
+    for (String word : lem2) {
+      String stem = stemOrNull(ps, word);
+      if (stem == null) {
+        System.err.println("problem processing word " + lem2.toString());
+        stem = UNSTEMMED_2;
+      }
+      lem2stem.add(stem);
+    }
+
+    List<String> overlap = new ArrayList<String>(lem1stem);
+    overlap.retainAll(lem2stem);
+
+    if (overlap.isEmpty())
+      return null;
+
+    // first position of every common stem in each chunk
+    List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
+    for (String word : overlap) {
+      occur1.add(lem1stem.indexOf(word));
+      occur2.add(lem2stem.indexOf(word));
+    }
+
+    // now we search for plausible sublists of overlaps
+    // if at some position correspondence is inverse (one of two position
+    // decreases instead of increases)
+    // then we terminate current alignment accum and start a new one
+    List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
+    // the loop starts from 1, not 0: the first pair seeds the accumulator
+    List<int[]> accum = new ArrayList<int[]>();
+    accum.add(new int[] { occur1.get(0), occur2.get(0) });
+    for (int i = 1; i < occur1.size(); i++) {
+
+      if (occur1.get(i) > occur1.get(i - 1)
+          && occur2.get(i) > occur2.get(i - 1))
+        accum.add(new int[] { occur1.get(i), occur2.get(i) });
+      else {
+        overlapsPlaus.add(accum);
+        accum = new ArrayList<int[]>();
+        accum.add(new int[] { occur1.get(i), occur2.get(i) });
+      }
+    }
+    if (accum.size() > 0) {
+      overlapsPlaus.add(accum);
+    }
+
+    List<ParseTreeChunk> results = new ArrayList<ParseTreeChunk>();
+    for (List<int[]> occur : overlapsPlaus) {
+      // split the position pairs back into one list per chunk
+      List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
+      for (int[] column : occur) {
+        occr1.add(column[0]);
+        occr2.add(column[1]);
+      }
+
+      int ov1 = 0, ov2 = 0; // iterators over common words;
+      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
+      // we start two words before first word
+      int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
+      boolean bReachedCommonWord = false;
+      // shift both cursors together until neither is before the chunk start
+      while (k1 < 0 || k2 < 0) {
+        k1++;
+        k2++;
+      }
+      int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
+      while (k1 <= k1max && k2 <= k2max) {
+        // first check if the same POS
+        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+        String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
+            lem2.get(k2), sim);
+        if ((sim != null)
+            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
+                .equals("fail")))) {
+          commonPOS.add(pos1.get(k1));
+          if (lemmaMatch != null) {
+            commonLemmas.add(lemmaMatch);
+            if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
+              // now we can have different increment operations
+              bReachedCommonWord = true;
+            else {
+              // a lemma match away from the current anchor: accept it as the
+              // next anchor only if it is exactly the next common word
+              if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
+                  && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
+                ov1++;
+                ov2++;
+                bReachedCommonWord = true;
+              }
+            }
+          } else {
+            commonLemmas.add("*");
+          } // the same parts of speech, proceed to the next word in both
+            // expressions
+          k1++;
+          k2++;
+
+        } else if (!bReachedCommonWord) {
+          k1++;
+          k2++;
+        } // still searching
+        else {
+          // different parts of speech, jump to the next identified common word
+          ov1++;
+          ov2++;
+          if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
+            break;
+          // now trying to find
+          int kk1 = occr1.get(ov1) - 2, // new positions of iterators
+          kk2 = occr2.get(ov2) - 2;
+          int countMove = 0;
+          // if it is behind current position, synchronously move towards
+          // right, but by at most two words
+          while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) {
+            kk1++;
+            kk2++;
+            countMove++;
+          }
+          k1 = kk1;
+          k2 = kk2;
+
+          // clamp to the last word of each chunk
+          if (k1 > k1max)
+            k1 = k1max;
+          if (k2 > k2max)
+            k2 = k2max;
+          bReachedCommonWord = false;
+        }
+      }
+      ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS,
+          0, 0);
+      results.add(currResult);
+    }
+
+    return results;
+  }
+
+  /**
+   * Main function to generalize two expressions grouped by phrase types.
+   *
+   * @param sent1 chunks of the first sentence, grouped by phrase type
+   * @param sent2 chunks of the second sentence, grouped by phrase type
+   * @return a list of generalizations for each of the first two phrase types
+   *         (just NP and VP) present in both sentences, filtered by
+   *         {@code GeneralizationListReducer.applyFilteringBySubsumption}
+   */
+  public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunksDeterministic(
+      List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) {
+    List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+    // first iterate through component
+    for (int comp = 0; comp < 2 && // just np & vp
+        comp < sent1.size() && comp < sent2.size(); comp++) {
+      List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>();
+      // then iterate through each phrase in each component
+      for (ParseTreeChunk ch1 : sent1.get(comp)) {
+        for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version
+          // Only the plain pair is generalized; matching against
+          // prepositionally transformed chunks is not done here.
+          List<ParseTreeChunk> chunkToAdd = generalizeTwoGroupedPhrasesDeterministic(
+              ch1, ch2);
+
+          // null means the two chunks have no word in common
+          if (chunkToAdd == null || chunkToAdd.isEmpty())
+            continue;
+
+          // skip the whole batch when it repeats any chunk collected so far
+          boolean alreadyThere = false;
+          for (ParseTreeChunk chunk : resultComps) {
+            if (chunkToAdd.contains(chunk)) {
+              alreadyThere = true;
+              break;
+            }
+          }
+
+          if (!alreadyThere) {
+            resultComps.addAll(chunkToAdd);
+          }
+
+        }
+      }
+      results.add(generalizationListReducer
+          .applyFilteringBySubsumption(resultComps));
+    }
+
+    return results;
+  }
+
+  /**
+   * Stems the lower-cased word, returning {@code null} when stemming fails.
+   * The broad catch is kept from the original code: the stemmer's failure
+   * modes are not documented, and one bad token should not abort the match.
+   */
+  private static String stemOrNull(PorterStemmer ps, String word) {
+    try {
+      return ps.stem(word.toLowerCase());
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+public class PorterStemmer {
+  public String stem(String str) {
+    // check for zero length
+    if (str.length() > 0) {
+      // all characters must be letters
+      char[] c = str.toCharArray();
+      for (int i = 0; i < c.length; i++) {
+        if (!Character.isLetter(c[i]))
+          return "Invalid term";
+      }
+    } else {
+      return "No term entered";
+    }
+    str = step1a(str);
+    str = step1b(str);
+    str = step1c(str);
+    str = step2(str);
+    str = step3(str);
+    str = step4(str);
+    str = step5a(str);
+    str = step5b(str);
+    return str;
+  } // end stem
+
+  protected String step1a(String str) {
+    // SSES -> SS
+    if (str.endsWith("sses")) {
+      return str.substring(0, str.length() - 2);
+      // IES -> I
+    } else if (str.endsWith("ies")) {
+      return str.substring(0, str.length() - 2);
+      // SS -> S
+    } else if (str.endsWith("ss")) {
+      return str;
+      // S ->
+    } else if (str.endsWith("s")) {
+      return str.substring(0, str.length() - 1);
+    } else {
+      return str;
+    }
+  } // end step1a
+
+  protected String step1b(String str) {
+    // (m > 0) EED -> EE
+    if (str.endsWith("eed")) {
+      if (stringMeasure(str.substring(0, str.length() - 3)) > 0)
+        return str.substring(0, str.length() - 1);
+      else
+        return str;
+      // (*v*) ED ->
+    } else if ((str.endsWith("ed"))
+        && (containsVowel(str.substring(0, str.length() - 2)))) {
+      return step1b2(str.substring(0, str.length() - 2));
+      // (*v*) ING ->
+    } else if ((str.endsWith("ing"))
+        && (containsVowel(str.substring(0, str.length() - 3)))) {
+      return step1b2(str.substring(0, str.length() - 3));
+    } // end if
+    return str;
+  } // end step1b
+
+  protected String step1b2(String str) {
+    // AT -> ATE
+    if (str.endsWith("at") || str.endsWith("bl") || str.endsWith("iz")) {
+      return str + "e";
+    } else if ((endsWithDoubleConsonent(str))
+        && (!(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))) {
+      return str.substring(0, str.length() - 1);
+    } else if ((stringMeasure(str) == 1) && (endsWithCVC(str))) {
+      return str + "e";
+    } else {
+      return str;
+    }
+  } // end step1b2
+
+  protected String step1c(String str) {
+    // (*v*) Y -> I
+    if (str.endsWith("y")) {
+      if (containsVowel(str.substring(0, str.length() - 1)))
+        return str.substring(0, str.length() - 1) + "i";
+    } // end if
+    return str;
+  } // end step1c
+
+  protected String step2(String str) {
+    // (m > 0) ATIONAL -> ATE
+    if ((str.endsWith("ational"))
+        && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+      return str.substring(0, str.length() - 5) + "e";
+      // (m > 0) TIONAL -> TION
+    } else if ((str.endsWith("tional"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) ENCI -> ENCE
+    } else if ((str.endsWith("enci"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) ANCI -> ANCE
+    } else if ((str.endsWith("anci"))
+        && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+      return str.substring(0, str.length() - 1) + "e";
+      // (m > 0) IZER -> IZE
+    } else if ((str.endsWith("izer"))
+        && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+      return str.substring(0, str.length() - 1);
+      // (m > 0) ABLI -> ABLE
+    } else if ((str.endsWith("abli"))
+        && (stringMeasure(str.substring(0, str.length() - 1)) > 0)) {
+      return str.substring(0, str.length() - 1) + "e";
+      // (m > 0) ENTLI -> ENT
+    } else if ((str.endsWith("alli"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) ELI -> E
+    } else if ((str.endsWith("entli"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) OUSLI -> OUS
+    } else if ((str.endsWith("eli"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) IZATION -> IZE
+    } else if ((str.endsWith("ousli"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) IZATION -> IZE
+    } else if ((str.endsWith("ization"))
+        && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+      return str.substring(0, str.length() - 5) + "e";
+      // (m > 0) ATION -> ATE
+    } else if ((str.endsWith("ation"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3) + "e";
+      // (m > 0) ATOR -> ATE
+    } else if ((str.endsWith("ator"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2) + "e";
+      // (m > 0) ALISM -> AL
+    } else if ((str.endsWith("alism"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) IVENESS -> IVE
+    } else if ((str.endsWith("iveness"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 0) FULNESS -> FUL
+    } else if ((str.endsWith("fulness"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 0) OUSNESS -> OUS
+    } else if ((str.endsWith("ousness"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 0) ALITII -> AL
+    } else if ((str.endsWith("aliti"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) IVITI -> IVE
+    } else if ((str.endsWith("iviti"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3) + "e";
+      // (m > 0) BILITI -> BLE
+    } else if ((str.endsWith("biliti"))
+        && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+      return str.substring(0, str.length() - 5) + "le";
+    } // end if
+    return str;
+  } // end step2
+
+  protected String step3(String str) {
+    // (m > 0) ICATE -> IC
+    if ((str.endsWith("icate"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) ATIVE ->
+    } else if ((str.endsWith("ative"))
+        && (stringMeasure(str.substring(0, str.length() - 5)) > 0)) {
+      return str.substring(0, str.length() - 5);
+      // (m > 0) ALIZE -> AL
+    } else if ((str.endsWith("alize"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) ICITI -> IC
+    } else if ((str.endsWith("iciti"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) ICAL -> IC
+    } else if ((str.endsWith("ical"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 0)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 0) FUL ->
+    } else if ((str.endsWith("ful"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 0)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 0) NESS ->
+    } else if ((str.endsWith("ness"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 0)) {
+      return str.substring(0, str.length() - 4);
+    } // end if
+    return str;
+  } // end step3
+
+  protected String step4(String str) {
+    if ((str.endsWith("al"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 1) ANCE ->
+    } else if ((str.endsWith("ance"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 1) ENCE ->
+    } else if ((str.endsWith("ence"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 1) ER ->
+    } else if ((str.endsWith("er"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 1) IC ->
+    } else if ((str.endsWith("ic"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 1) ABLE ->
+    } else if ((str.endsWith("able"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 1) IBLE ->
+    } else if ((str.endsWith("ible"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 1) ANT ->
+    } else if ((str.endsWith("ant"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) EMENT ->
+    } else if ((str.endsWith("ement"))
+        && (stringMeasure(str.substring(0, str.length() - 5)) > 1)) {
+      return str.substring(0, str.length() - 5);
+      // (m > 1) MENT ->
+    } else if ((str.endsWith("ment"))
+        && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+      return str.substring(0, str.length() - 4);
+      // (m > 1) ENT ->
+    } else if ((str.endsWith("ent"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) and (*S or *T) ION ->
+    } else if ((str.endsWith("sion") || str.endsWith("tion"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) OU ->
+    } else if ((str.endsWith("ou"))
+        && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+      return str.substring(0, str.length() - 2);
+      // (m > 1) ISM ->
+    } else if ((str.endsWith("ism"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) ATE ->
+    } else if ((str.endsWith("ate"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) ITI ->
+    } else if ((str.endsWith("iti"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) OUS ->
+    } else if ((str.endsWith("ous"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) IVE ->
+    } else if ((str.endsWith("ive"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+      // (m > 1) IZE ->
+    } else if ((str.endsWith("ize"))
+        && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+      return str.substring(0, str.length() - 3);
+    } // end if
+    return str;
+  } // end step4
+
+  protected String step5a(String str) {
+    // (m > 1) E ->
+    if ((stringMeasure(str.substring(0, str.length() - 1)) > 1)
+        && str.endsWith("e"))
+      return str.substring(0, str.length() - 1);
+    // (m = 1 and not *0) E ->
+    else if ((stringMeasure(str.substring(0, str.length() - 1)) == 1)
+        && (!endsWithCVC(str.substring(0, str.length() - 1)))
+        && (str.endsWith("e")))
+      return str.substring(0, str.length() - 1);
+    else
+      return str;
+  } // end step5a
+
+  protected String step5b(String str) {
+    // (m > 1 and *d and *L) ->
+    if (str.endsWith("l") && endsWithDoubleConsonent(str)
+        && (stringMeasure(str.substring(0, str.length() - 1)) > 1)) {
+      return str.substring(0, str.length() - 1);
+    } else {
+      return str;
+    }
+  } // end step5b
+
+  /*
+   * ------------------------------------------------------- The following are
+   * functions to help compute steps 1 - 5
+   * -------------------------------------------------------
+   */
+
+  // does string end with 's'?
+  protected boolean endsWithS(String str) {
+    return str.endsWith("s");
+  } // end function
+
+  // does string contain a vowel?
+  protected boolean containsVowel(String str) {
+    char[] strchars = str.toCharArray();
+    for (int i = 0; i < strchars.length; i++) {
+      if (isVowel(strchars[i]))
+        return true;
+    }
+    // no aeiou but there is y
+    if (str.indexOf('y') > -1)
+      return true;
+    else
+      return false;
+  } // end function
+
+  // is char a vowel?
+  public boolean isVowel(char c) {
+    if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u'))
+      return true;
+    else
+      return false;
+  } // end function
+
+  // does string end with a double consonent?
+  protected boolean endsWithDoubleConsonent(String str) {
+    char c = str.charAt(str.length() - 1);
+    if (c == str.charAt(str.length() - 2))
+      if (!containsVowel(str.substring(str.length() - 2))) {
+        return true;
+      }
+    return false;
+  } // end function
+
+  // returns a CVC measure for the string
+  protected int stringMeasure(String str) {
+    int count = 0;
+    boolean vowelSeen = false;
+    char[] strchars = str.toCharArray();
+
+    for (int i = 0; i < strchars.length; i++) {
+      if (isVowel(strchars[i])) {
+        vowelSeen = true;
+      } else if (vowelSeen) {
+        count++;
+        vowelSeen = false;
+      }
+    } // end for
+    return count;
+  } // end function
+
+  // does stem end with CVC?
+  protected boolean endsWithCVC(String str) {
+    char c, v, c2 = ' ';
+    if (str.length() >= 3) {
+      c = str.charAt(str.length() - 1);
+      v = str.charAt(str.length() - 2);
+      c2 = str.charAt(str.length() - 3);
+    } else {
+      return false;
+    }
+
+    if ((c == 'w') || (c == 'x') || (c == 'y')) {
+      return false;
+    } else if (isVowel(c)) {
+      return false;
+    } else if (!isVowel(v)) {
+      return false;
+    } else if (isVowel(c2)) {
+      return false;
+    } else {
+      return true;
+    }
+  } // end function
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Result of matching a mined sentence against another sentence: the grouped
+ * generalization result, the lemma/POS pairs of the mined sentence, and two
+ * flags derived from those pairs in the constructor.
+ */
+public class SentencePairMatchResult {
+  public List<List<ParseTreeChunk>> matchResult;
+
+  private List<LemmaPair> resForMinedSent1;
+
+  /** True when a pair tagged VB* with an alphabetic lemma longer than two characters was seen. */
+  public boolean verbExists = false;
+
+  /**
+   * True when the second or third lemma starts with "We", or when a VB* pair
+   * has both start and end position below 1.
+   */
+  public boolean imperativeVerb = false;
+
+  /**
+   * Stores both arguments and derives {@link #verbExists} and
+   * {@link #imperativeVerb} from the lemma/POS pairs.
+   *
+   * @param matchResult
+   *          generalization result for the sentence pair
+   * @param resForMinedSent1
+   *          lemma/POS pairs of the mined sentence; a null or short list
+   *          simply leaves the corresponding flags false
+   */
+  public SentencePairMatchResult(List<List<ParseTreeChunk>> matchResult,
+      List<LemmaPair> resForMinedSent1) {
+    super();
+    System.out.println("Assessing sentence for inclusion " + resForMinedSent1);
+    this.matchResult = matchResult;
+    this.resForMinedSent1 = resForMinedSent1;
+    if (resForMinedSent1 == null) {
+      return;
+    }
+
+    for (LemmaPair word : resForMinedSent1) {
+      if (word.getPOS().startsWith("VB") && word.getLemma().length() > 2
+          && StringUtils.isAlpha(word.getLemma())) {
+        verbExists = true;
+        System.out.println("Found verb=" + word);
+      }
+    }
+
+    // various form of sales pitch: 'get something', or 'we offer'
+    // The index checks keep one- and two-element lists from throwing.
+    if (lemmaStartsWith(1, "We") || lemmaStartsWith(2, "We")) {
+      imperativeVerb = true;
+    }
+    for (LemmaPair word : resForMinedSent1) {
+      if (word.getPOS().startsWith("VB") && word.getStartPos() < 1
+          && word.getEndPos() < 1) {
+        imperativeVerb = true;
+        System.out.println("Found imperative verb=" + word);
+      }
+    }
+  }
+
+  /** Tells whether the pair at {@code index} exists and its lemma starts with {@code prefix}. */
+  private boolean lemmaStartsWith(int index, String prefix) {
+    return index < resForMinedSent1.size()
+        && resForMinedSent1.get(index).getLemma().startsWith(prefix);
+  }
+
+  public List<List<ParseTreeChunk>> getMatchResult() {
+    return matchResult;
+  }
+
+  public void setMatchResult(List<List<ParseTreeChunk>> matchResult) {
+    this.matchResult = matchResult;
+  }
+
+  public List<LemmaPair> getResForMinedSent1() {
+    return resForMinedSent1;
+  }
+
+  public void setResForMinedSent1(List<LemmaPair> resForMinedSent1) {
+    this.resForMinedSent1 = resForMinedSent1;
+  }
+
+  public boolean isVerbExists() {
+    return verbExists;
+  }
+
+  public void setVerbExists(boolean verbExists) {
+    this.verbExists = verbExists;
+  }
+
+  public boolean isImperativeVerb() {
+    return imperativeVerb;
+  }
+
+  public void setImperativeVerb(boolean imperativeVerb) {
+    this.imperativeVerb = imperativeVerb;
+  }
+
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,399 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.lang.english.SentenceDetector;
+import opennlp.tools.lang.english.Tokenizer;
+import opennlp.tools.lang.english.TreebankParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.chunking.Parser;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.util.Span;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Wraps the OpenNLP sentence detector, tokenizer and treebank parser and uses
+ * them to turn text into grouped {@link ParseTreeChunk} lists that can be
+ * generalized against each other.
+ * <p>
+ * Grouped chunks are memoized per normalized input string in a plain
+ * {@link HashMap}; only {@code parseLine} and
+ * {@code getPhrasingAcceptabilityData} are synchronized.
+ */
+public class SyntMatcher {
+  public static final String resourcesDir = (System.getProperty("os.name")
+      .toLowerCase().indexOf("win") > -1 ? "C:/workspace/ZSearch/resources_external"
+      : "/var/search/solr-1.2/resources");
+  static private SyntMatcher m_SyntMatcher = null;
+
+  private static final Logger LOG = LoggerFactory.getLogger(SyntMatcher.class);
+
+  private SentenceDetectorME sentenceDetectorME = null;
+
+  private Tokenizer tokenizer = null;
+
+  private Parser parser = null;
+
+  private final boolean useTagDict = true;
+
+  private final boolean useCaseInsensitiveTagDict = false;
+
+  private final int beamSize = Parser.defaultBeamSize;
+
+  private final double advancePercentage = Parser.defaultAdvancePercentage;
+
+  private Map<String, List<List<ParseTreeChunk>>> parsingsCache = new HashMap<String, List<List<ParseTreeChunk>>>();
+
+  // Created eagerly: nothing else in this class ever assigns the scorer.
+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+
+  private ParseTreeMatcherDeterministic parseTreeMatcherDeterministic = new ParseTreeMatcherDeterministic();
+
+  /**
+   * Returns the shared SyntMatcher, creating it on first use and loading the
+   * models from {@code resourcesDir + "/models"}.
+   * 
+   * @return the shared instance; its models are missing if loading failed
+   */
+  static public SyntMatcher getInstance() {
+    return sharedInstance(resourcesDir + "/models");
+  }
+
+  /**
+   * Returns the shared SyntMatcher, creating it on first use and loading the
+   * models from {@code resourceDirSpec + "/models"}. The argument is ignored
+   * once the shared instance exists.
+   * 
+   * @param resourceDirSpec
+   *          directory that contains the "models" folder
+   * @return the shared instance; its models are missing if loading failed
+   */
+  static public SyntMatcher getInstance(String resourceDirSpec) {
+    return sharedInstance(resourceDirSpec + "/models");
+  }
+
+  /** Lazily creates the shared instance; a load failure is logged, not thrown. */
+  private static SyntMatcher sharedInstance(String modelsDir) {
+    if (m_SyntMatcher == null) {
+      m_SyntMatcher = new SyntMatcher();
+      try {
+        m_SyntMatcher.loadOpenNLP(modelsDir);
+      } catch (Exception e) {
+        LOG.error("Problem loading openNLP! ", e);
+      }
+    }
+    return m_SyntMatcher;
+  }
+
+  /** Creates a matcher without loading any model. */
+  public SyntMatcher() {
+  }
+
+  /**
+   * Creates a matcher and loads the models found directly under
+   * {@code resourcesDir}; a load failure is logged, not thrown.
+   */
+  public SyntMatcher(String resourcesDir) {
+    try {
+      loadOpenNLP(resourcesDir);
+    } catch (IOException e) {
+      LOG.error("Problem loading openNLP! ", e);
+    }
+  }
+
+  /**
+   * Creates a matcher and loads the models for {@code language}; a load
+   * failure is logged, not thrown.
+   */
+  public SyntMatcher(String resourcesDir, String language) {
+    try {
+      loadOpenNLP(resourcesDir, language);
+    } catch (IOException e) {
+      LOG.error("Problem loading openNLP! ", e);
+    }
+  }
+
+  /**
+   * Loads sentence detector, tokenizer and parser models from {@code dir}.
+   * 
+   * @throws IOException
+   *           if a model cannot be read
+   */
+  protected void loadOpenNLP(String dir) throws IOException {
+    loadEnglishModels(dir);
+  }
+
+  /**
+   * Loads the models for {@code lang}; nothing is loaded for any language
+   * code other than "es".
+   * 
+   * @throws IOException
+   *           if a model cannot be read
+   */
+  protected void loadOpenNLP(String dir, String lang) throws IOException {
+    // NOTE(review): "es" is the only accepted code, yet the models loaded for
+    // it are the English ones - confirm this is intended.
+    if (!lang.equalsIgnoreCase("es")) {
+      LOG.warn("No openNLP models loaded for language " + lang);
+      return;
+    }
+    loadEnglishModels(dir);
+  }
+
+  /** Reads the three English model files below {@code dir}. */
+  private void loadEnglishModels(String dir) throws IOException {
+    sentenceDetectorME = new SentenceDetector(dir
+        + "/sentdetect/EnglishSD.bin.gz");
+    tokenizer = new Tokenizer(dir + "/tokenize/EnglishTok.bin.gz");
+    parser = (Parser) TreebankParser.getParser(dir + "/parser", useTagDict,
+        useCaseInsensitiveTagDict, beamSize, advancePercentage);
+  }
+
+  /** Joins tokens with a single space after each one. */
+  private static String joinTokens(String[] tokens) {
+    StringBuilder sb = new StringBuilder();
+    for (String t : tokens) {
+      sb.append(t).append(" ");
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Tokenizes and parses a line, requesting two parses, and keeps the leading
+   * parses up to the first probability drop larger than {@code confidence}.
+   * 
+   * @param line
+   *          text to parse
+   * @param p
+   *          parser to use; the parser of this matcher is used when null
+   * @param confidence
+   *          largest tolerated probability drop between consecutive parses
+   * @return the retained parses; an empty array when parsing failed
+   */
+  // TODO is synchronized needed here?
+  public synchronized Parse[] parseLine(String line, Parser p, double confidence) {
+    String tokenized = joinTokens(tokenizer.tokenize(line));
+    Parser activeParser = (p != null) ? p : parser;
+
+    Parse[] ps;
+    try {
+      ps = TreebankParser.parseLine(tokenized, activeParser, 2);
+    } catch (Exception e) {
+      // unable to parse for whatever reason
+      LOG.error("Problem parsing " + tokenized, e);
+      return new Parse[0];
+    }
+    if (ps == null) {
+      return new Parse[0];
+    }
+
+    int keep = 1;
+    while (keep < ps.length
+        && ps[keep - 1].getProb() - ps[keep].getProb() <= confidence) {
+      keep++;
+    }
+    if (keep >= ps.length) {
+      return ps;
+    }
+    Parse[] retained = new Parse[keep];
+    System.arraycopy(ps, 0, retained, 0, keep);
+    return retained;
+  }
+
+  /**
+   * Returns, for up to five parses of the line, the absolute parse
+   * probability divided by the number of tokens.
+   * 
+   * @return an array of length five; every entry is -20.0 when parsing
+   *         failed, and entries beyond the number of parses stay null
+   */
+  // TODO is synchronized needed here?
+  protected synchronized Double[] getPhrasingAcceptabilityData(String line) {
+    final int nParsings = 5;
+    String[] tokens = tokenizer.tokenize(line);
+    int numWords = tokens.length;
+    String tokenized = joinTokens(tokens);
+    Double[] result = new Double[nParsings];
+
+    Parse[] ps = null;
+    try {
+      ps = TreebankParser.parseLine(tokenized, parser, nParsings);
+    } catch (Exception e) {
+      // unable to parse for whatever reason
+      LOG.error("Problem parsing " + tokenized, e);
+    }
+    if (ps == null) {
+      for (int i = 0; i < result.length; i++) {
+        result[i] = -20.0;
+      }
+      return result;
+    }
+
+    int filled = Math.min(ps.length, result.length);
+    for (int i = 0; i < filled; i++) {
+      result[i] = Math.abs(ps[i].getProb() / (double) numWords);
+    }
+    return result;
+  }
+
+  /** Tells whether every direct child of {@code p} is a POS tag node. */
+  protected boolean allChildNodesArePOSTags(Parse p) {
+    for (Parse child : p.getChildren()) {
+      if (!child.isPosTag()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Collects the text of every NP node whose children are all POS tags,
+   * descending into the other non-POS nodes.
+   */
+  protected ArrayList<String> getNounPhrases(Parse p) {
+    ArrayList<String> nounPhrases = new ArrayList<String>();
+    for (Parse child : p.getChildren()) {
+      if (child.getType().equals("NP") && allChildNodesArePOSTags(child)) {
+        Span span = child.getSpan();
+        nounPhrases.add(p.getText().substring(span.getStart(), span.getEnd()));
+      } else if (!child.isPosTag()) {
+        nounPhrases.addAll(getNounPhrases(child));
+      }
+    }
+    return nounPhrases;
+  }
+
+  /**
+   * Returns one {@link LemmaPair} (node type, covered text, start offset) for
+   * every node below {@code p}, parents before their children.
+   */
+  public List<LemmaPair> getAllPhrasesTWPairs(Parse p) {
+    List<LemmaPair> pairs = new ArrayList<LemmaPair>();
+    for (Parse child : p.getChildren()) {
+      Span span = child.getSpan();
+      String expr = p.getText().substring(span.getStart(), span.getEnd());
+      pairs.add(new LemmaPair(child.getType(), expr, span.getStart()));
+      if (!child.isPosTag()) {
+        pairs.addAll(getAllPhrasesTWPairs(child));
+      }
+    }
+    return pairs;
+  }
+
+  /** Pairs of the best parse of one sentence; empty when it cannot be parsed. */
+  private List<LemmaPair> pairsOfBestParse(String sentence) {
+    Parse[] parses = parseLine(sentence, parser, 1);
+    if (parses == null || parses.length == 0) {
+      return new ArrayList<LemmaPair>();
+    }
+    return getAllPhrasesTWPairs(parses[0]);
+  }
+
+  /** Splits text into sentences and concatenates the pairs of each best parse. */
+  private List<LemmaPair> pairsOfText(String text) {
+    List<LemmaPair> pairs = new ArrayList<LemmaPair>();
+    for (String sentence : sentenceDetectorME.sentDetect(text)) {
+      pairs.addAll(pairsOfBestParse(sentence));
+    }
+    return pairs;
+  }
+
+  /** Builds the grouped chunks for the pairs and stores them in the cache under {@code key}. */
+  private List<List<ParseTreeChunk>> groupAndCache(ParseTreeChunk matcher,
+      String key, List<LemmaPair> pairs) {
+    List<List<ParseTreeChunk>> grouped = matcher.groupChunksAsParses(matcher
+        .buildChunks(pairs));
+    parsingsCache.put(key, grouped);
+    return grouped;
+  }
+
+  /** Returns the cached grouped chunks of the text, parsing it on a cache miss. */
+  private List<List<ParseTreeChunk>> cachedGroupedChunks(
+      ParseTreeChunk matcher, String text, boolean printPairs) {
+    List<List<ParseTreeChunk>> grouped = parsingsCache.get(text);
+    if (grouped == null) {
+      List<LemmaPair> pairs = pairsOfText(text);
+      grouped = groupAndCache(matcher, text, pairs);
+      if (printPairs) {
+        System.out.println(pairs);
+      }
+    }
+    return grouped;
+  }
+
+  /**
+   * Parses both sentences as a whole (no sentence splitting, no cache) and
+   * matches the pairs of their best parses.
+   */
+  protected List<List<ParseTreeChunk>> matchOrigSentences(String sent1,
+      String sent2) {
+    // with tokenizer now
+    List<LemmaPair> origChunks1 = pairsOfBestParse(sent1);
+    List<LemmaPair> origChunks2 = pairsOfBestParse(sent2);
+    System.out.println(origChunks1);
+    System.out.println(origChunks2);
+
+    ParseTreeChunk matcher = new ParseTreeChunk();
+    return matcher.matchTwoSentencesGivenPairLists(origChunks1, origChunks2);
+  }
+
+  /**
+   * Matches two texts, reusing cached grouped chunks for texts seen before.
+   * "'s" is separated from its word and ":" replaced by a space first.
+   */
+  public List<List<ParseTreeChunk>> matchOrigSentencesCache(String sent1,
+      String sent2) {
+    sent1 = sent1.replace("'s", " 's").replace(":", " ");
+    sent2 = sent2.replace("'s", " 's").replace(":", " ");
+
+    ParseTreeChunk matcher = new ParseTreeChunk();
+    List<List<ParseTreeChunk>> sent1GrpLst = cachedGroupedChunks(matcher,
+        sent1, true);
+    List<List<ParseTreeChunk>> sent2GrpLst = cachedGroupedChunks(matcher,
+        sent2, true);
+
+    return parseTreeMatcherDeterministic
+        .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+  }
+
+  /**
+   * Matches a mined sentence against a second text and wraps the result
+   * together with the pairs of the mined sentence. The mined sentence is
+   * always re-parsed; the second text goes through the cache.
+   */
+  public SentencePairMatchResult assessRelevance(String minedSent1, String sent2) {
+    minedSent1 = minedSent1.replace("'s", " 's").replace(":", " ")
+        .replace("âs", " 's");
+    sent2 = sent2.replace("'s", " 's").replace(":", " ").replace("âs", " 's");
+
+    ParseTreeChunk matcher = new ParseTreeChunk();
+
+    List<LemmaPair> origChunks1 = pairsOfText(minedSent1);
+    List<List<ParseTreeChunk>> sent1GrpLst = groupAndCache(matcher,
+        minedSent1, origChunks1);
+    List<List<ParseTreeChunk>> sent2GrpLst = cachedGroupedChunks(matcher,
+        sent2, false);
+
+    ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
+    List<List<ParseTreeChunk>> res = md
+        .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+    return new SentencePairMatchResult(res, origChunks1);
+  }
+
+  /**
+   * For every sentence of {@code para1}, finds the sentence of
+   * {@code classStr} with the highest generalization score.
+   * 
+   * @return map from the string form of a paragraph sentence's pairs to the
+   *         pairs of its best scoring class sentence; sentences without a
+   *         score above -1.0 are absent
+   */
+  public Map<String, List<LemmaPair>> findMappingBetweenSentencesOfAParagraphAndAClassReps(
+      String para1, String classStr) {
+    ParseTreeChunk matcher = new ParseTreeChunk();
+
+    String[] sents = sentenceDetectorME.sentDetect(para1);
+    String[] classSents = sentenceDetectorME.sentDetect(classStr);
+
+    List<List<LemmaPair>> parseSentList = new ArrayList<List<LemmaPair>>();
+    for (String s : sents) {
+      parseSentList.add(pairsOfBestParse(s));
+    }
+
+    List<List<LemmaPair>> parseClassList = new ArrayList<List<LemmaPair>>();
+    for (String s : classSents) {
+      parseClassList.add(pairsOfBestParse(s));
+    }
+
+    Map<String, List<LemmaPair>> sentence_bestClassRep = new HashMap<String, List<LemmaPair>>();
+    for (List<LemmaPair> chunksSent : parseSentList) {
+      double maxScore = -1.0;
+      for (List<LemmaPair> chunksClass : parseClassList) {
+        List<List<ParseTreeChunk>> matchResult = matcher
+            .matchTwoSentencesGivenPairLists(chunksSent, chunksClass);
+        double score = parseTreeChunkListScorer
+            .getParseTreeChunkListScore(matchResult);
+        if (score > maxScore) {
+          maxScore = score;
+          sentence_bestClassRep.put(chunksSent.toString(), chunksClass);
+        }
+      }
+    }
+    return sentence_bestClassRep;
+  }
+
+  public SentenceDetectorME getSentenceDetectorME() {
+    return sentenceDetectorME;
+  }
+
+  public Parser getParser() {
+    return parser;
+  }
+}
+
+// -Xms500M -Xmx500M

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+/**
+ * Spring configuration class that publishes a {@link SyntMatcher} bean built
+ * from the value of the {@code resources.dir} property.
+ */
+@Configuration
+public class SyntMatcherConfiguration {
+
+  /** Populated from the {@code resources.dir} placeholder. */
+  @Value("${resources.dir}")
+  private String resourcesDir;
+
+  /**
+   * Bean factory method for the matcher.
+   *
+   * @return a new {@code SyntMatcher} constructed with {@code resourcesDir}
+   */
+  @Bean
+  public SyntMatcher syntMatcher() {
+    final SyntMatcher matcher = new SyntMatcher(resourcesDir);
+    return matcher;
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.stereotype.Component;
+
+/**
+ * Spring component exposing a static lookup for {@link SyntMatcher}.
+ *
+ * NOTE(review): presumably meant to select a matcher per language; the lookup
+ * is not implemented yet - confirm the intended behavior.
+ */
+@Component
+public class SyntMatcherFactory {
+  /**
+   * Placeholder lookup: the argument is ignored and {@code null} is always
+   * returned, so callers must null-check the result.
+   *
+   * @param language unused by the current implementation
+   * @return always {@code null}
+   */
+  public static SyntMatcher get(String language) {
+    return null;
+  }
+}

Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain