You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/11 17:36:33 UTC
svn commit: r1181845 [4/5] - in
/incubator/opennlp/sandbox/opennlp-similarity: ./
src/main/java/opennlp/tools/similarity/
src/main/java/opennlp/tools/similarity/apps/
src/main/java/opennlp/tools/similarity/apps/utils/
src/main/java/opennlp/tools/textsi...
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.List;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class ParseTreeChunkListScorer {
+ // find the single expression with the highest score
+ public double getParseTreeChunkListScore(
+ List<List<ParseTreeChunk>> matchResult) {
+ double currScore = 0.0;
+ for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult)
+ for (ParseTreeChunk chunk : chunksGivenPhraseType) {
+ Double score = getScore(chunk);
+ // System.out.println(chunk+ " => score >>> "+score);
+ if (score > currScore) {
+ currScore = score;
+ }
+ }
+ return currScore;
+ }
+
+ // get max score per phrase type and then sum up
+ public double getParseTreeChunkListScoreAggregPhraseType(
+ List<List<ParseTreeChunk>> matchResult) {
+ double currScoreTotal = 0.0;
+ for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult) {
+ double currScorePT = 0.0;
+ for (ParseTreeChunk chunk : chunksGivenPhraseType) {
+ Double score = getScore(chunk);
+ // System.out.println(chunk+ " => score >>> "+score);
+ if (score > currScorePT) {
+ currScorePT = score;
+ }
+ }
+ // if substantial for given phrase type
+ if (currScorePT > 0.5) {
+ currScoreTotal += currScorePT;
+ }
+ }
+ return currScoreTotal;
+ }
+
+ // score is meaningful only for chunks which are results of generalization
+
+ public double getScore(ParseTreeChunk chunk) {
+ double score = 0.0;
+ int i = 0;
+ for (String l : chunk.getLemmas()) {
+ String pos = chunk.getPOSs().get(i);
+ if (l.equals("*")) {
+ if (pos.startsWith("CD")) { // number vs number gives high score
+ // although different numbers
+ score += 0.7;
+ } else if (pos.endsWith("_high")) { // if query modification adds 'high'
+ score += 1.0;
+ } else {
+ score += 0.1;
+ }
+ } else {
+
+ if (pos.startsWith("NN") || pos.startsWith("NP")
+ || pos.startsWith("CD") || pos.startsWith("RB")) {
+ score += 1.0;
+ } else if (pos.startsWith("VB") || pos.startsWith("JJ")) {
+ if (l.equals("get")) { // 'common' verbs are not that important
+ score += 0.3;
+ } else {
+ score += 0.5;
+ }
+ } else {
+ score += 0.3;
+ }
+ }
+ i++;
+
+ }
+ return score;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+/**
+ * Builds generalizations of two parse tree chunks: the sequence of
+ * part-of-speech tags, with lemmas or "*" wildcards, that both chunks share.
+ * POS similarity is delegated to {@code POSManager}, lemma matching to
+ * {@code LemmaFormManager}, and candidate ranking to
+ * {@code ParseTreeChunkListScorer}.
+ */
+@Component
+public class ParseTreeMatcher {
+
+  /** Number of randomized alignment passes performed in each direction. */
+  private static final int NUMBER_OF_ITERATIONS = 2;
+
+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+  private POSManager posManager = new POSManager();
+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+  public ParseTreeMatcher() {
+
+  }
+
+  /**
+   * Legacy generalization. Walks both POS sequences in parallel; positions
+   * with similar POS are kept (with the lemma when both lemmas are equal,
+   * otherwise "*"). On a mismatch one of the two cursors is advanced; which
+   * one is decided by a flag that is flipped on every loop iteration.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return a new chunk holding the common POS tags and lemmas
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesOLD(ParseTreeChunk chunk1,
+      ParseTreeChunk chunk2) {
+    List<String> pos1 = chunk1.getPOSs();
+    // the second tag sequence must come from chunk2; reading it from chunk1
+    // would compare chunk1 with itself
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> lemmas1 = chunk1.getLemmas();
+    List<String> lemmas2 = chunk2.getLemmas();
+
+    List<String> commonPOS = new ArrayList<String>();
+    List<String> commonLemmas = new ArrayList<String>();
+    int k1 = 0, k2 = 0;
+    boolean incrFirst = true;
+    while (k1 < pos1.size() && k2 < pos2.size()) {
+      // first check if the same POS
+      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+      if (sim != null) {
+        commonPOS.add(pos1.get(k1));
+        if (lemmas1.size() > k1 && lemmas2.size() > k2
+            && lemmas1.get(k1).equals(lemmas2.get(k2))) {
+          commonLemmas.add(lemmas1.get(k1));
+        } else {
+          commonLemmas.add("*");
+        }
+        k1++;
+        k2++;
+      } else if (incrFirst) {
+        k1++;
+      } else {
+        k2++;
+      }
+      incrFirst = !incrFirst;
+    }
+
+    return new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
+  }
+
+  /**
+   * Rewrites "A for B" as "B A": transforms { A B C prep X Y } into
+   * { A B X Y C }. The preposition itself is dropped and the word right
+   * before it is moved to the end. Should only be applied to a noun phrase.
+   *
+   * @param ch chunk to transform
+   * @return a new transformed chunk, or {@code ch} itself when its last "IN"
+   *         tag is missing, sits at index 0 or 1, or its lemma is not one of
+   *         to/on/in/of/with/by/from
+   */
+  public ParseTreeChunk prepositionalNNSTransform(ParseTreeChunk ch) {
+    List<String> poss = ch.getPOSs();
+    List<String> lemmas = ch.getLemmas();
+
+    // lastIndexOf yields -1 when there is no "IN" at all, which is also < 2
+    int indexIN = poss.lastIndexOf("IN");
+    if (indexIN < 2) {
+      return ch;
+    }
+
+    String wordIN = lemmas.get(indexIN);
+    if (!(wordIN.equals("to") || wordIN.equals("on") || wordIN.equals("in")
+        || wordIN.equals("of") || wordIN.equals("with")
+        || wordIN.equals("by") || wordIN.equals("from"))) {
+      return ch;
+    }
+
+    List<String> transfPOS = new ArrayList<String>();
+    List<String> transfLemmas = new ArrayList<String>();
+
+    // head (everything before the word preceding the preposition),
+    // then the part after the preposition, then the preceding word
+    transfPOS.addAll(poss.subList(0, indexIN - 1));
+    transfPOS.addAll(poss.subList(indexIN + 1, poss.size()));
+    transfPOS.add(poss.get(indexIN - 1));
+
+    transfLemmas.addAll(lemmas.subList(0, indexIN - 1));
+    transfLemmas.addAll(lemmas.subList(indexIN + 1, lemmas.size()));
+    transfLemmas.add(lemmas.get(indexIN - 1));
+
+    return new ParseTreeChunk(transfLemmas, transfPOS, 0, 0);
+  }
+
+  /**
+   * Generalizes the two chunks three ways (as given, with chunk1
+   * prepositionally transformed, with chunk2 prepositionally transformed)
+   * and returns the best-scoring result.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return the generalization with the highest score; on ties the later
+   *         candidate wins
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
+      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+    ParseTreeChunk chRes1 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        chunk1, chunk2);
+    ParseTreeChunk chRes2 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        prepositionalNNSTransform(chunk1), chunk2);
+    ParseTreeChunk chRes3 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+        prepositionalNNSTransform(chunk2), chunk1);
+
+    double score1 = parseTreeChunkListScorer.getScore(chRes1);
+    double score2 = parseTreeChunkListScorer.getScore(chRes2);
+    double score3 = parseTreeChunkListScorer.getScore(chRes3);
+
+    if (score1 > score2) {
+      return score1 > score3 ? chRes1 : chRes3;
+    }
+    return score2 > score3 ? chRes2 : chRes3;
+  }
+
+  /**
+   * Runs {@code NUMBER_OF_ITERATIONS} randomized left-to-right alignments
+   * followed by the same number of right-to-left alignments and keeps the
+   * candidate with the highest score (the earliest one on ties). Because
+   * mismatches are resolved with {@link Math#random()}, repeated calls may
+   * return different results.
+   *
+   * @param chunk1 first chunk
+   * @param chunk2 second chunk
+   * @return the best-scoring generalization found
+   */
+  public ParseTreeChunk generalizeTwoGroupedPhrasesRandomSelectHighestScore(
+      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+    double globalScore = -1.0;
+    ParseTreeChunk result = null;
+
+    for (int timesRun = 0; timesRun < NUMBER_OF_ITERATIONS; timesRun++) {
+      ParseTreeChunk currResult = alignRandomly(chunk1, chunk2, true);
+      double score = parseTreeChunkListScorer.getScore(currResult);
+      if (score > globalScore) {
+        result = currResult;
+        globalScore = score;
+      }
+    }
+
+    for (int timesRun = 0; timesRun < NUMBER_OF_ITERATIONS; timesRun++) {
+      ParseTreeChunk currResult = alignRandomly(chunk1, chunk2, false);
+      double score = parseTreeChunkListScorer.getScore(currResult);
+      if (score > globalScore) {
+        result = currResult;
+        globalScore = score;
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * One randomized alignment pass. Positions with acceptable POS/lemma
+   * agreement are collected; on a mismatch one of the two cursors, picked at
+   * random, is advanced.
+   *
+   * @param forward {@code true} to walk from the first position upwards,
+   *        {@code false} to walk from the last position downwards (the
+   *        collected lists are then reversed back into reading order)
+   */
+  private ParseTreeChunk alignRandomly(ParseTreeChunk chunk1,
+      ParseTreeChunk chunk2, boolean forward) {
+    List<String> pos1 = chunk1.getPOSs();
+    List<String> pos2 = chunk2.getPOSs();
+    List<String> commonPOS = new ArrayList<String>();
+    List<String> commonLemmas = new ArrayList<String>();
+
+    int step = forward ? 1 : -1;
+    int k1 = forward ? 0 : pos1.size() - 1;
+    int k2 = forward ? 0 : pos2.size() - 1;
+    while (k1 >= 0 && k1 < pos1.size() && k2 >= 0 && k2 < pos2.size()) {
+      // first check if the same POS
+      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+      // NOTE(review): the lemma lists are indexed with POS positions, so
+      // this assumes each chunk has at least as many lemmas as POS tags
+      String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1
+          .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);
+      if (acceptableLemmaAndPOS(sim, lemmaMatch)) {
+        commonPOS.add(pos1.get(k1));
+        commonLemmas.add(lemmaMatch != null ? lemmaMatch : "*");
+        k1 += step;
+        k2 += step;
+      } else if (Math.random() > 0.5) {
+        k1 += step;
+      } else {
+        k2 += step;
+      }
+    }
+
+    if (!forward) {
+      Collections.reverse(commonLemmas);
+      Collections.reverse(commonPOS);
+    }
+    return new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
+  }
+
+  /**
+   * Decides whether a position may enter the generalization.
+   *
+   * @param sim common POS of the two words, or {@code null} when the POS tags
+   *        are not similar
+   * @param lemmaMatch lemma matching outcome; {@code null} (no common lemma)
+   *        is still acceptable, the literal "fail" is a rejection
+   * @return {@code true} when {@code sim} is non-null and {@code lemmaMatch}
+   *         is not "fail"
+   */
+  public Boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {
+    if (sim == null) {
+      return false;
+    }
+    // even if lemmaMatch == null the position is acceptable; only an
+    // explicit "fail" rejects it
+    return !"fail".equals(lemmaMatch);
+  }
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+@Component
+public class ParseTreeMatcherDeterministic {
+
+ private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
+
+ private LemmaFormManager lemmaFormManager = new LemmaFormManager();
+
+ private POSManager posManager = new POSManager();
+
+ public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic(
+ ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
+ List<String> pos1 = chunk1.getPOSs();
+ List<String> pos2 = chunk2.getPOSs();
+ List<String> lem1 = chunk1.getLemmas();
+ List<String> lem2 = chunk2.getLemmas();
+
+ List<String> lem1stem = new ArrayList<String>();
+ List<String> lem2stem = new ArrayList<String>();
+
+ PorterStemmer ps = new PorterStemmer();
+ for (String word : lem1) {
+ try {
+ lem1stem.add(ps.stem(word.toLowerCase()));
+ } catch (Exception e) {
+ // e.printStackTrace();
+
+ if (word.length() > 2)
+ System.err.println("Unable to stem: " + word);
+ }
+ }
+ try {
+ for (String word : lem2) {
+ lem2stem.add(ps.stem(word.toLowerCase()));
+ }
+ } catch (Exception e) {
+ System.err.println("problem processing word " + lem2.toString());
+ }
+
+ List<String> overlap = new ArrayList(lem1stem);
+ overlap.retainAll(lem2stem);
+
+ if (overlap == null || overlap.size() < 1)
+ return null;
+
+ List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
+ for (String word : overlap) {
+ Integer i1 = lem1stem.indexOf(word);
+ Integer i2 = lem2stem.indexOf(word);
+ occur1.add(i1);
+ occur2.add(i2);
+ }
+
+ // now we search for plausible sublists of overlaps
+ // if at some position correspondence is inverse (one of two position
+ // decreases instead of increases)
+ // then we terminate current alignment accum and start a new one
+ List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
+ // starts from 1, not 0
+ List<int[]> accum = new ArrayList<int[]>();
+ accum.add(new int[] { occur1.get(0), occur2.get(0) });
+ for (int i = 1; i < occur1.size(); i++) {
+
+ if (occur1.get(i) > occur1.get(i - 1)
+ && occur2.get(i) > occur2.get(i - 1))
+ accum.add(new int[] { occur1.get(i), occur2.get(i) });
+ else {
+ overlapsPlaus.add(accum);
+ accum = new ArrayList<int[]>();
+ accum.add(new int[] { occur1.get(i), occur2.get(i) });
+ }
+ }
+ if (accum.size() > 0) {
+ overlapsPlaus.add(accum);
+ }
+
+ List<ParseTreeChunk> results = new ArrayList<ParseTreeChunk>();
+ for (List<int[]> occur : overlapsPlaus) {
+ List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
+ for (int[] column : occur) {
+ occr1.add(column[0]);
+ occr2.add(column[1]);
+ }
+
+ int ov1 = 0, ov2 = 0; // iterators over common words;
+ List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
+ // we start two words before first word
+ int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
+ // if (k1<0) k1=0; if (k2<0) k2=0;
+ Boolean bReachedCommonWord = false;
+ while (k1 < 0 || k2 < 0) {
+ k1++;
+ k2++;
+ }
+ int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
+ while (k1 <= k1max && k2 <= k2max) {
+ // first check if the same POS
+ String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
+ String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
+ lem2.get(k2), sim);
+ if ((sim != null)
+ && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
+ .equals("fail")))) {
+ commonPOS.add(pos1.get(k1));
+ if (lemmaMatch != null) {
+ commonLemmas.add(lemmaMatch);
+ // System.out.println("Added "+lemmaMatch);
+ if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
+ bReachedCommonWord = true; // now we can have different increment
+ // opera
+ else {
+ if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
+ && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
+ ov1++;
+ ov2++;
+ bReachedCommonWord = true;
+ }
+ // else
+ // System.err.println("Next match reached '"+lemmaMatch+
+ // "' | k1 - k2: "+k1 + " "+k2 +
+ // "| occur index ov1-ov2 "+
+ // ov1+" "+ov2+
+ // "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "
+ // +
+ // occr1.get(ov1) + " "+ occr2.get(ov1));
+ }
+ } else {
+ commonLemmas.add("*");
+ } // the same parts of speech, proceed to the next word in both
+ // expressions
+ k1++;
+ k2++;
+
+ } else if (!bReachedCommonWord) {
+ k1++;
+ k2++;
+ } // still searching
+ else {
+ // different parts of speech, jump to the next identified common word
+ ov1++;
+ ov2++;
+ if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
+ break;
+ // now trying to find
+ int kk1 = occr1.get(ov1) - 2, // new positions of iterators
+ kk2 = occr2.get(ov2) - 2;
+ int countMove = 0;
+ while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is
+ // behind
+ // current
+ // position,
+ // synchroneously
+ // move
+ // towards
+ // right
+ kk1++;
+ kk2++;
+ countMove++;
+ }
+ k1 = kk1;
+ k2 = kk2;
+
+ if (k1 > k1max)
+ k1 = k1max;
+ if (k2 > k2max)
+ k2 = k2max;
+ bReachedCommonWord = false;
+ }
+ }
+ ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS,
+ 0, 0);
+ results.add(currResult);
+ }
+
+ return results;
+ }
+
+ // main function to generalize two expressions grouped by phrase types
+ // returns a list of generalizations for each phrase type with filtered
+ // sub-expressions
+ public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunksDeterministic(
+ List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) {
+ List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+ // first irerate through component
+ for (int comp = 0; comp < 2 && // just np & vp
+ comp < sent1.size() && comp < sent2.size(); comp++) {
+ List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>();
+ // then iterate through each phrase in each component
+ for (ParseTreeChunk ch1 : sent1.get(comp)) {
+ for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version
+ List<ParseTreeChunk> chunkToAdd = generalizeTwoGroupedPhrasesDeterministic(
+ ch1, ch2);
+
+ if (chunkToAdd == null)
+ chunkToAdd = new ArrayList<ParseTreeChunk>();
+ // System.out.println("ch1 = "+
+ // ch1.toString()+" | ch2="+ch2.toString()
+ // +"\n result = "+chunkToAdd.toString() + "\n");
+ /*
+ * List<ParseTreeChunk> chunkToAdd1 =
+ * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic
+ * ( ParseTreeMatcher.prepositionalNNSTransform(ch1), ch2); if
+ * (chunkToAdd1!=null) chunkToAdd.addAll(chunkToAdd1);
+ * List<ParseTreeChunk> chunkToAdd2 =
+ * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic
+ * ( ParseTreeMatcher.prepositionalNNSTransform(ch2), ch1); if
+ * (chunkToAdd2!=null) chunkToAdd.addAll(chunkToAdd2);
+ */
+
+ // For generalized match not with orig sentences but with templates
+ // if (!LemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd))
+ // continue; // if the words which have to stay do not stay, proceed
+ // to other elements
+ Boolean alreadyThere = false;
+ for (ParseTreeChunk chunk : resultComps) {
+ if (chunkToAdd.contains(chunk)) {
+ alreadyThere = true;
+ break;
+ }
+
+ // }
+ }
+
+ if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) {
+ resultComps.addAll(chunkToAdd);
+ }
+
+ }
+ }
+ List<ParseTreeChunk> resultCompsRed = generalizationListReducer
+ .applyFilteringBySubsumption(resultComps);
+
+ resultComps = resultCompsRed;
+ results.add(resultComps);
+ }
+
+ return results;
+ }
+
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+public class PorterStemmer {
+ public String stem(String str) {
+ // check for zero length
+ if (str.length() > 0) {
+ // all characters must be letters
+ char[] c = str.toCharArray();
+ for (int i = 0; i < c.length; i++) {
+ if (!Character.isLetter(c[i]))
+ return "Invalid term";
+ }
+ } else {
+ return "No term entered";
+ }
+ str = step1a(str);
+ str = step1b(str);
+ str = step1c(str);
+ str = step2(str);
+ str = step3(str);
+ str = step4(str);
+ str = step5a(str);
+ str = step5b(str);
+ return str;
+ } // end stem
+
+ /**
+  * Step 1a: plural suffixes.
+  * SSES -> SS, IES -> I, SS -> SS (unchanged), S -> (removed).
+  *
+  * @param str word to process
+  * @return the word with its plural suffix reduced, or unchanged
+  */
+ protected String step1a(String str) {
+   int len = str.length();
+   // SSES -> SS and IES -> I both drop the last two characters
+   if (str.endsWith("sses") || str.endsWith("ies")) {
+     return str.substring(0, len - 2);
+   }
+   // a single trailing S is removed, a double S is kept as is
+   if (str.endsWith("s") && !str.endsWith("ss")) {
+     return str.substring(0, len - 1);
+   }
+   return str;
+ }
+
+ /**
+  * Step 1b: "-eed", "-ed" and "-ing" endings.
+  * (m > 0) EED -> EE; (*v*) ED -> removed; (*v*) ING -> removed. After
+  * removing ED or ING the remainder is passed through step1b2.
+  *
+  * @param str word to process
+  * @return the processed word, or {@code str} when no rule applies
+  */
+ protected String step1b(String str) {
+   int len = str.length();
+
+   if (str.endsWith("eed")) {
+     // an EED word is never tried against the ED rule
+     boolean reducible = stringMeasure(str.substring(0, len - 3)) > 0;
+     return reducible ? str.substring(0, len - 1) : str;
+   }
+
+   if (str.endsWith("ed")) {
+     String remainder = str.substring(0, len - 2);
+     if (containsVowel(remainder)) {
+       return step1b2(remainder);
+     }
+   }
+
+   if (str.endsWith("ing")) {
+     String remainder = str.substring(0, len - 3);
+     if (containsVowel(remainder)) {
+       return step1b2(remainder);
+     }
+   }
+
+   return str;
+ }
+
+ /**
+  * Clean-up applied by step1b after an ED/ING suffix was removed.
+  * AT -> ATE, BL -> BLE, IZ -> IZE; a double consonant other than L, S or Z
+  * loses its last letter; a word with measure 1 that ends
+  * consonant-vowel-consonant (per endsWithCVC) gets an "e" appended.
+  *
+  * @param str word remainder to tidy up
+  * @return the adjusted word, or {@code str} when no rule applies
+  */
+ protected String step1b2(String str) {
+   if (str.endsWith("at") || str.endsWith("bl") || str.endsWith("iz")) {
+     return str + "e";
+   }
+   if (endsWithDoubleConsonent(str) && !str.endsWith("l")
+       && !str.endsWith("s") && !str.endsWith("z")) {
+     return str.substring(0, str.length() - 1);
+   }
+   if (stringMeasure(str) == 1 && endsWithCVC(str)) {
+     return str + "e";
+   }
+   return str;
+ }
+
+ /**
+  * Step 1c: (*v*) Y -> I. A trailing "y" becomes "i" when the part before
+  * it contains a vowel.
+  *
+  * @param str word to process
+  * @return the word with the trailing "y" replaced, or unchanged
+  */
+ protected String step1c(String str) {
+   if (!str.endsWith("y")) {
+     return str;
+   }
+   String head = str.substring(0, str.length() - 1);
+   return containsVowel(head) ? head + "i" : str;
+ }
+
+ /**
+  * Step 2: maps double suffixes to single ones (for example "-ization" to
+  * "-ize"). Rules are tried in order; the first one whose suffix matches and
+  * whose measure condition holds is applied.
+  *
+  * NOTE(review): several rules evaluate the measure on a prefix that still
+  * contains part of the suffix (e.g. ATIONAL measures stem + "at"). That is
+  * looser than an (m > 0) test on the bare stem; it is left as is here -
+  * confirm whether this is intended.
+  *
+  * @param str word to process
+  * @return the word with its suffix mapped, or {@code str} when no rule
+  *         applies
+  */
+ protected String step2(String str) {
+   int len = str.length();
+   // (m > 0) ATIONAL -> ATE
+   if ((str.endsWith("ational"))
+       && (stringMeasure(str.substring(0, len - 5)) > 0)) {
+     return str.substring(0, len - 5) + "e";
+     // (m > 0) TIONAL -> TION
+   } else if ((str.endsWith("tional"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2);
+     // (m > 0) ENCI -> ENCE
+   } else if ((str.endsWith("enci"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     // replace the final "i" by "e"; cutting two characters would leave
+     // "...en" instead of "...ence"
+     return str.substring(0, len - 1) + "e";
+     // (m > 0) ANCI -> ANCE
+   } else if ((str.endsWith("anci"))
+       && (stringMeasure(str.substring(0, len - 1)) > 0)) {
+     return str.substring(0, len - 1) + "e";
+     // (m > 0) IZER -> IZE
+   } else if ((str.endsWith("izer"))
+       && (stringMeasure(str.substring(0, len - 1)) > 0)) {
+     return str.substring(0, len - 1);
+     // (m > 0) ABLI -> ABLE
+   } else if ((str.endsWith("abli"))
+       && (stringMeasure(str.substring(0, len - 1)) > 0)) {
+     return str.substring(0, len - 1) + "e";
+     // (m > 0) ALLI -> AL
+   } else if ((str.endsWith("alli"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2);
+     // (m > 0) ENTLI -> ENT
+   } else if ((str.endsWith("entli"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2);
+     // (m > 0) ELI -> E
+   } else if ((str.endsWith("eli"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2);
+     // (m > 0) OUSLI -> OUS
+   } else if ((str.endsWith("ousli"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2);
+     // (m > 0) IZATION -> IZE (must be tested before ATION)
+   } else if ((str.endsWith("ization"))
+       && (stringMeasure(str.substring(0, len - 5)) > 0)) {
+     return str.substring(0, len - 5) + "e";
+     // (m > 0) ATION -> ATE
+   } else if ((str.endsWith("ation"))
+       && (stringMeasure(str.substring(0, len - 3)) > 0)) {
+     return str.substring(0, len - 3) + "e";
+     // (m > 0) ATOR -> ATE
+   } else if ((str.endsWith("ator"))
+       && (stringMeasure(str.substring(0, len - 2)) > 0)) {
+     return str.substring(0, len - 2) + "e";
+     // (m > 0) ALISM -> AL
+   } else if ((str.endsWith("alism"))
+       && (stringMeasure(str.substring(0, len - 3)) > 0)) {
+     return str.substring(0, len - 3);
+     // (m > 0) IVENESS -> IVE
+   } else if ((str.endsWith("iveness"))
+       && (stringMeasure(str.substring(0, len - 4)) > 0)) {
+     return str.substring(0, len - 4);
+     // (m > 0) FULNESS -> FUL
+   } else if ((str.endsWith("fulness"))
+       && (stringMeasure(str.substring(0, len - 4)) > 0)) {
+     return str.substring(0, len - 4);
+     // (m > 0) OUSNESS -> OUS
+   } else if ((str.endsWith("ousness"))
+       && (stringMeasure(str.substring(0, len - 4)) > 0)) {
+     return str.substring(0, len - 4);
+     // (m > 0) ALITI -> AL
+   } else if ((str.endsWith("aliti"))
+       && (stringMeasure(str.substring(0, len - 3)) > 0)) {
+     return str.substring(0, len - 3);
+     // (m > 0) IVITI -> IVE
+   } else if ((str.endsWith("iviti"))
+       && (stringMeasure(str.substring(0, len - 3)) > 0)) {
+     return str.substring(0, len - 3) + "e";
+     // (m > 0) BILITI -> BLE
+   } else if ((str.endsWith("biliti"))
+       && (stringMeasure(str.substring(0, len - 5)) > 0)) {
+     return str.substring(0, len - 5) + "le";
+   }
+   return str;
+ }
+
+ /**
+  * Step 3: trims the suffixes ICATE -> IC, ATIVE -> (removed), ALIZE -> AL,
+  * ICITI -> IC, ICAL -> IC, FUL -> (removed) and NESS -> (removed). Rules are
+  * tried in that order; a rule applies when the word ends with its suffix and
+  * the measure of the shortened word is greater than zero.
+  *
+  * @param str word to process
+  * @return the shortened word, or {@code str} when no rule applies
+  */
+ protected String step3(String str) {
+   // for every rule: the suffix and how many trailing characters to drop
+   String[] suffixes = { "icate", "ative", "alize", "iciti", "ical", "ful",
+       "ness" };
+   int[] dropCounts = { 3, 5, 3, 3, 2, 3, 4 };
+
+   for (int rule = 0; rule < suffixes.length; rule++) {
+     if (str.endsWith(suffixes[rule])) {
+       String shortened = str.substring(0, str.length() - dropCounts[rule]);
+       if (stringMeasure(shortened) > 0) {
+         return shortened;
+       }
+     }
+   }
+   return str;
+ }
+
+ protected String step4(String str) {
+ if ((str.endsWith("al"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ANCE ->
+ } else if ((str.endsWith("ance"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ENCE ->
+ } else if ((str.endsWith("ence"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ER ->
+ } else if ((str.endsWith("er"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) IC ->
+ } else if ((str.endsWith("ic"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ABLE ->
+ } else if ((str.endsWith("able"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) IBLE ->
+ } else if ((str.endsWith("ible"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ANT ->
+ } else if ((str.endsWith("ant"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) EMENT ->
+ } else if ((str.endsWith("ement"))
+ && (stringMeasure(str.substring(0, str.length() - 5)) > 1)) {
+ return str.substring(0, str.length() - 5);
+ // (m > 1) MENT ->
+ } else if ((str.endsWith("ment"))
+ && (stringMeasure(str.substring(0, str.length() - 4)) > 1)) {
+ return str.substring(0, str.length() - 4);
+ // (m > 1) ENT ->
+ } else if ((str.endsWith("ent"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) and (*S or *T) ION ->
+ } else if ((str.endsWith("sion") || str.endsWith("tion"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) OU ->
+ } else if ((str.endsWith("ou"))
+ && (stringMeasure(str.substring(0, str.length() - 2)) > 1)) {
+ return str.substring(0, str.length() - 2);
+ // (m > 1) ISM ->
+ } else if ((str.endsWith("ism"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) ATE ->
+ } else if ((str.endsWith("ate"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) ITI ->
+ } else if ((str.endsWith("iti"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) OUS ->
+ } else if ((str.endsWith("ous"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) IVE ->
+ } else if ((str.endsWith("ive"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ // (m > 1) IZE ->
+ } else if ((str.endsWith("ize"))
+ && (stringMeasure(str.substring(0, str.length() - 3)) > 1)) {
+ return str.substring(0, str.length() - 3);
+ } // end if
+ return str;
+ } // end step4
+
+ /**
+  * Porter step 5a: drops a final "e" when the stem before it has measure
+  * greater than one, or measure exactly one and does not end
+  * consonant-vowel-consonant (as decided by endsWithCVC).
+  * <p>
+  * The suffix test runs first so that words not ending in "e", including the
+  * empty string, are returned untouched instead of failing in substring.
+  *
+  * @param str the word to process
+  * @return the word without its final "e", or {@code str} unchanged
+  */
+ protected String step5a(String str) {
+   if (!str.endsWith("e")) {
+     return str;
+   }
+   String stem = str.substring(0, str.length() - 1);
+   int measure = stringMeasure(stem);
+   // (m > 1) E ->
+   if (measure > 1) {
+     return stem;
+   }
+   // (m = 1 and not *o) E ->
+   if (measure == 1 && !endsWithCVC(stem)) {
+     return stem;
+   }
+   return str;
+ } // end step5a
+
+ /**
+  * Porter step 5b: reduces a final double "l" to a single "l" when the word
+  * without its last letter has measure greater than one.
+  * <p>
+  * Words shorter than two characters are returned unchanged; they cannot end
+  * in a double letter and must not reach the double-consonant check.
+  *
+  * @param str the word to process
+  * @return the word without its last "l", or {@code str} unchanged
+  */
+ protected String step5b(String str) {
+   if (str.length() < 2) {
+     return str;
+   }
+   // (m > 1 and *d and *L) ->
+   if (str.endsWith("l") && endsWithDoubleConsonent(str)) {
+     String stem = str.substring(0, str.length() - 1);
+     if (stringMeasure(stem) > 1) {
+       return stem;
+     }
+   }
+   return str;
+ } // end step5b
+
+ /*
+ * ------------------------------------------------------- The following are
+ * functions to help compute steps 1 - 5
+ * -------------------------------------------------------
+ */
+
+ // does string end with 's'?
+ protected boolean endsWithS(String str) {
+ return str.endsWith("s");
+ } // end function
+
+ /**
+  * Tells whether the word contains a vowel. Any character accepted by
+  * isVowel counts, and so does the letter 'y' anywhere in the word.
+  *
+  * @param str the word to inspect
+  * @return true when a vowel or a 'y' occurs in {@code str}
+  */
+ protected boolean containsVowel(String str) {
+   for (char letter : str.toCharArray()) {
+     if (isVowel(letter)) {
+       return true;
+     }
+   }
+   // no aeiou found; a 'y' still counts as a vowel here
+   return str.indexOf('y') > -1;
+ } // end function
+
+ // is char a vowel?
+ public boolean isVowel(char c) {
+ if ((c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u'))
+ return true;
+ else
+ return false;
+ } // end function
+
+ /**
+  * Tells whether the word ends with the same consonant written twice.
+  * The final pair counts as consonants when containsVowel rejects it, so a
+  * trailing "yy" is not a double consonant.
+  * <p>
+  * Words shorter than two characters yield false instead of failing on an
+  * out-of-range index.
+  *
+  * @param str the word to inspect
+  * @return true when the last two characters are equal and contain no vowel
+  */
+ protected boolean endsWithDoubleConsonent(String str) {
+   int length = str.length();
+   if (length < 2) {
+     return false;
+   }
+   if (str.charAt(length - 1) != str.charAt(length - 2)) {
+     return false;
+   }
+   return !containsVowel(str.substring(length - 2));
+ } // end function
+
+ /**
+  * Computes the measure of a word: the number of times a run of vowels is
+  * directly followed by a non-vowel, scanning left to right. Characters are
+  * classified by isVowel only.
+  *
+  * @param str the word (or stem) to measure
+  * @return the number of vowel-to-consonant transitions in {@code str}
+  */
+ protected int stringMeasure(String str) {
+   int measure = 0;
+   boolean inVowelRun = false;
+   for (int pos = 0, end = str.length(); pos < end; pos++) {
+     boolean vowel = isVowel(str.charAt(pos));
+     // a consonant closing a vowel run completes one VC pair
+     if (inVowelRun && !vowel) {
+       measure++;
+     }
+     inVowelRun = vowel;
+   }
+   return measure;
+ } // end function
+
+ // does stem end with CVC?
+ protected boolean endsWithCVC(String str) {
+ char c, v, c2 = ' ';
+ if (str.length() >= 3) {
+ c = str.charAt(str.length() - 1);
+ v = str.charAt(str.length() - 2);
+ c2 = str.charAt(str.length() - 3);
+ } else {
+ return false;
+ }
+
+ if ((c == 'w') || (c == 'x') || (c == 'y')) {
+ return false;
+ } else if (isVowel(c)) {
+ return false;
+ } else if (!isVowel(v)) {
+ return false;
+ } else if (isVowel(c2)) {
+ return false;
+ } else {
+ return true;
+ }
+ } // end function
+}
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/PorterStemmer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+
+ /**
+  * Result of matching a mined sentence against another sentence: the matched
+  * chunk groups, the lemma/POS pairs of the mined sentence, and two flags
+  * derived from those pairs in the constructor.
+  */
+ public class SentencePairMatchResult {
+   /** Matched chunk groups as handed to the constructor or setter. */
+   public List<List<ParseTreeChunk>> matchResult;
+
+   /** Lemma/POS pairs of the mined sentence the flags were derived from. */
+   private List<LemmaPair> resForMinedSent1;
+
+   /** True when a pair tagged VB* with an alphabetic lemma longer than two characters was seen. */
+   public boolean verbExists = false;
+
+   /** True when the sentence was classified as imperative / sales-pitch like. */
+   public boolean imperativeVerb = false;
+
+   /**
+    * Stores both arguments and derives {@link #verbExists} and
+    * {@link #imperativeVerb} from the lemma pairs.
+    * <p>
+    * A null pair list, or one with fewer than three entries, is accepted: the
+    * flags simply stay false where the corresponding evidence is missing.
+    *
+    * @param matchResult matched chunk groups
+    * @param resForMinedSent1 lemma/POS pairs of the mined sentence, may be null
+    */
+   public SentencePairMatchResult(List<List<ParseTreeChunk>> matchResult,
+       List<LemmaPair> resForMinedSent1) {
+     System.out.println("Assessing sentence for inclusion " + resForMinedSent1);
+     this.matchResult = matchResult;
+     this.resForMinedSent1 = resForMinedSent1;
+     if (resForMinedSent1 == null) {
+       return; // nothing to analyse, both flags keep their default
+     }
+
+     for (LemmaPair word : resForMinedSent1) {
+       if (word.getPOS().startsWith("VB") && word.getLemma().length() > 2
+           && StringUtils.isAlpha(word.getLemma())) {
+         verbExists = true;
+         System.out.println("Found verb=" + word);
+       }
+     }
+
+     // various form of sales pitch: 'get something', or 'we offer'
+     if (lemmaStartsWithWe(resForMinedSent1, 1)
+         || lemmaStartsWithWe(resForMinedSent1, 2)) {
+       imperativeVerb = true;
+     }
+     // a VB* pair whose start and end positions are both below 1
+     for (LemmaPair word : resForMinedSent1) {
+       if (word.getPOS().startsWith("VB") && word.getStartPos() < 1
+           && word.getEndPos() < 1) {
+         imperativeVerb = true;
+         System.out.println("Found imperative verb=" + word);
+       }
+     }
+   }
+
+   /** Bounds-checked test whether the lemma at the given index starts with "We". */
+   private static boolean lemmaStartsWithWe(List<LemmaPair> pairs, int index) {
+     return index < pairs.size()
+         && pairs.get(index).getLemma().startsWith("We");
+   }
+
+   public List<List<ParseTreeChunk>> getMatchResult() {
+     return matchResult;
+   }
+
+   public void setMatchResult(List<List<ParseTreeChunk>> matchResult) {
+     this.matchResult = matchResult;
+   }
+
+   public List<LemmaPair> getResForMinedSent1() {
+     return resForMinedSent1;
+   }
+
+   public void setResForMinedSent1(List<LemmaPair> resForMinedSent1) {
+     this.resForMinedSent1 = resForMinedSent1;
+   }
+
+   public boolean isVerbExists() {
+     return verbExists;
+   }
+
+   public void setVerbExists(boolean verbExists) {
+     this.verbExists = verbExists;
+   }
+
+   public boolean isImperativeVerb() {
+     return imperativeVerb;
+   }
+
+   public void setImperativeVerb(boolean imperativeVerb) {
+     this.imperativeVerb = imperativeVerb;
+   }
+
+ }
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SentencePairMatchResult.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,399 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.lang.english.SentenceDetector;
+import opennlp.tools.lang.english.Tokenizer;
+import opennlp.tools.lang.english.TreebankParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.chunking.Parser;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.util.Span;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+
+ /**
+  * Matches sentences syntactically: text is split into sentences, each
+  * sentence is parsed with the OpenNLP treebank parser, the parse is flattened
+  * into LemmaPair lists and those are grouped into ParseTreeChunk lists that
+  * the chunk matchers compare.
+  * <p>
+  * Grouped chunks are cached per input string in a plain HashMap that is
+  * neither bounded nor synchronized.
+  */
+ public class SyntMatcher {
+   public static final String resourcesDir = (System.getProperty("os.name")
+       .toLowerCase().indexOf("win") > -1 ? "C:/workspace/ZSearch/resources_external"
+       : "/var/search/solr-1.2/resources");
+
+   static private SyntMatcher m_SyntMatcher = null;
+
+   private static final Logger LOG = LoggerFactory.getLogger(SyntMatcher.class);
+
+   private SentenceDetectorME sentenceDetectorME = null;
+
+   private Tokenizer tokenizer = null;
+
+   private Parser parser = null;
+
+   private final boolean useTagDict = true;
+
+   private final boolean useCaseInsensitiveTagDict = false;
+
+   private final int beamSize = Parser.defaultBeamSize;
+
+   private final double advancePercentage = Parser.defaultAdvancePercentage;
+
+   /** Grouped chunks keyed by the (normalised) text they were built from. */
+   private Map<String, List<List<ParseTreeChunk>>> parsingsCache = new HashMap<String, List<List<ParseTreeChunk>>>();
+
+   // Created eagerly: nothing in this class injects or assigns the scorer.
+   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+
+   private ParseTreeMatcherDeterministic parseTreeMatcherDeterministic = new ParseTreeMatcherDeterministic();
+
+   /**
+    * Returns the shared SyntMatcher, creating it on first use with the models
+    * found under {@link #resourcesDir} + "/models".
+    *
+    * @return the shared instance; it is returned even when loading failed
+    */
+   static public synchronized SyntMatcher getInstance() {
+     return getInstance(resourcesDir);
+   }
+
+   /**
+    * Returns the shared SyntMatcher, creating it on first use with the models
+    * found under {@code resourceDirSpec + "/models"}. Once the instance
+    * exists the argument is ignored. A loading failure is logged with its
+    * cause and the (unloaded) instance is still returned.
+    *
+    * @param resourceDirSpec directory that contains the "models" folder
+    * @return the shared instance
+    */
+   static public synchronized SyntMatcher getInstance(String resourceDirSpec) {
+     String dir = resourceDirSpec + "/models";
+     if (m_SyntMatcher == null) {
+       m_SyntMatcher = new SyntMatcher();
+
+       try {
+         m_SyntMatcher.loadOpenNLP(dir);
+       } catch (Exception e) {
+         LOG.error("Problem loading openNLP! ", e);
+       }
+     }
+     return m_SyntMatcher;
+   }
+
+   /** Creates a matcher without loading any model; used by getInstance. */
+   public SyntMatcher() {
+   }
+
+   /**
+    * Creates a matcher and loads the models from the given directory. An
+    * IOException during loading is logged, not propagated.
+    *
+    * @param resourcesDir directory passed to loadOpenNLP
+    */
+   public SyntMatcher(String resourcesDir) {
+     try {
+       loadOpenNLP(resourcesDir);
+     } catch (IOException e) {
+       LOG.error("Problem loading openNLP! ", e);
+     }
+   }
+
+   /**
+    * Creates a matcher and loads the models for the given language. An
+    * IOException during loading is logged, not propagated.
+    *
+    * @param resourcesDir directory passed to loadOpenNLP
+    * @param language language code passed to loadOpenNLP
+    */
+   public SyntMatcher(String resourcesDir, String language) {
+     try {
+       loadOpenNLP(resourcesDir, language);
+     } catch (IOException e) {
+       LOG.error("Problem loading openNLP! ", e);
+     }
+   }
+
+   /**
+    * Loads sentence detector, tokenizer and parser from fixed file names
+    * below {@code dir}.
+    *
+    * @param dir models directory
+    * @throws IOException when a model cannot be read
+    */
+   protected void loadOpenNLP(String dir) throws IOException {
+     sentenceDetectorME = new SentenceDetector(dir
+         + "/sentdetect/EnglishSD.bin.gz");
+     tokenizer = new Tokenizer(dir + "/tokenize/EnglishTok.bin.gz");
+     parser = (Parser) TreebankParser.getParser(dir + "/parser", useTagDict,
+         useCaseInsensitiveTagDict, beamSize, advancePercentage);
+   }
+
+   /**
+    * Loads the models when {@code lang} is "es" (case-insensitive); for any
+    * other value nothing is loaded and a warning is logged.
+    * NOTE(review): the "es" branch loads the English model files - confirm
+    * which language this overload is meant to serve.
+    *
+    * @param dir models directory
+    * @param lang language code, may be null
+    * @throws IOException when a model cannot be read
+    */
+   protected void loadOpenNLP(String dir, String lang) throws IOException {
+     if ("es".equalsIgnoreCase(lang)) {
+       loadOpenNLP(dir);
+     } else {
+       LOG.warn("No OpenNLP models loaded for language '{}'", lang);
+     }
+   }
+
+   /**
+    * Tokenizes and parses a line, requesting two parses, and keeps the leading
+    * parses up to the first drop in probability larger than
+    * {@code confidence}.
+    * NOTE(review): the argument {@code p} is not used; parsing always goes
+    * through this matcher's own parser.
+    *
+    * @param line text to parse
+    * @param p unused
+    * @param confidence largest tolerated probability drop between neighbours
+    * @return the retained parses; an empty array when parsing failed
+    */
+   // TODO is synchronized needed here?
+   public synchronized Parse[] parseLine(String line, Parser p, double confidence) {
+     String joined = joinTokens(tokenizer.tokenize(line));
+
+     Parse[] ps = null;
+     try {
+       ps = TreebankParser.parseLine(joined, parser, 2);
+     } catch (Exception e) {
+       // unable to parse for whatever reason
+       LOG.error("Problem parsing " + joined, e);
+     }
+     if (ps == null) {
+       return new Parse[0];
+     }
+
+     int i = 1;
+     for (; i < ps.length; i++) {
+       if (ps[i - 1].getProb() - ps[i].getProb() > confidence) {
+         break;
+       }
+     }
+     if (i < ps.length) {
+       Parse[] retp = new Parse[i];
+       System.arraycopy(ps, 0, retp, 0, i);
+       return retp;
+     }
+     return ps;
+   }
+
+   /**
+    * Returns, for up to five parses of the line, the absolute parse
+    * probability divided by the number of tokens. When parsing fails every
+    * entry is -20.0. Entries beyond the number of parses obtained stay null.
+    *
+    * @param line text to parse
+    * @return array of five values as described above
+    */
+   // TODO is synchronized needed here?
+   protected synchronized Double[] getPhrasingAcceptabilityData(String line) {
+     int nParsings = 5;
+     String[] tokens = tokenizer.tokenize(line);
+     int numWords = tokens.length;
+     Double[] result = new Double[5];
+
+     Parse[] ps = null;
+     try {
+       ps = TreebankParser.parseLine(joinTokens(tokens), parser, nParsings);
+     } catch (Exception e) {
+       // unable to parse for whatever reason
+       LOG.warn("Problem parsing " + line, e);
+     }
+     if (ps == null) {
+       for (int i = 0; i < result.length; i++) {
+         result[i] = -20.0;
+       }
+       return result;
+     }
+
+     int limit = Math.min(ps.length, result.length);
+     for (int i = 0; i < limit; i++) {
+       result[i] = Math.abs(ps[i].getProb() / (double) numWords);
+     }
+     return result;
+   }
+
+   /** Joins tokens into one string, each token followed by a single space. */
+   private static String joinTokens(String[] tokens) {
+     StringBuilder sb = new StringBuilder();
+     for (String t : tokens) {
+       sb.append(t).append(" ");
+     }
+     return sb.toString();
+   }
+
+   /**
+    * @param p parse node to inspect
+    * @return true when every direct child of {@code p} is a POS tag node
+    */
+   protected boolean allChildNodesArePOSTags(Parse p) {
+     Parse[] subParses = p.getChildren();
+     for (int pi = 0; pi < subParses.length; pi++) {
+       if (!subParses[pi].isPosTag()) {
+         return false;
+       }
+     }
+     return true;
+   }
+
+   /**
+    * Collects the text of every NP node whose children are all POS tags,
+    * descending into other non-POS nodes.
+    *
+    * @param p root of the (sub)tree to search
+    * @return covered text of the matching NP nodes, in traversal order
+    */
+   protected ArrayList<String> getNounPhrases(Parse p) {
+     ArrayList<String> nounphrases = new ArrayList<String>();
+
+     Parse[] subparses = p.getChildren();
+     for (int pi = 0; pi < subparses.length; pi++) {
+       if (subparses[pi].getType().equals("NP")
+           && allChildNodesArePOSTags(subparses[pi])) {
+         Span _span = subparses[pi].getSpan();
+         nounphrases
+             .add(p.getText().substring(_span.getStart(), _span.getEnd()));
+       } else if (!subparses[pi].isPosTag()) {
+         nounphrases.addAll(getNounPhrases(subparses[pi]));
+       }
+     }
+
+     return nounphrases;
+   }
+
+   /**
+    * Flattens a parse tree into LemmaPair entries: one per node (type, covered
+    * text, start offset), each node followed by the entries of its subtree.
+    *
+    * @param p root of the (sub)tree to flatten
+    * @return the pairs in pre-order
+    */
+   public List<LemmaPair> getAllPhrasesTWPairs(Parse p) {
+     List<LemmaPair> LemmaPairs = new ArrayList<LemmaPair>();
+
+     Parse[] subparses = p.getChildren();
+     for (int pi = 0; pi < subparses.length; pi++) {
+       Span _span = subparses[pi].getSpan();
+       String expr = p.getText().substring(_span.getStart(), _span.getEnd());
+
+       LemmaPairs.add(new LemmaPair(subparses[pi].getType(), expr, _span
+           .getStart()));
+       if (!subparses[pi].isPosTag()) {
+         LemmaPairs.addAll(getAllPhrasesTWPairs(subparses[pi]));
+       }
+     }
+
+     return LemmaPairs;
+   }
+
+   /** Pairs of the best parse of one sentence; empty when it could not be parsed. */
+   private List<LemmaPair> bestParsePairs(String sentence) {
+     Parse[] parses = parseLine(sentence, parser, 1);
+     if (parses.length == 0) {
+       return new ArrayList<LemmaPair>();
+     }
+     return getAllPhrasesTWPairs(parses[0]);
+   }
+
+   /** Splits text into sentences and concatenates the pairs of each sentence's best parse. */
+   private List<LemmaPair> collectPairs(String text) {
+     List<LemmaPair> pairs = new ArrayList<LemmaPair>();
+     for (String sentence : sentenceDetectorME.sentDetect(text)) {
+       pairs.addAll(bestParsePairs(sentence));
+     }
+     return pairs;
+   }
+
+   /** Splits text into sentences and returns one pair list per sentence that could be parsed. */
+   private List<List<LemmaPair>> parseEachSentence(String text) {
+     List<List<LemmaPair>> perSentence = new ArrayList<List<LemmaPair>>();
+     for (String sentence : sentenceDetectorME.sentDetect(text)) {
+       Parse[] parses = parseLine(sentence, parser, 1);
+       if (parses.length > 0) {
+         perSentence.add(getAllPhrasesTWPairs(parses[0]));
+       }
+     }
+     return perSentence;
+   }
+
+   /** Builds chunks from the pairs and groups them via the given ParseTreeChunk. */
+   private List<List<ParseTreeChunk>> groupChunks(ParseTreeChunk matcher,
+       List<LemmaPair> pairs) {
+     List<ParseTreeChunk> chunks = matcher.buildChunks(pairs);
+     return matcher.groupChunksAsParses(chunks);
+   }
+
+   /** Returns the cached grouped chunks for the text, computing and caching them on a miss. */
+   private List<List<ParseTreeChunk>> cachedGroupedChunks(
+       ParseTreeChunk matcher, String text) {
+     List<List<ParseTreeChunk>> grouped = parsingsCache.get(text);
+     if (grouped == null) {
+       List<LemmaPair> pairs = collectPairs(text);
+       grouped = groupChunks(matcher, pairs);
+       parsingsCache.put(text, grouped);
+       LOG.debug("Parsed chunks: {}", pairs);
+     }
+     return grouped;
+   }
+
+   /**
+    * Parses both inputs as single sentences and matches the pair lists of
+    * their best parses. An input that cannot be parsed contributes an empty
+    * pair list.
+    *
+    * @param sent1 first sentence
+    * @param sent2 second sentence
+    * @return result of ParseTreeChunk.matchTwoSentencesGivenPairLists
+    */
+   protected List<List<ParseTreeChunk>> matchOrigSentences(String sent1,
+       String sent2) {
+     // with tokenizer now
+     List<LemmaPair> origChunks1 = bestParsePairs(sent1);
+     List<LemmaPair> origChunks2 = bestParsePairs(sent2);
+     LOG.debug("Parsed chunks: {}", origChunks1);
+     LOG.debug("Parsed chunks: {}", origChunks2);
+
+     ParseTreeChunk matcher = new ParseTreeChunk();
+     return matcher.matchTwoSentencesGivenPairLists(origChunks1, origChunks2);
+   }
+
+   /**
+    * Matches two texts via their grouped chunks, reusing cached groupings.
+    * Before parsing, "'s" is detached from the preceding word and ":" is
+    * replaced by a space.
+    *
+    * @param sent1 first text (may contain several sentences)
+    * @param sent2 second text (may contain several sentences)
+    * @return result of matchTwoSentencesGroupedChunksDeterministic
+    */
+   public List<List<ParseTreeChunk>> matchOrigSentencesCache(String sent1,
+       String sent2) {
+     sent1 = sent1.replace("'s", " 's").replace(":", " ");
+     sent2 = sent2.replace("'s", " 's").replace(":", " ");
+
+     ParseTreeChunk matcher = new ParseTreeChunk();
+     List<List<ParseTreeChunk>> sent1GrpLst = cachedGroupedChunks(matcher,
+         sent1);
+     List<List<ParseTreeChunk>> sent2GrpLst = cachedGroupedChunks(matcher,
+         sent2);
+
+     return parseTreeMatcherDeterministic
+         .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+   }
+
+   /**
+    * Matches a mined text against a second text and wraps the match together
+    * with the mined text's lemma pairs. The mined text is always re-parsed
+    * (its pairs are needed for the result) and its grouping is written to the
+    * cache; the second text uses the cache.
+    *
+    * @param minedSent1 mined text
+    * @param sent2 text to compare against
+    * @return match result plus the lemma pairs of {@code minedSent1}
+    */
+   public SentencePairMatchResult assessRelevance(String minedSent1, String sent2) {
+     // presumably "âs" is a mis-decoded typographic apostrophe + s - verify
+     minedSent1 = minedSent1.replace("'s", " 's").replace(":", " ")
+         .replace("âs", " 's");
+     sent2 = sent2.replace("'s", " 's").replace(":", " ").replace("âs", " 's");
+
+     ParseTreeChunk matcher = new ParseTreeChunk();
+
+     List<LemmaPair> origChunks1 = collectPairs(minedSent1);
+     List<List<ParseTreeChunk>> sent1GrpLst = groupChunks(matcher, origChunks1);
+     parsingsCache.put(minedSent1, sent1GrpLst);
+
+     List<List<ParseTreeChunk>> sent2GrpLst = cachedGroupedChunks(matcher,
+         sent2);
+
+     ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
+     List<List<ParseTreeChunk>> res = md
+         .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+     return new SentencePairMatchResult(res, origChunks1);
+   }
+
+   /**
+    * For every sentence of {@code para1}, finds the sentence of
+    * {@code classStr} whose match scores highest and maps the string form of
+    * the paragraph sentence's pair list to the pair list of that best class
+    * sentence. Sentences that cannot be parsed are left out.
+    *
+    * @param para1 paragraph text
+    * @param classStr text with the class representative sentences
+    * @return map from pair-list string to best matching class pair list
+    */
+   public Map<String, List<LemmaPair>> findMappingBetweenSentencesOfAParagraphAndAClassReps(
+       String para1, String classStr) {
+     ParseTreeChunk matcher = new ParseTreeChunk();
+
+     List<List<LemmaPair>> parseSentList = parseEachSentence(para1);
+     List<List<LemmaPair>> parseClassList = parseEachSentence(classStr);
+
+     Map<String, List<LemmaPair>> sentence_bestClassRep = new HashMap<String, List<LemmaPair>>();
+     for (List<LemmaPair> chunksSent : parseSentList) {
+       Double maxScore = -1.0;
+       for (List<LemmaPair> chunksClass : parseClassList) {
+         List<List<ParseTreeChunk>> matchResult = matcher
+             .matchTwoSentencesGivenPairLists(chunksSent, chunksClass);
+         Double score = parseTreeChunkListScorer
+             .getParseTreeChunkListScore(matchResult);
+         if (score > maxScore) {
+           maxScore = score;
+           sentence_bestClassRep.put(chunksSent.toString(), chunksClass);
+         }
+       }
+     }
+     return sentence_bestClassRep;
+   }
+
+   public SentenceDetectorME getSentenceDetectorME() {
+     return sentenceDetectorME;
+   }
+
+   public Parser getParser() {
+     return parser;
+   }
+ }
+
+// -Xms500M -Xmx500M
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+ /**
+  * Spring configuration that publishes a SyntMatcher bean built from the
+  * directory named by the {@code resources.dir} property.
+  */
+ @Configuration
+ public class SyntMatcherConfiguration {
+   /** Directory handed to the SyntMatcher constructor. */
+   @Value("${resources.dir}")
+   private String resourcesDir;
+
+   /**
+    * @return a new SyntMatcher constructed with the configured directory
+    */
+   @Bean
+   public SyntMatcher syntMatcher() {
+     final SyntMatcher matcher = new SyntMatcher(this.resourcesDir);
+     return matcher;
+   }
+ }
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherConfiguration.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java?rev=1181845&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java Tue Oct 11 15:36:31 2011
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity;
+
+import org.springframework.stereotype.Component;
+
+ /**
+ * Static access point for obtaining a SyntMatcher by language.
+ * NOTE(review): the lookup is not implemented here - get ignores its
+ * argument and always returns null, so callers must handle a null result.
+ */
+ @Component
+ public class SyntMatcherFactory {
+ // Placeholder: the language argument is unused and null is returned.
+ public static SyntMatcher get(String language) {
+ return null;
+ }
+ }
Propchange: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/SyntMatcherFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain