You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/01/18 21:43:56 UTC
svn commit: r1233056 - in
/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools:
similarity/apps/ textsimilarity/ textsimilarity/chunker2matcher/
Author: bgalitsky
Date: Wed Jan 18 20:43:55 2012
New Revision: 1233056
URL: http://svn.apache.org/viewvc?rev=1233056&view=rev
Log:
Demonstration of how sensitive syntactic matching is compared to the bag-of-words approach
Key: OPENNLP-413
Added:
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
Modified:
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java Wed Jan 18 20:43:55 2012
@@ -1,28 +1,10 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
package opennlp.tools.similarity.apps;
import java.util.Comparator;
-public class HitBaseComparable implements Comparator<HitBase> {
-
- public int compare(HitBase o1, HitBase o2) {
- return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1
- : (o1 == o2 ? 0 : 1));
- }
+public class HitBaseComparable implements Comparator<HitBase>{
+ @Override
+ public int compare(HitBase o1, HitBase o2) {
+ return (o1.getGenerWithQueryScore()>o2.getGenerWithQueryScore() ? -1 : (o1==o2 ? 0 : 1));
+ }
}
\ No newline at end of file
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Wed Jan 18 20:43:55 2012
@@ -14,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package opennlp.tools.similarity.apps;
import java.util.ArrayList;
@@ -27,84 +26,81 @@ import opennlp.tools.textsimilarity.Pars
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
public class SearchResultsProcessor extends BingWebQueryRunner {
- private static Logger LOG = Logger
- .getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
- private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-
- /*
- * Takes Bing API search results and calculates the parse tree similarity
- * between the question and each snippet. Ranks those snippets with higher
- * similarity score up
- */
- private BingResponse calculateMatchScoreResortHits(BingResponse resp,
- String searchQuery) {
- // TODO
- /*
- * if query is multi-sentence, special handling int indexDot =
- * searchQuery.indexOf("."); if (indexDot>0 &&
- * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new
- * MultipleSentenceQueryAnswerer(); return
- * ans.calculateMatchScoreResortHits(resp, searchQuery); }
- */
- List<HitBase> newHitList = new ArrayList<HitBase>();
- ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
- .getInstance();
-
- for (HitBase hit : resp.getHits()) {
- String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
- .replace("<b>", "").replace("</b>", "");
- snapshot = snapshot.replace("</B>", "").replace("<B>", "")
- .replace("<br>", "").replace("</br>", "").replace("...", ". ")
- .replace("|", " ").replace(">", " ");
- snapshot += " . " + hit.getTitle();
- Double score = 0.0;
- try {
- SentencePairMatchResult matchRes = sm.assessRelevance(snapshot,
- searchQuery);
- List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
- score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
- LOG.finest(score + " | " + snapshot);
- } catch (Exception e) {
- LOG.severe("Problem processing snapshot " + snapshot);
- e.printStackTrace();
- }
- hit.setGenerWithQueryScore(score);
- newHitList.add(hit);
- }
- Collections.sort(newHitList, new HitBaseComparable());
- resp.setHits(newHitList);
- LOG.info("\n\n ============= NEW ORDER ================= ");
- for (HitBase hit : newHitList) {
- LOG.info(hit.toString());
- }
-
- return resp;
- }
-
- public List<HitBase> runSearch(String query) {
- BingResponse resp = null, // obtained from bing
- newResp = null; // re-sorted based on similarity
- try {
- List<String> resultList = search(query, "", "", 30);
- resp = populateBingHit(resultList.get(0));
- // now we apply our own relevance filter
- newResp = calculateMatchScoreResortHits(resp, query);
-
- } catch (Exception e) {
- // e.printStackTrace();
- LOG.info("No search results for query '" + query);
- e.printStackTrace();
- return null;
- }
- // cast to super class
- List<HitBase> hits = new ArrayList<HitBase>();
- for (HitBase h : resp.getHits())
- hits.add((HitBase) h);
+ private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
+ private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+
+ /*
+ * Takes Bing API search results and calculates the parse tree similarity between the question and each snippet.
+ * Ranks those snippets with higher similarity score up
+ */
+ private BingResponse calculateMatchScoreResortHits(BingResponse resp, String searchQuery){
+ // TODO
+ /*if query is multi-sentence, special handling
+ int indexDot = searchQuery.indexOf(".");
+ if (indexDot>0 && indexDot<searchQuery.length()-1){
+ MultipleSentenceQueryAnswerer ans = new MultipleSentenceQueryAnswerer();
+ return ans.calculateMatchScoreResortHits(resp, searchQuery);
+ } */
+ List<HitBase> newHitList = new ArrayList<HitBase>();
+ ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
+
+ for(HitBase hit: resp.getHits()){
+ String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<b>", "").replace("</b>","");
+ snapshot=snapshot.replace("</B>", "").replace("<B>", "").replace("<br>", "").replace("</br>", "").replace("...", ". ").replace("|", " ").replace(">", " ");
+ snapshot+=" . "+hit.getTitle();
+ Double score = 0.0;
+ try {
+ SentencePairMatchResult matchRes = sm.assessRelevance(snapshot, searchQuery);
+ List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+ score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+ LOG.finest(score + " | "+ snapshot );
+ }
+ catch (Exception e){
+ LOG.severe("Problem processing snapshot "+snapshot);
+ e.printStackTrace();
+ }
+ hit.setGenerWithQueryScore(score);
+ newHitList.add(hit);
+ }
+ Collections.sort(newHitList,new HitBaseComparable());
+ resp.setHits(newHitList);
+ LOG.info("\n\n ============= NEW ORDER ================= ");
+ for(HitBase hit: newHitList){
+ LOG.info(hit.toString());
+ }
+
+
+ return resp;
+ }
+
+ public List<HitBase> runSearch(String query) {
+ BingResponse resp = null, // obtained from bing
+ newResp = null; // re-sorted based on similarity
+ try {
+ List<String> resultList = search(query, "", "", 30);
+ resp = populateBingHit(resultList.get(0));
+ // now we apply our own relevance filter
+ newResp=calculateMatchScoreResortHits(resp, query);
+
+ } catch (Exception e) {
+ // e.printStackTrace();
+ LOG.info("No search results for query '" + query);
+ e.printStackTrace();
+ return null;
+ }
+ // cast to super class
+ List<HitBase> hits = new ArrayList<HitBase>();
+ for (HitBase h : resp.getHits())
+ hits.add((HitBase) h);
+
+ hits = removeDuplicates(hits, 0.9);
+
+ return hits;
+ }
+
- hits = removeDuplicates(hits, 0.9);
- return hits;
- }
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java Wed Jan 18 20:43:55 2012
@@ -19,6 +19,7 @@ package opennlp.tools.textsimilarity;
import java.util.List;
+
import opennlp.tools.stemmer.PorterStemmer;
public class LemmaFormManager {
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Jan 18 20:43:55 2012
@@ -33,17 +33,15 @@
package opennlp.tools.textsimilarity.chunker2matcher;
+import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.commons.lang.StringUtils;
-
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
@@ -66,16 +64,16 @@ import opennlp.tools.textsimilarity.Text
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;
-import opennlp.tools.util.StringUtil;
public class ParserChunker2MatcherProcessor {
- private static final int MIN_SENTENCE_LENGTH = 10;
+ protected static final int MIN_SENTENCE_LENGTH = 10;
private static final String MODEL_DIR_KEY = "nlp.models.dir";
- private static final String MODEL_DIR ;
- private static ParserChunker2MatcherProcessor instance;
+ // TODO config
+ // this is where resources should live
+ private static String MODEL_DIR = "resources/models";
+ protected static ParserChunker2MatcherProcessor instance;
private SentenceDetector sentenceDetector;
private Tokenizer tokenizer;
@@ -85,18 +83,13 @@ public class ParserChunker2MatcherProces
private final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
private static Logger LOG = Logger.getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor");
- static {
- //TODO config
- MODEL_DIR = "C:\\workspace\\similarity\\src\\main\\resources";
- }
-
- private ParserChunker2MatcherProcessor() {
+ protected ParserChunker2MatcherProcessor() {
+ MODEL_DIR = new File(".").getAbsolutePath().replace(".", "")+MODEL_DIR;
initializeSentenceDetector();
initializeTokenizer();
initializePosTagger();
initializeParser();
initializeChunker();
-
}
public synchronized static ParserChunker2MatcherProcessor getInstance() {
@@ -175,6 +168,8 @@ public class ParserChunker2MatcherProces
LOG.log(Level.WARNING, "failed to parse the sentence : '"+sentence, t);
return null;
}
+
+ // Parse[] chunks = ChunkSampleStream.getInitialChunks(parseArray[0]) ;
// there should be only one result parse
if (parseArray != null && parseArray.length > 0)
@@ -222,6 +217,7 @@ public class ParserChunker2MatcherProces
return null;
}
List<String> POSlist = node.getOrderedPOSList();
+
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length){
LOG.info("disagreement between toks and tags; sent = '"+sentence + "'\n tags = "+tags +
@@ -245,19 +241,15 @@ public class ParserChunker2MatcherProces
} else
return null;
}
-
}
String[] res = chunker.chunk(toks, tags);
- Span[] span = chunker.chunkAsSpans(toks, tags);
- Sequence[] seq = chunker.topKSequences(toks, tags);
-
+
List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(),
prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr = new ArrayList<ParseTreeChunk>(),
adjPhr = new ArrayList<ParseTreeChunk>(),
// to store the whole sentence
wholeSentence = new ArrayList<ParseTreeChunk>();
-
List<String> pOSsAll = new ArrayList<String>(), lemmasAll = new ArrayList<String>();
for(int i = 0; i< toks.length; i++){
@@ -278,7 +270,7 @@ public class ParserChunker2MatcherProces
for(int j=i+1; j<res.length; j++){
if (res[j].startsWith("B-VP")){
nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs));
- LOG.info(i + " => " +lemmas);
+ //LOG.info(i + " => " +lemmas);
currPhraseClosed = true;
break;
} else {
@@ -288,7 +280,7 @@ public class ParserChunker2MatcherProces
}
if (!currPhraseClosed){
nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs));
- LOG.fine(i + " => " + lemmas);
+ //LOG.fine(i + " => " + lemmas);
}
} else if (bi_POS.startsWith("B-PP")){// beginning of a phrase
@@ -299,7 +291,7 @@ public class ParserChunker2MatcherProces
for(int j=i+1; j<res.length; j++){
if (res[j].startsWith("B-VP")){
prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs));
- LOG.fine(i + " => " + lemmas);
+ //LOG.fine(i + " => " + lemmas);
currPhraseClosed = true;
break;
} else {
@@ -309,7 +301,7 @@ public class ParserChunker2MatcherProces
}
if (!currPhraseClosed){
prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs));
- LOG.fine(i + " => " + lemmas);
+ //LOG.fine(i + " => " + lemmas);
}
} else
if (bi_POS.startsWith("B-VP")){// beginning of a phrase
@@ -320,7 +312,7 @@ public class ParserChunker2MatcherProces
for(int j=i+1; j<res.length; j++){
if (res[j].startsWith("B-VP")){
verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs));
- LOG.fine(i + " => " +lemmas);
+ //LOG.fine(i + " => " +lemmas);
currPhraseClosed = true;
break;
} else {
@@ -330,7 +322,7 @@ public class ParserChunker2MatcherProces
}
if (!currPhraseClosed){
verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs));
- LOG.fine(i + " => " + lemmas);
+ //LOG.fine(i + " => " + lemmas);
}
} else
if (bi_POS.startsWith("B-ADJP") ){// beginning of a phrase
@@ -341,7 +333,7 @@ public class ParserChunker2MatcherProces
for(int j=i+1; j<res.length; j++){
if (res[j].startsWith("B-VP")){
adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs));
- LOG.fine(i + " => " +lemmas);
+ //LOG.fine(i + " => " +lemmas);
currPhraseClosed = true;
break;
} else {
@@ -351,7 +343,7 @@ public class ParserChunker2MatcherProces
}
if (!currPhraseClosed){
adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs));
- LOG.fine(i + " => " + lemmas);
+ //LOG.fine(i + " => " + lemmas);
}
}
}
@@ -471,7 +463,7 @@ public class ParserChunker2MatcherProces
return tokenizer.tokenize(sentence);
}
- private void initializeSentenceDetector() {
+ protected void initializeSentenceDetector() {
InputStream is = null;
try {
is = new FileInputStream(
@@ -493,7 +485,7 @@ public class ParserChunker2MatcherProces
}
}
- private void initializeTokenizer() {
+ protected void initializeTokenizer() {
InputStream is = null;
try {
is = new FileInputStream(
@@ -513,7 +505,7 @@ public class ParserChunker2MatcherProces
}
}
- private void initializePosTagger() {
+ protected void initializePosTagger() {
InputStream is = null;
try {
is = new FileInputStream(MODEL_DIR
@@ -532,7 +524,7 @@ public class ParserChunker2MatcherProces
}
}
- private void initializeParser() {
+ protected void initializeParser() {
InputStream is = null;
try {
is = new FileInputStream(MODEL_DIR
@@ -597,7 +589,9 @@ public class ParserChunker2MatcherProces
// if this node contains children nodes, then it is a phrase node
if (childrenNodeList != null && childrenNodeList.size() > 0) {
+ //System.out.println("Found "+ type + " phrase = "+ childrenNodeList);
return new PhraseNode(type, childrenNodeList);
+
}
// otherwise, it is a word node
@@ -628,11 +622,10 @@ public class ParserChunker2MatcherProces
public SentencePairMatchResult assessRelevance(String para1, String para2)
{
- ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
- List<List<ParseTreeChunk>> sent1GrpLst = parser.formGroupedPhrasesFromChunksForPara(para1),
- sent2GrpLst = parser.formGroupedPhrasesFromChunksForPara(para2);
+ List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1),
+ sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
- List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); //TODO need to populate it!
+ List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst);
ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
@@ -640,7 +633,7 @@ public class ParserChunker2MatcherProces
return new SentencePairMatchResult(res, origChunks1);
}
- private List<LemmaPair> listListParseTreeChunk2ListLemmaPairs(
+ protected List<LemmaPair> listListParseTreeChunk2ListLemmaPairs(
List<List<ParseTreeChunk>> sent1GrpLst) {
List<LemmaPair> results = new ArrayList<LemmaPair>();
if (sent1GrpLst==null || sent1GrpLst.size() <1)
@@ -665,55 +658,4 @@ public class ParserChunker2MatcherProces
}
}
}
-
- public static void main(String[] args) throws Exception {
-
- String text = "Where do I apply? Go to your town office or city hall. If your town doesn't have an office, ask the town clerk or a Selectman. Tell them that you need a 1040 tax form . I Can 't Pay the Taxes on my House: What Can I Do?. Pine Tree Legal";
-
- /*
- * String text =
- * "I have been driving a 96 accord to death for 10 years. " +
- * "Lately it has been costing to much in repairs. " +
- * "I am looking for something 8,000-13,000. " +
- * "My last three vehicles have been Accords. " +
- * "I like them but I would like something different this time.";
- */
- /*
- * String text = "I love Fresh body styling. " + "I love lots of grip. "
- * + "I love strong engine and grippy tires. " + "I like Head turner. "
- * + "I like Right and left rearward blind spots. " +
- * "I like Great acceleration. " + "I like great noise. " +
- * "I like great brakes. " + "I like cheap feeling interior. " +
- * "I like uncomfortable seats. " + "I like nav system hard to read.";
- */
- // String sentence = "I love Fresh body styling";
- // String phrase = "I captures way more detail in high contrast scenes";
- ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
- List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text);
- System.out.println(res);
-
- //parser.printParseTree("How can I get short focus zoom lens for digital camera");
- //parser.formGroupedPhrasesFromChunksForSentence("How can I get short focus zoom lens for digital camera");
-
- System.exit(0);
-
- String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
- + "The engine makes it a powerful car. "
- + "The strong engine gives it enough power. "
- + "The strong engine gives the car a lot of power.";
- String phrase2 = "This car has a great engine. "
- + "This car has an amazingly good engine. "
- + "This car provides you a very good mileage.";
- String sentence = "Not to worry with the 2cv.";
-
-
- System.out.println(parser.assessRelevance(phrase1, phrase2));
-
-
- parser.formGroupedPhrasesFromChunksForSentence("Its classy design and the Mercedes name make it a very cool vehicle to drive. ");
- parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. ");
- parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement");
-
-
- }
}
Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java?rev=1233056&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java Wed Jan 18 20:43:55 2012
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity.chunker2matcher;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.textsimilarity.LemmaPair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor{
+ protected static ParserPure2MatcherProcessor pinstance;
+ private static Logger LOG = Logger.getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserPure2MatcherProcessor");
+
+ public synchronized static ParserPure2MatcherProcessor getInstance() {
+ if (pinstance == null)
+ pinstance = new ParserPure2MatcherProcessor();
+
+ return pinstance;
+ }
+
+ private ParserPure2MatcherProcessor() {
+ initializeSentenceDetector();
+ initializeTokenizer();
+ initializePosTagger();
+ initializeParser();
+ }
+
+ public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(String sentence) {
+ if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
+ return null;
+
+ sentence = TextProcessor.removePunctuation(sentence);
+ SentenceNode node = parseSentenceNode(sentence);
+ if (node==null){
+ LOG.info("Problem parsing sentence '"+sentence);
+ return null;
+ }
+ List<ParseTreeChunk> ptcList = node.getParseTreeChunkList();
+ List<String> POSlist = node.getOrderedPOSList();
+ List<String> TokList = node.getOrderedLemmaList();
+
+ List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
+ List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(),
+ prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr = new ArrayList<ParseTreeChunk>(),
+ adjPhr = new ArrayList<ParseTreeChunk>(),
+ // to store the whole sentence
+ wholeSentence = new ArrayList<ParseTreeChunk>();
+
+ wholeSentence.add(new ParseTreeChunk("SENTENCE", TokList, POSlist));
+ for(ParseTreeChunk phr: ptcList){
+ String phrType = phr.getMainPOS();
+ if (phrType.startsWith("NP")){
+ nounPhr.add(phr);
+ } else if (phrType.startsWith("VP")){
+ verbPhr.add(phr);
+ } else if (phrType.startsWith("PP")){
+ prepPhr.add(phr);
+ } else if (phrType.endsWith("ADJP")){
+ adjPhr.add(phr);
+ } else {
+ //LOG.info("Unexpected phrase type found :"+ phr);
+ }
+
+ }
+
+ listOfChunks.add(nounPhr);
+ listOfChunks.add(verbPhr);
+ listOfChunks.add(prepPhr);
+ listOfChunks.add(adjPhr);
+ listOfChunks.add(wholeSentence);
+
+ return listOfChunks;
+ }
+
+ public SentencePairMatchResult assessRelevance(String para1, String para2)
+ {
+
+ List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1),
+ sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
+
+ List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); //TODO need to populate it!
+
+
+ ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
+ List<List<ParseTreeChunk>> res = md.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+ return new SentencePairMatchResult(res, origChunks1);
+
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ ParserPure2MatcherProcessor parser = ParserPure2MatcherProcessor.getInstance();
+ String text = "Its classy design and the Mercedes name make it a very cool vehicle to drive. ";
+
+ List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text);
+ System.out.println(res);
+
+ // System.exit(0);
+
+
+ String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
+ + "The engine makes it a powerful car. "
+ + "The strong engine gives it enough power. "
+ + "The strong engine gives the car a lot of power.";
+ String phrase2 = "This car has a great engine. "
+ + "This car has an amazingly good engine. "
+ + "This car provides you a very good mileage.";
+ String sentence = "Not to worry with the 2cv.";
+
+
+ System.out.println(parser.assessRelevance(phrase1, phrase2).getMatchResult());
+
+
+ System.out.println(parser.formGroupedPhrasesFromChunksForSentence("Its classy design and the Mercedes name make it a very cool vehicle to drive. "));
+ System.out.println(parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. "));
+ System.out.println(parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement"));
+
+
+ }
+}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java Wed Jan 18 20:43:55 2012
@@ -128,4 +128,16 @@ public class PhraseNode extends Syntacti
types.add(getType());
return types;
}
+
+ @Override
+ public List<String> getOrderedLemmaList(){
+ List<String> types = new ArrayList<String>();
+ if (children != null && children.size() > 0) {
+ for (SyntacticTreeNode child : children) {
+ types.addAll(child.getOrderedLemmaList());
+ }
+ } else
+ types.add(getType());
+ return types;
+ }
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java Wed Jan 18 20:43:55 2012
@@ -20,6 +20,8 @@ package opennlp.tools.textsimilarity.chu
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
/**
* Sentence node is the first clause node contained in the top node
*
@@ -69,5 +71,41 @@ public class SentenceNode extends Phrase
return types;
}
+ @Override
+ public List<String> getOrderedLemmaList(){
+ List<String> types = new ArrayList<String>();
+ if (this.getChildren()!= null && this.getChildren().size() > 0) {
+ for (SyntacticTreeNode child : this.getChildren()) {
+ types.addAll(child.getOrderedLemmaList());
+ }
+ }
+ return types;
+ }
+
+ public List<ParseTreeChunk> getParseTreeChunkList(){
+ List<ParseTreeChunk> chunks = new ArrayList<ParseTreeChunk>();
+
+ if (this.getChildren()!= null && this.getChildren().size() > 0) {
+ for (SyntacticTreeNode child : this.getChildren()) {
+ // if (child.getType().endsWith("P"))
+ chunks.add(new ParseTreeChunk(child.getType(),
+ child.getOrderedPOSList(), child.getOrderedLemmaList()));
+ }
+ }
+ return chunks;
+ }
+
+
}
+
+/*
+ * [[NP [PRP$-your NN-town NN-office CC-or NN-city NN-hall ], NP [PRP$-your NN-town NN-doesn NN-t ],
+ * NP [DT-an NN-office ], NP [DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ],
+ * NP [DT-a NNP-Selectman ], NP [PRP-them IN-that PRP-you ], NP [PRP-you ], NP [DT-a CD-1040 NN-tax NN-form ], NP [PRP-I ],
+ * NP [DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ], NP [PRP$-my NNP-House WP-What MD-Can PRP-I ],
+ * NP [WP-What MD-Can PRP-I ], NP [PRP-I ], NP [NNP-Pine NNP-Tree NNP-Legal ]],
+ *
+ * [VP [VBP-do RB-I VB-apply ], VP [VB-Go TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ],
+ * VP [VBP-have DT-an NN-office ], VP [VB-ask DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ], VP [VB-Tell PRP-them IN-that PRP-you ], VP [VBP-need DT-a CD-1040 NN-tax NN-form ], VP [MD-Can VB-t VB-Pay DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ], VP [VB-Do NNP-Pine NNP-Tree NNP-Legal ]], [PP [TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ], PP [IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ]], [], [SENTENCE [WRB-Where VBP-do RB-I VB-apply ], SENTENCE [VB-Go TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ], SENTENCE [IN-If PRP$-your NN-town NN-doesn NN-t VBP-have DT-an NN-office VB-ask DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ], SENTENCE [VB-Tell PRP-them IN-that PRP-you VBP-need DT-a CD-1040 NN-tax NN-form ], SENTENCE [PRP-I MD-Can VB-t VB-Pay DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I VB-Do NNP-Pine NNP-Tree NNP-Legal ]]]
+*/
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java Wed Jan 18 20:43:55 2012
@@ -39,6 +39,8 @@ public abstract class SyntacticTreeNode
public abstract String toStringIndented(int numTabs);
public abstract List<String> getOrderedPOSList();
+
+ public abstract List<String> getOrderedLemmaList();
public SyntacticTreeNode(String type) {
this.type = type;
@@ -153,5 +155,7 @@ public abstract class SyntacticTreeNode
}
}
+
+
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java Wed Jan 18 20:43:55 2012
@@ -82,4 +82,10 @@ public class WordNode extends SyntacticT
types.add(getType());
return types;
}
+ @Override
+ public List<String> getOrderedLemmaList() {
+ List<String> types = new ArrayList<String>();
+ types.add(this.getWord());
+ return types;
+ }
}