You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/22 13:15:38 UTC
svn commit: r1187691 - in /incubator/opennlp/sandbox/opennlp-similarity/src:
main/java/opennlp/tools/textsimilarity/chunker2matcher/
test/java/opennlp/tools/textsimilarity/
Author: joern
Date: Sat Oct 22 11:15:38 2011
New Revision: 1187691
URL: http://svn.apache.org/viewvc?rev=1187691&view=rev
Log:
OPENNLP-331 Added functions that replace the POS tagger's output with the POS tags produced by the parser. Thanks to Boris Galitsky for providing the patch.
Modified:
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Sat Oct 22 11:15:38 2011
@@ -96,7 +96,7 @@ public class ParserChunker2MatcherProces
initializePosTagger();
initializeParser();
initializeChunker();
-
+
}
public synchronized static ParserChunker2MatcherProcessor getInstance() {
@@ -214,17 +214,22 @@ public class ParserChunker2MatcherProces
sentence = TextProcessor.removePunctuation(sentence);
String[] toks = tokenizer.tokenize(sentence);
- String[] tags = posTagger.tag(toks);
+ String[] tags = new String[toks.length]; //posTagger.tag(toks);
+ int t=0;
+ SentenceNode node = parseSentenceNode(sentence);
+ List<String> POSlist = node.getOrderedPOSList();
+ tags = POSlist.toArray(new String[0]);
+
String[] res = chunker.chunk(toks, tags);
Span[] span = chunker.chunkAsSpans(toks, tags);
Sequence[] seq = chunker.topKSequences(toks, tags);
- // correction for chunking tags
+ /* correction for chunking tags
for(int i=0; i< toks.length; i++){
if (toks[i].equalsIgnoreCase("is")){
res[i] = "B-VP";
}
- }
+ } */
List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(),
@@ -636,6 +641,16 @@ public class ParserChunker2MatcherProces
return results;
}
+ public void printParseTree(String phrase1){
+ ParserChunker2MatcherProcessor p = ParserChunker2MatcherProcessor.getInstance();
+ List<List<SentenceNode>> nodeListList = p.parseTextNode(phrase1);
+ for (List<SentenceNode> nodeList : nodeListList) {
+ for (SentenceNode node : nodeList) {
+ System.out.println(node);
+ }
+ }
+ }
+
public static void main(String[] args) throws Exception {
@@ -658,6 +673,12 @@ public class ParserChunker2MatcherProces
*/
// String sentence = "I love Fresh body styling";
// String phrase = "I captures way more detail in high contrast scenes";
+ ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
+ parser.printParseTree("How can I get short focus zoom lens for digital camera");
+ parser.formGroupedPhrasesFromChunksForSentence("How can I get short focus zoom lens for digital camera");
+
+ System.exit(0);
+
String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
+ "The engine makes it a powerful car. "
+ "The strong engine gives it enough power. "
@@ -666,7 +687,7 @@ public class ParserChunker2MatcherProces
+ "This car has an amazingly good engine. "
+ "This car provides you a very good mileage.";
String sentence = "Not to worry with the 2cv.";
- ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
+
System.out.println(parser.assessRelevance(phrase1, phrase2));
@@ -675,11 +696,6 @@ public class ParserChunker2MatcherProces
parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. ");
parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement");
- List<List<SentenceNode>> nodeListList = parser.parseTextNode(phrase1);
- for (List<SentenceNode> nodeList : nodeListList) {
- for (SentenceNode node : nodeList) {
- System.out.println(node);
- }
- }
+
}
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java Sat Oct 22 11:15:38 2011
@@ -116,4 +116,16 @@ public class PhraseNode extends Syntacti
return builder.toString();
}
+
+ @Override
+ public List<String> getOrderedPOSList(){
+ List<String> types = new ArrayList<String>();
+ if (children != null && children.size() > 0) {
+ for (SyntacticTreeNode child : children) {
+ types.addAll(child.getOrderedPOSList());
+ }
+ } else
+ types.add(getType());
+ return types;
+ }
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java Sat Oct 22 11:15:38 2011
@@ -17,6 +17,7 @@
package opennlp.tools.textsimilarity.chunker2matcher;
+import java.util.ArrayList;
import java.util.List;
/**
@@ -56,4 +57,15 @@ public class SentenceNode extends Phrase
return builder.toString();
}
+
+ @Override
+ public List<String> getOrderedPOSList(){
+ List<String> types = new ArrayList<String>();
+ if (this.getChildren()!= null && this.getChildren().size() > 0) {
+ for (SyntacticTreeNode child : this.getChildren()) {
+ types.addAll(child.getOrderedPOSList());
+ }
+ }
+ return types;
+ }
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java Sat Oct 22 11:15:38 2011
@@ -36,6 +36,8 @@ public abstract class SyntacticTreeNode
public abstract String getLemma(boolean removeStopWord);
public abstract String toStringIndented(int numTabs);
+
+ public abstract List<String> getOrderedPOSList();
public SyntacticTreeNode(String type) {
this.type = type;
@@ -146,4 +148,6 @@ public abstract class SyntacticTreeNode
}
}
}
+
+
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java Sat Oct 22 11:15:38 2011
@@ -17,6 +17,7 @@
package opennlp.tools.textsimilarity.chunker2matcher;
+import java.util.ArrayList;
import java.util.List;
public class WordNode extends SyntacticTreeNode {
@@ -74,4 +75,11 @@ public class WordNode extends SyntacticT
public static void main(String[] args) {
}
+
+ @Override
+ public List<String> getOrderedPOSList() {
+ List<String> types = new ArrayList<String>();
+ types.add(getType());
+ return types;
+ }
}
Modified: incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java Sat Oct 22 11:15:38 2011
@@ -31,75 +31,80 @@ import org.junit.runner.RunWith;
public class SyntMatcherTest extends TestCase {
- private ParserChunker2MatcherProcessor parserChunker2MatcherOlderOpenNLP;
+ private ParserChunker2MatcherProcessor parserChunker2Matcher;
- private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+ private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
- public void notNullTest() {
- parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
- assertNotNull(parserChunker2MatcherOlderOpenNLP);
- }
-
- public void testMatch() {
- parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
- List<List<ParseTreeChunk>> matchResult = parserChunker2MatcherOlderOpenNLP
- .assessRelevance(
- // "Can I get auto focus lens for digital camera",
- // "How can I get short focus zoom lens for digital camera"
- "Pulitzer Prize-Winning Reporter is an Illegal Immigrant",
- "Gay Pulitzer Prize-Winning Reporter Jose Antonio Vargas Comes Out as Undocumented " +
- "Immigrant Jose Antonio Vargas, a gay journalist who won a Pulitzer Prize " +
- "for his coverage of the Virginia Tech shootings in the Washington Post")
- .getMatchResult();
-
- System.out.println(matchResult);
- assertEquals( "[[ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ], [NNP-immigrant ]], []]",
- matchResult.toString());
- System.out.println(parseTreeChunk.listToString(matchResult));
- assertEquals(" np [ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ], [NNP-immigrant ]]",
- parseTreeChunk.listToString(matchResult));
-
- matchResult = parserChunker2MatcherOlderOpenNLP
- .assessRelevance(
- "Sounds too good to be true but it actually is, the world's first flying car is finally here. ",
- "While it may seem like something straight out of a sci-fi " +
- "movie, the flying car might soon become a reality. ").getMatchResult();
-
- System.out.println(matchResult);
- // was "[[ [DT-the NN-* VBG-flying NN-car ]], []]"
- assertEquals("[[ [PRP-it ], [DT-the NN-* ], [NN-flying NN-car ]], [ [DT-the NN-* ], [NN-* ]]]",
- matchResult.toString()
- );
- System.out.println(parseTreeChunk.listToString(matchResult));
- assertEquals( " np [ [PRP-it ], [DT-the NN-* ], [NN-flying NN-car ]] vp [ [DT-the NN-* ], [NN-* ]]",
- parseTreeChunk.listToString(matchResult));
-
- }
-
-
- public void testMatchDigitalCamera() {
- parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
- List<List<ParseTreeChunk>> matchResult = parserChunker2MatcherOlderOpenNLP.assessRelevance(
- "I am curious how to use the digital zoom of this camera for filming insects",
- "How can I get short focus zoom lens for digital camera").getMatchResult();
-
- System.out.println(matchResult);
- assertEquals("[[ [PRP-i ], [JJ-digital NN-* ], [NN-* IN-for ], [NN-camera ]], [ [JJ-digital NN-* ], [NN-* IN-for ]]]",
- matchResult.toString());
- System.out.println(parseTreeChunk.listToString(matchResult));
- assertEquals(" np [ [PRP-i ], [JJ-digital NN-* ], [NN-* IN-for ], [NN-camera ]] vp [ [JJ-digital NN-* ], [NN-* IN-for ]]",
- parseTreeChunk.listToString(matchResult));
-
- matchResult = parserChunker2MatcherOlderOpenNLP.assessRelevance(
- "Can I get auto focus lens for digital camera",
- "How can I get short focus zoom lens for digital camera").getMatchResult();
-
- System.out.println(matchResult);
- assertEquals( "[[ [PRP-i ], [NN-focus ], [NN-lens IN-for JJ-digital NN-camera ], [JJ-digital NN-camera ]], [ [VB-get NN-focus ], [NN-lens IN-for JJ-digital NN-camera ]]]",
- matchResult.toString());
- System.out.println(parseTreeChunk.listToString(matchResult));
- assertEquals(" np [ [PRP-i ], [NN-focus ], [NN-lens IN-for JJ-digital NN-camera ], [JJ-digital NN-camera ]] vp [ [VB-get NN-focus ], [NN-lens IN-for JJ-digital NN-camera ]]",
- parseTreeChunk.listToString(matchResult) );
- }
+ public void notNullTest() {
+ parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+ assertNotNull(parserChunker2Matcher);
+ }
+
+ public void testMatch() {
+ parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+ List<List<ParseTreeChunk>> matchResult = parserChunker2Matcher
+ .assessRelevance(
+ // "Can I get auto focus lens for digital camera",
+ // "How can I get short focus zoom lens for digital camera"
+ "Pulitzer Prize-Winning Reporter is an Illegal Immigrant",
+ "Gay Pulitzer Prize-Winning Reporter Jose Antonio Vargas Comes Out as Undocumented " +
+ "Immigrant Jose Antonio Vargas, a gay journalist who won a Pulitzer Prize " +
+ "for his coverage of the Virginia Tech shootings in the Washington Post")
+ .getMatchResult();
+
+ System.out.println(matchResult);
+ assertEquals( "[[ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ], [NN-immigrant ]], []]",
+ matchResult.toString());
+ System.out.println(parseTreeChunk.listToString(matchResult));
+ assertEquals(" np [ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ], [NN-immigrant ]]",
+ parseTreeChunk.listToString(matchResult));
+
+ matchResult = parserChunker2Matcher
+ .assessRelevance(
+ "Sounds too good to be true but it actually is, the world's first flying car is finally here. ",
+ "While it may seem like something straight out of a sci-fi " +
+ "movie, the flying car might soon become a reality. ").getMatchResult();
+
+ // TODO: possibly problem in new POS tagger from Parser
+ System.out.println(matchResult);
+ // was "[[ [DT-the NN-* VBG-flying NN-car ]], []]"
+ assertEquals("[[ [PRP-it ], [DT-the NN-* NNS-* ]], [ [DT-the NN-* NNS-* ]]]",
+ matchResult.toString()
+ );
+ System.out.println(parseTreeChunk.listToString(matchResult));
+ assertEquals( " np [ [PRP-it ], [DT-the NN-* NNS-* ]] vp [ [DT-the NN-* NNS-* ]]",
+ parseTreeChunk.listToString(matchResult));
+
+ }
+
+
+ public void testMatchDigitalCamera() {
+ parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+ List<List<ParseTreeChunk>> matchResult = parserChunker2Matcher.assessRelevance(
+ "I am curious how to use the digital zoom of this camera for filming insects",
+ "How can I get short focus zoom lens for digital camera").getMatchResult();
+
+ System.out.println(matchResult);
+ assertEquals("[[ [PRP-i ], [NN-zoom NN-camera ], [JJ-digital NN-* ], [NN-* IN-for ], [NN-camera ]], [ [JJ-digital NN-* ], [NN-zoom NN-camera ], [NN-* IN-for ]]]",
+ matchResult.toString());
+ System.out.println(parseTreeChunk.listToString(matchResult));
+ assertEquals(" np [ [PRP-i ], [NN-zoom NN-camera ], [JJ-digital NN-* ], [NN-* IN-for ], [NN-camera ]] vp [ [JJ-digital NN-* ], [NN-zoom NN-camera ], [NN-* IN-for ]]",
+ parseTreeChunk.listToString(matchResult));
+ }
+
+
+ public void testHighSimilarity() {
+ parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+ List<List<ParseTreeChunk>> matchResult = parserChunker2Matcher.assessRelevance(
+ "Can I get auto focus lens for digital camera",
+ "How can I get short focus zoom lens for digital camera").getMatchResult();
+
+ System.out.println(matchResult);
+ assertEquals( "[[ [PRP-i ], [NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ], [JJ-digital NN-camera ]], [ [VB-get NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ]]]",
+ matchResult.toString());
+ System.out.println(parseTreeChunk.listToString(matchResult));
+ assertEquals(" np [ [PRP-i ], [NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ], [JJ-digital NN-camera ]] vp [ [VB-get NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ]]",
+ parseTreeChunk.listToString(matchResult) );
+ }
}