You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/22 13:15:38 UTC

svn commit: r1187691 - in /incubator/opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/textsimilarity/chunker2matcher/ test/java/opennlp/tools/textsimilarity/

Author: joern
Date: Sat Oct 22 11:15:38 2011
New Revision: 1187691

URL: http://svn.apache.org/viewvc?rev=1187691&view=rev
Log:
OPENNLP-331 Added functions that substitute the POS tagger's output with the POS tags produced by the Parser. Thanks to Boris Galitsky for providing a patch.

Modified:
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Sat Oct 22 11:15:38 2011
@@ -96,7 +96,7 @@ public class ParserChunker2MatcherProces
 		initializePosTagger();
 		initializeParser();
 		initializeChunker();
-		
+
 	}
 
 	public synchronized static ParserChunker2MatcherProcessor getInstance() {
@@ -214,17 +214,22 @@ public class ParserChunker2MatcherProces
 		sentence = TextProcessor.removePunctuation(sentence);
 
 		String[] toks = tokenizer.tokenize(sentence);
-		String[] tags = posTagger.tag(toks);
+		String[] tags = new String[toks.length]; //posTagger.tag(toks);
+		int t=0;
+		SentenceNode node  = parseSentenceNode(sentence);
+		List<String> POSlist = node.getOrderedPOSList();
+		tags = POSlist.toArray(new String[0]);
+
 		String[] res = chunker.chunk(toks, tags);
 		Span[] span =  chunker.chunkAsSpans(toks, tags);
 		Sequence[] seq = chunker.topKSequences(toks, tags);
 
-		// correction for chunking tags
+		/* correction for chunking tags
 		for(int i=0; i< toks.length; i++){
 			if (toks[i].equalsIgnoreCase("is")){
 				res[i] = "B-VP";
 			}
-		}
+		} */
 
 		List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
 		List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(), 
@@ -636,6 +641,16 @@ public class ParserChunker2MatcherProces
 		return results;
 	}
 
+	public void printParseTree(String phrase1){
+		ParserChunker2MatcherProcessor p = ParserChunker2MatcherProcessor.getInstance();
+		List<List<SentenceNode>> nodeListList = p.parseTextNode(phrase1);
+		for (List<SentenceNode> nodeList : nodeListList) {
+			for (SentenceNode node : nodeList) {
+				System.out.println(node);
+			}
+		}
+	}
+
 	public static void main(String[] args) throws Exception {
 
 
@@ -658,6 +673,12 @@ public class ParserChunker2MatcherProces
 		 */
 		// String sentence = "I love Fresh body styling";
 		// String phrase = "I captures way more detail in high contrast scenes";
+		ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
+		parser.printParseTree("How can I get short focus zoom lens for digital camera");
+		parser.formGroupedPhrasesFromChunksForSentence("How can I get short focus zoom lens for digital camera");
+
+		System.exit(0);
+
 		String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
 			+ "The engine makes it a powerful car. "
 			+ "The strong engine gives it enough power. "
@@ -666,7 +687,7 @@ public class ParserChunker2MatcherProces
 			+ "This car has an amazingly good engine. "
 			+ "This car provides you a very good mileage.";
 		String sentence = "Not to worry with the 2cv.";
-		ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
+
 
 		System.out.println(parser.assessRelevance(phrase1, phrase2));
 
@@ -675,11 +696,6 @@ public class ParserChunker2MatcherProces
 		parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. ");
 		parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement");
 
-		List<List<SentenceNode>> nodeListList = parser.parseTextNode(phrase1);
-		for (List<SentenceNode> nodeList : nodeListList) {
-			for (SentenceNode node : nodeList) {
-				System.out.println(node);
-			}
-		}
+
 	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java Sat Oct 22 11:15:38 2011
@@ -116,4 +116,16 @@ public class PhraseNode extends Syntacti
 
 		return builder.toString();
 	}
+	
+	@Override
+	public List<String> getOrderedPOSList(){
+		List<String> types = new ArrayList<String>(); 
+		if (children != null && children.size() > 0) {
+			for (SyntacticTreeNode child : children) {
+				types.addAll(child.getOrderedPOSList());
+			}
+		} else
+			types.add(getType());
+		return types;
+	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java Sat Oct 22 11:15:38 2011
@@ -17,6 +17,7 @@
 
 package opennlp.tools.textsimilarity.chunker2matcher;
 
+import java.util.ArrayList;
 import java.util.List;
 
 /**
@@ -56,4 +57,15 @@ public class SentenceNode extends Phrase
 
 		return builder.toString();
 	}
+	
+	@Override
+	public List<String> getOrderedPOSList(){
+		List<String> types = new ArrayList<String>(); 
+		if (this.getChildren()!= null && this.getChildren().size() > 0) {
+			for (SyntacticTreeNode child : this.getChildren()) {
+				types.addAll(child.getOrderedPOSList());
+			}
+		}
+		return types;
+	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java Sat Oct 22 11:15:38 2011
@@ -36,6 +36,8 @@ public abstract class SyntacticTreeNode 
 	public abstract String getLemma(boolean removeStopWord);
 
 	public abstract String toStringIndented(int numTabs);
+	
+	public abstract List<String> getOrderedPOSList(); 
 
 	public SyntacticTreeNode(String type) {
 		this.type = type;
@@ -146,4 +148,6 @@ public abstract class SyntacticTreeNode 
 			}
 		}
 	}
+
+	
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java Sat Oct 22 11:15:38 2011
@@ -17,6 +17,7 @@
 
 package opennlp.tools.textsimilarity.chunker2matcher;
 
+import java.util.ArrayList;
 import java.util.List;
 
 public class WordNode extends SyntacticTreeNode {
@@ -74,4 +75,11 @@ public class WordNode extends SyntacticT
 
 	public static void main(String[] args) {
 	}
+
+	@Override
+	public List<String> getOrderedPOSList() {
+		List<String> types = new ArrayList<String>();
+		types.add(getType());
+		return types;
+	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java?rev=1187691&r1=1187690&r2=1187691&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/SyntMatcherTest.java Sat Oct 22 11:15:38 2011
@@ -31,75 +31,80 @@ import org.junit.runner.RunWith;
 
 public class SyntMatcherTest extends TestCase {
 
-  private ParserChunker2MatcherProcessor parserChunker2MatcherOlderOpenNLP;
+	private ParserChunker2MatcherProcessor parserChunker2Matcher;
 
-  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
+	private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
 
-  public void notNullTest() {
-    parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
-    assertNotNull(parserChunker2MatcherOlderOpenNLP);
-  }
-
-  public void testMatch() {
-    parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
-    List<List<ParseTreeChunk>> matchResult = parserChunker2MatcherOlderOpenNLP
-        .assessRelevance(
-            // "Can I get auto focus lens for digital camera",
-            // "How can I get short focus zoom lens for digital camera"
-            "Pulitzer Prize-Winning Reporter is an Illegal Immigrant",
-            "Gay Pulitzer Prize-Winning Reporter Jose Antonio Vargas Comes Out as Undocumented " +
-            "Immigrant Jose Antonio Vargas, a gay journalist who won a Pulitzer Prize " +
-            "for his coverage of the Virginia Tech shootings in the Washington Post")
-           .getMatchResult();
-
-    System.out.println(matchResult);
-    assertEquals( "[[ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ],  [NNP-immigrant ]], []]",
-        matchResult.toString());
-    System.out.println(parseTreeChunk.listToString(matchResult));
-    assertEquals(" np [ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ],  [NNP-immigrant ]]",
-        parseTreeChunk.listToString(matchResult));
-
-    matchResult = parserChunker2MatcherOlderOpenNLP
-        .assessRelevance(
-            "Sounds too good to be true but it actually is, the world's first flying car is finally here. ",
-            "While it may seem like something straight out of a sci-fi " +
-            "movie, the  flying  car  might soon become a reality. ").getMatchResult();
-
-    System.out.println(matchResult);
-    // was  "[[ [DT-the NN-* VBG-flying NN-car ]], []]"
-    assertEquals("[[ [PRP-it ],  [DT-the NN-* ],  [NN-flying NN-car ]], [ [DT-the NN-* ],  [NN-* ]]]",
-    		matchResult.toString()
-       );
-    System.out.println(parseTreeChunk.listToString(matchResult));
-    assertEquals( " np [ [PRP-it ],  [DT-the NN-* ],  [NN-flying NN-car ]] vp [ [DT-the NN-* ],  [NN-* ]]",
-    		parseTreeChunk.listToString(matchResult));
-
-  }
-
-
-  public void testMatchDigitalCamera() {
-    parserChunker2MatcherOlderOpenNLP = ParserChunker2MatcherProcessor.getInstance();
-    List<List<ParseTreeChunk>> matchResult = parserChunker2MatcherOlderOpenNLP.assessRelevance(       
-            "I am curious how to use the digital zoom of this camera for filming insects",
-            "How can I get short focus zoom lens for digital camera").getMatchResult();
-
-    System.out.println(matchResult);
-    assertEquals("[[ [PRP-i ],  [JJ-digital NN-* ],  [NN-* IN-for ],  [NN-camera ]], [ [JJ-digital NN-* ],  [NN-* IN-for ]]]",
-        matchResult.toString());
-    System.out.println(parseTreeChunk.listToString(matchResult));
-    assertEquals(" np [ [PRP-i ],  [JJ-digital NN-* ],  [NN-* IN-for ],  [NN-camera ]] vp [ [JJ-digital NN-* ],  [NN-* IN-for ]]",
-        parseTreeChunk.listToString(matchResult));
-
-    matchResult = parserChunker2MatcherOlderOpenNLP.assessRelevance(
-        "Can I get auto focus lens for digital camera",
-        "How can I get short focus zoom lens for digital camera").getMatchResult();
-
-    System.out.println(matchResult);
-    assertEquals( "[[ [PRP-i ],  [NN-focus ],  [NN-lens IN-for JJ-digital NN-camera ],  [JJ-digital NN-camera ]], [ [VB-get NN-focus ],  [NN-lens IN-for JJ-digital NN-camera ]]]",
-        matchResult.toString());
-    System.out.println(parseTreeChunk.listToString(matchResult));
-    assertEquals(" np [ [PRP-i ],  [NN-focus ],  [NN-lens IN-for JJ-digital NN-camera ],  [JJ-digital NN-camera ]] vp [ [VB-get NN-focus ],  [NN-lens IN-for JJ-digital NN-camera ]]",
-        parseTreeChunk.listToString(matchResult) );
-  }
+	public void notNullTest() {
+		parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+		assertNotNull(parserChunker2Matcher);
+	}
+
+	public void testMatch() {
+		parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+		List<List<ParseTreeChunk>> matchResult = parserChunker2Matcher
+		.assessRelevance(
+				// "Can I get auto focus lens for digital camera",
+				// "How can I get short focus zoom lens for digital camera"
+				"Pulitzer Prize-Winning Reporter is an Illegal Immigrant",
+				"Gay Pulitzer Prize-Winning Reporter Jose Antonio Vargas Comes Out as Undocumented " +
+				"Immigrant Jose Antonio Vargas, a gay journalist who won a Pulitzer Prize " +
+		"for his coverage of the Virginia Tech shootings in the Washington Post")
+		.getMatchResult();
+
+		System.out.println(matchResult);
+		assertEquals( "[[ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ],  [NN-immigrant ]], []]",
+				matchResult.toString());
+		System.out.println(parseTreeChunk.listToString(matchResult));
+		assertEquals(" np [ [NNP-pulitzer NNP-prize NNP-winning NNP-reporter ],  [NN-immigrant ]]",
+				parseTreeChunk.listToString(matchResult));
+
+		matchResult = parserChunker2Matcher
+		.assessRelevance(
+				"Sounds too good to be true but it actually is, the world's first flying car is finally here. ",
+				"While it may seem like something straight out of a sci-fi " +
+		"movie, the  flying  car  might soon become a reality. ").getMatchResult();
+
+		// TODO: possibly problem in new POS tagger from Parser
+		System.out.println(matchResult);
+		// was  "[[ [DT-the NN-* VBG-flying NN-car ]], []]"
+		assertEquals("[[ [PRP-it ],  [DT-the NN-* NNS-* ]], [ [DT-the NN-* NNS-* ]]]",
+				matchResult.toString()
+		);
+		System.out.println(parseTreeChunk.listToString(matchResult));
+		assertEquals( " np [ [PRP-it ],  [DT-the NN-* NNS-* ]] vp [ [DT-the NN-* NNS-* ]]",
+				parseTreeChunk.listToString(matchResult));
+
+	}
+
+
+	public void testMatchDigitalCamera() {
+		parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+		List<List<ParseTreeChunk>> matchResult = parserChunker2Matcher.assessRelevance(       
+				"I am curious how to use the digital zoom of this camera for filming insects",
+		"How can I get short focus zoom lens for digital camera").getMatchResult();
+
+		System.out.println(matchResult);
+		assertEquals("[[ [PRP-i ],  [NN-zoom NN-camera ],  [JJ-digital NN-* ],  [NN-* IN-for ],  [NN-camera ]], [ [JJ-digital NN-* ],  [NN-zoom NN-camera ],  [NN-* IN-for ]]]",
+				matchResult.toString());
+		System.out.println(parseTreeChunk.listToString(matchResult));
+		assertEquals(" np [ [PRP-i ],  [NN-zoom NN-camera ],  [JJ-digital NN-* ],  [NN-* IN-for ],  [NN-camera ]] vp [ [JJ-digital NN-* ],  [NN-zoom NN-camera ],  [NN-* IN-for ]]",
+				parseTreeChunk.listToString(matchResult));
+	}
+	
+	
+	public void testHighSimilarity() {
+		parserChunker2Matcher = ParserChunker2MatcherProcessor.getInstance();
+		List<List<ParseTreeChunk>>  matchResult = parserChunker2Matcher.assessRelevance(
+				"Can I get auto focus lens for digital camera",
+		"How can I get short focus zoom lens for digital camera").getMatchResult();
+
+		System.out.println(matchResult);
+		assertEquals( "[[ [PRP-i ],  [NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ],  [JJ-digital NN-camera ]], [ [VB-get NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ]]]",
+				matchResult.toString());
+		System.out.println(parseTreeChunk.listToString(matchResult));
+		assertEquals(" np [ [PRP-i ],  [NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ],  [JJ-digital NN-camera ]] vp [ [VB-get NN-focus NNS-* NNS-lens IN-for JJ-digital NN-camera ]]",
+				parseTreeChunk.listToString(matchResult) );
+	}
 
 }