You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/01/18 21:43:56 UTC
svn commit: r1233056 - in /incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools: similarity/apps/ textsimilarity/ textsimilarity/chunker2matcher/

Author: bgalitsky
Date: Wed Jan 18 20:43:55 2012
New Revision: 1233056

URL: http://svn.apache.org/viewvc?rev=1233056&view=rev
Log:
Demonstration of how sensitive the syntactic match is compared to the bag-of-words approach
Key: OPENNLP-413

Added:
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
Modified:
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
    incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java Wed Jan 18 20:43:55 2012
@@ -1,28 +1,10 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package opennlp.tools.similarity.apps;
 
 import java.util.Comparator;
 
-public class HitBaseComparable implements Comparator<HitBase> {
-
-  public int compare(HitBase o1, HitBase o2) {
-    return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1
-        : (o1 == o2 ? 0 : 1));
-  }
+public class HitBaseComparable implements Comparator<HitBase>{
+	@Override
+	public int compare(HitBase o1, HitBase o2) {
+		return (o1.getGenerWithQueryScore()>o2.getGenerWithQueryScore() ? -1 : (o1==o2 ? 0 : 1));
+	}
 }
\ No newline at end of file

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java Wed Jan 18 20:43:55 2012
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package opennlp.tools.similarity.apps;
 
 import java.util.ArrayList;
@@ -27,84 +26,81 @@ import opennlp.tools.textsimilarity.Pars
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
+
 public class SearchResultsProcessor extends BingWebQueryRunner {
-  private static Logger LOG = Logger
-      .getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
-  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-
-  /*
-   * Takes Bing API search results and calculates the parse tree similarity
-   * between the question and each snippet. Ranks those snippets with higher
-   * similarity score up
-   */
-  private BingResponse calculateMatchScoreResortHits(BingResponse resp,
-      String searchQuery) {
-    // TODO
-    /*
-     * if query is multi-sentence, special handling int indexDot =
-     * searchQuery.indexOf("."); if (indexDot>0 &&
-     * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new
-     * MultipleSentenceQueryAnswerer(); return
-     * ans.calculateMatchScoreResortHits(resp, searchQuery); }
-     */
-    List<HitBase> newHitList = new ArrayList<HitBase>();
-    ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
-        .getInstance();
-
-    for (HitBase hit : resp.getHits()) {
-      String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
-          .replace("<b>", "").replace("</b>", "");
-      snapshot = snapshot.replace("</B>", "").replace("<B>", "")
-          .replace("<br>", "").replace("</br>", "").replace("...", ". ")
-          .replace("|", " ").replace(">", " ");
-      snapshot += " . " + hit.getTitle();
-      Double score = 0.0;
-      try {
-        SentencePairMatchResult matchRes = sm.assessRelevance(snapshot,
-            searchQuery);
-        List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
-        score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
-        LOG.finest(score + " | " + snapshot);
-      } catch (Exception e) {
-        LOG.severe("Problem processing snapshot " + snapshot);
-        e.printStackTrace();
-      }
-      hit.setGenerWithQueryScore(score);
-      newHitList.add(hit);
-    }
-    Collections.sort(newHitList, new HitBaseComparable());
-    resp.setHits(newHitList);
-    LOG.info("\n\n ============= NEW ORDER ================= ");
-    for (HitBase hit : newHitList) {
-      LOG.info(hit.toString());
-    }
-
-    return resp;
-  }
-
-  public List<HitBase> runSearch(String query) {
-    BingResponse resp = null, // obtained from bing
-    newResp = null; // re-sorted based on similarity
-    try {
-      List<String> resultList = search(query, "", "", 30);
-      resp = populateBingHit(resultList.get(0));
-      // now we apply our own relevance filter
-      newResp = calculateMatchScoreResortHits(resp, query);
-
-    } catch (Exception e) {
-      // e.printStackTrace();
-      LOG.info("No search results for query '" + query);
-      e.printStackTrace();
-      return null;
-    }
-    // cast to super class
-    List<HitBase> hits = new ArrayList<HitBase>();
-    for (HitBase h : resp.getHits())
-      hits.add((HitBase) h);
+	private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");
+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+
+	/*
+	 * Takes Bing API search results and calculates the parse tree similarity between the question and each snippet.
+	 * Snippets with a higher similarity score are ranked higher.
+	 */
+	private	BingResponse calculateMatchScoreResortHits(BingResponse resp, String searchQuery){
+		// TODO
+		/*if query is multi-sentence, special handling
+		int indexDot = searchQuery.indexOf("."); 
+		if (indexDot>0 && indexDot<searchQuery.length()-1){
+			MultipleSentenceQueryAnswerer ans = new MultipleSentenceQueryAnswerer();
+			return ans.calculateMatchScoreResortHits(resp, searchQuery);		
+		} */
+		List<HitBase> newHitList =	new ArrayList<HitBase>();
+		ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
+
+		for(HitBase hit: resp.getHits()){
+			String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<b>", "").replace("</b>","");
+			snapshot=snapshot.replace("</B>", "").replace("<B>", "").replace("<br>", "").replace("</br>", "").replace("...", ". ").replace("|", " ").replace(">", " ");
+			snapshot+=" . "+hit.getTitle();
+			Double score = 0.0;
+			try {
+				SentencePairMatchResult matchRes = sm.assessRelevance(snapshot, searchQuery);
+				List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
+				score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+				LOG.finest(score + " | "+ snapshot );
+			}
+			catch (Exception e){
+				LOG.severe("Problem processing snapshot "+snapshot);
+				e.printStackTrace();
+			}
+			hit.setGenerWithQueryScore(score);
+			newHitList.add(hit);
+		}
+		Collections.sort(newHitList,new HitBaseComparable());
+		resp.setHits(newHitList);
+		LOG.info("\n\n ============= NEW ORDER ================= ");
+		for(HitBase hit: newHitList){
+			LOG.info(hit.toString());
+		}
+
+
+		return resp; 
+	}
+
+	public List<HitBase> runSearch(String query) {
+		BingResponse resp = null, // obtained from bing
+		newResp = null; // re-sorted based on similarity
+		try {
+			List<String> resultList = search(query, "", "", 30);
+			resp = populateBingHit(resultList.get(0));
+			// now we apply our own relevance filter
+			newResp=calculateMatchScoreResortHits(resp, query);
+
+		} catch (Exception e) {
+			// e.printStackTrace();
+			LOG.info("No search results for query '" + query);
+			e.printStackTrace();
+			return null;
+		}
+		// cast to super class
+		List<HitBase> hits = new ArrayList<HitBase>();
+		for (HitBase h : resp.getHits())
+			hits.add((HitBase) h);
+
+		hits = removeDuplicates(hits, 0.9);
+
+		return hits;
+	}
+
 
-    hits = removeDuplicates(hits, 0.9);
 
-    return hits;
-  }
 
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java Wed Jan 18 20:43:55 2012
@@ -19,6 +19,7 @@ package opennlp.tools.textsimilarity;
 
 import java.util.List;
 
+
 import opennlp.tools.stemmer.PorterStemmer;
 
 public class LemmaFormManager {

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Jan 18 20:43:55 2012
@@ -33,17 +33,15 @@
 
 package opennlp.tools.textsimilarity.chunker2matcher;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import org.apache.commons.lang.StringUtils;
-
 import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.chunker.ChunkerModel;
 import opennlp.tools.cmdline.parser.ParserTool;
@@ -66,16 +64,16 @@ import opennlp.tools.textsimilarity.Text
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.TokenizerME;
 import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.util.Sequence;
 import opennlp.tools.util.Span;
-import opennlp.tools.util.StringUtil;
 
 
 public class ParserChunker2MatcherProcessor {
-	private static final int MIN_SENTENCE_LENGTH = 10;
+	protected static final int MIN_SENTENCE_LENGTH = 10;
 	private static final String MODEL_DIR_KEY = "nlp.models.dir";
-	private static final String MODEL_DIR ;
-	private static ParserChunker2MatcherProcessor instance;
+	// TODO config
+	// this is where resources should live
+	private static String MODEL_DIR = "resources/models";
+	protected static ParserChunker2MatcherProcessor instance;
 
 	private SentenceDetector sentenceDetector;
 	private Tokenizer tokenizer;
@@ -85,18 +83,13 @@ public class ParserChunker2MatcherProces
 	private final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
 	private static Logger LOG = Logger.getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor");
 
-	static {
-		//TODO config
-		MODEL_DIR = "C:\\workspace\\similarity\\src\\main\\resources";
-	}
-
-	private ParserChunker2MatcherProcessor() {
+	protected ParserChunker2MatcherProcessor() {
+		MODEL_DIR = new File(".").getAbsolutePath().replace(".", "")+MODEL_DIR;
 		initializeSentenceDetector();
 		initializeTokenizer();
 		initializePosTagger();
 		initializeParser();
 		initializeChunker();
-
 	}
 
 	public synchronized static ParserChunker2MatcherProcessor getInstance() {
@@ -175,6 +168,8 @@ public class ParserChunker2MatcherProces
 			LOG.log(Level.WARNING, "failed to parse the sentence : '"+sentence, t);
 			return null;
 		}
+		
+		//	Parse[] chunks = ChunkSampleStream.getInitialChunks(parseArray[0]) ;
 
 		// there should be only one result parse
 		if (parseArray != null && parseArray.length > 0)
@@ -222,6 +217,7 @@ public class ParserChunker2MatcherProces
 			return null;
 		}
 		List<String> POSlist = node.getOrderedPOSList();
+		
 		tags = POSlist.toArray(new String[0]);
 		if (toks.length != tags.length){
 			LOG.info("disagreement between toks and tags; sent =  '"+sentence + "'\n tags = "+tags + 
@@ -245,19 +241,15 @@ public class ParserChunker2MatcherProces
 				} else
 					return null;
 			}
-			
 		}
 		String[] res = chunker.chunk(toks, tags);
-		Span[] span =  chunker.chunkAsSpans(toks, tags);
-		Sequence[] seq = chunker.topKSequences(toks, tags);
-
+	
 		List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
 		List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(), 
 		prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr  = new ArrayList<ParseTreeChunk>(), 
 		adjPhr  = new ArrayList<ParseTreeChunk>(), 
 		// to store the whole sentence
 		wholeSentence = new ArrayList<ParseTreeChunk>();
-
 		List<String> pOSsAll = new ArrayList<String>(), lemmasAll = new ArrayList<String>();
 
 		for(int i = 0; i< toks.length; i++){
@@ -278,7 +270,7 @@ public class ParserChunker2MatcherProces
 				for(int j=i+1; j<res.length; j++){
 					if (res[j].startsWith("B-VP")){
 						nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs));
-						LOG.info(i + " => " +lemmas);
+						//LOG.info(i + " => " +lemmas);
 						currPhraseClosed = true;
 						break;
 					} else {
@@ -288,7 +280,7 @@ public class ParserChunker2MatcherProces
 				}
 				if (!currPhraseClosed){
 					nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs));
-					LOG.fine(i + " => " + lemmas);
+					//LOG.fine(i + " => " + lemmas);
 				}
 
 			} else if (bi_POS.startsWith("B-PP")){// beginning of a phrase
@@ -299,7 +291,7 @@ public class ParserChunker2MatcherProces
 				for(int j=i+1; j<res.length; j++){
 					if (res[j].startsWith("B-VP")){
 						prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs));
-						LOG.fine(i + " => " + lemmas);
+						//LOG.fine(i + " => " + lemmas);
 						currPhraseClosed = true;
 						break;
 					} else {
@@ -309,7 +301,7 @@ public class ParserChunker2MatcherProces
 				}
 				if (!currPhraseClosed){
 					prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs));
-					LOG.fine(i + " => " + lemmas);
+					//LOG.fine(i + " => " + lemmas);
 				}
 			} else
 				if (bi_POS.startsWith("B-VP")){// beginning of a phrase
@@ -320,7 +312,7 @@ public class ParserChunker2MatcherProces
 					for(int j=i+1; j<res.length; j++){
 						if (res[j].startsWith("B-VP")){
 							verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs));
-							LOG.fine(i + " => " +lemmas);
+							//LOG.fine(i + " => " +lemmas);
 							currPhraseClosed = true;
 							break;
 						} else {
@@ -330,7 +322,7 @@ public class ParserChunker2MatcherProces
 					}
 					if (!currPhraseClosed){
 						verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs));
-						LOG.fine(i + " => " + lemmas);
+						//LOG.fine(i + " => " + lemmas);
 					}
 				} else
 					if (bi_POS.startsWith("B-ADJP") ){// beginning of a phrase
@@ -341,7 +333,7 @@ public class ParserChunker2MatcherProces
 						for(int j=i+1; j<res.length; j++){
 							if (res[j].startsWith("B-VP")){
 								adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs));
-								LOG.fine(i + " => " +lemmas);
+								//LOG.fine(i + " => " +lemmas);
 								currPhraseClosed = true;
 								break;
 							} else {
@@ -351,7 +343,7 @@ public class ParserChunker2MatcherProces
 						}
 						if (!currPhraseClosed){
 							adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs));
-							LOG.fine(i + " => " + lemmas);
+							//LOG.fine(i + " => " + lemmas);
 						}
 					}
 		}
@@ -471,7 +463,7 @@ public class ParserChunker2MatcherProces
 		return tokenizer.tokenize(sentence);
 	}
 
-	private void initializeSentenceDetector() {
+	protected void initializeSentenceDetector() {
 		InputStream is = null;
 		try {
 			is = new FileInputStream(
@@ -493,7 +485,7 @@ public class ParserChunker2MatcherProces
 		}
 	}
 
-	private void initializeTokenizer() {
+	protected void initializeTokenizer() {
 		InputStream is = null;
 		try {
 			is = new FileInputStream(
@@ -513,7 +505,7 @@ public class ParserChunker2MatcherProces
 		}
 	}
 
-	private void initializePosTagger() {
+	protected void initializePosTagger() {
 		InputStream is = null;
 		try {
 			is = new FileInputStream(MODEL_DIR
@@ -532,7 +524,7 @@ public class ParserChunker2MatcherProces
 		}
 	}
 
-	private void initializeParser() {
+	protected void initializeParser() {
 		InputStream is = null;
 		try {
 			is = new FileInputStream(MODEL_DIR
@@ -597,7 +589,9 @@ public class ParserChunker2MatcherProces
 
 		// if this node contains children nodes, then it is a phrase node
 		if (childrenNodeList != null && childrenNodeList.size() > 0) {
+			//System.out.println("Found "+ type + " phrase = "+ childrenNodeList);
 			return new PhraseNode(type, childrenNodeList);
+			
 		}
 
 		// otherwise, it is a word node
@@ -628,11 +622,10 @@ public class ParserChunker2MatcherProces
 	
 	public SentencePairMatchResult assessRelevance(String para1, String para2)
 	{
-		ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
-		List<List<ParseTreeChunk>> sent1GrpLst = parser.formGroupedPhrasesFromChunksForPara(para1), 
-		sent2GrpLst = parser.formGroupedPhrasesFromChunksForPara(para2);
+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), 
+		sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
 
-		List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); //TODO  need to populate it!
+		List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); 
 
 
 		ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
@@ -640,7 +633,7 @@ public class ParserChunker2MatcherProces
 		return new SentencePairMatchResult(res, origChunks1);
 
 	}
-	private List<LemmaPair> listListParseTreeChunk2ListLemmaPairs(
+	protected List<LemmaPair> listListParseTreeChunk2ListLemmaPairs(
 			List<List<ParseTreeChunk>> sent1GrpLst) {
 		List<LemmaPair>  results = new ArrayList<LemmaPair>();
 		if (sent1GrpLst==null || sent1GrpLst.size() <1)
@@ -665,55 +658,4 @@ public class ParserChunker2MatcherProces
 			}
 		}
 	}
-
-	public static void main(String[] args) throws Exception {
-
-		String text = "Where do I apply? Go to your town office or city hall. If your town doesn't have an office, ask the town clerk or a Selectman. Tell them that you need a 1040 tax form . I Can 't Pay the Taxes on my House: What Can I Do?. Pine Tree Legal";
-
-		/*
-		 * String text =
-		 * "I have been driving a 96 accord to death for 10 years.  " +
-		 * "Lately it has been costing to much in repairs.  " +
-		 * "I am looking for something 8,000-13,000.  " +
-		 * "My last three vehicles have been Accords.  " +
-		 * "I like them but I would like something different this time.";
-		 */
-		/*
-		 * String text = "I love Fresh body styling. " + "I love lots of grip. "
-		 * + "I love strong engine and grippy tires. " + "I like Head turner. "
-		 * + "I like Right and left rearward blind spots. " +
-		 * "I like Great acceleration. " + "I like great noise. " +
-		 * "I like great brakes. " + "I like cheap feeling interior. " +
-		 * "I like uncomfortable seats. " + "I like nav system hard to read.";
-		 */
-		// String sentence = "I love Fresh body styling";
-		// String phrase = "I captures way more detail in high contrast scenes";
-		ParserChunker2MatcherProcessor parser = ParserChunker2MatcherProcessor.getInstance();
-		List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text);
-		System.out.println(res);
-		
-		//parser.printParseTree("How can I get short focus zoom lens for digital camera");
-		//parser.formGroupedPhrasesFromChunksForSentence("How can I get short focus zoom lens for digital camera");
-
-		System.exit(0);
-
-		String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
-			+ "The engine makes it a powerful car. "
-			+ "The strong engine gives it enough power. "
-			+ "The strong engine gives the car a lot of power.";
-		String phrase2 = "This car has a great engine. "
-			+ "This car has an amazingly good engine. "
-			+ "This car provides you a very good mileage.";
-		String sentence = "Not to worry with the 2cv.";
-
-
-		System.out.println(parser.assessRelevance(phrase1, phrase2));
-
-
-		parser.formGroupedPhrasesFromChunksForSentence("Its classy design and the Mercedes name make it a very cool vehicle to drive. ");
-		parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. ");
-		parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement");
-
-
-	}
 }

Added: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java?rev=1233056&view=auto
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java (added)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java Wed Jan 18 20:43:55 2012
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.textsimilarity.chunker2matcher;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.textsimilarity.LemmaPair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor{
+	protected static ParserPure2MatcherProcessor pinstance;
+	private static Logger LOG = Logger.getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserPure2MatcherProcessor");
+
+	public synchronized static ParserPure2MatcherProcessor getInstance() {
+		if (pinstance == null)
+			pinstance = new ParserPure2MatcherProcessor();
+
+		return pinstance;
+	}
+	
+	private ParserPure2MatcherProcessor() {
+		initializeSentenceDetector();
+		initializeTokenizer();
+		initializePosTagger();
+		initializeParser();
+	}
+
+	public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(String sentence) {
+		if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
+			return null;
+
+		sentence = TextProcessor.removePunctuation(sentence);
+		SentenceNode node  = parseSentenceNode(sentence);
+		if (node==null){
+			LOG.info("Problem parsing sentence '"+sentence);
+			return null;
+		}
+		List<ParseTreeChunk> ptcList = node.getParseTreeChunkList();
+		List<String> POSlist = node.getOrderedPOSList();
+		List<String> TokList = node.getOrderedLemmaList();
+	
+		List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
+		List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(), 
+		prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr  = new ArrayList<ParseTreeChunk>(), 
+		adjPhr  = new ArrayList<ParseTreeChunk>(), 
+		// to store the whole sentence
+		wholeSentence = new ArrayList<ParseTreeChunk>();
+
+		wholeSentence.add(new ParseTreeChunk("SENTENCE", TokList, POSlist));
+		for(ParseTreeChunk phr: ptcList){
+			String phrType = phr.getMainPOS();
+			if (phrType.startsWith("NP")){
+				nounPhr.add(phr);
+			} else if (phrType.startsWith("VP")){
+				verbPhr.add(phr);
+			} else if (phrType.startsWith("PP")){
+				prepPhr.add(phr);
+			} else if (phrType.endsWith("ADJP")){
+				adjPhr.add(phr);
+			} else {
+				//LOG.info("Unexpected phrase type found :"+ phr);				
+			}
+			
+		}
+	
+		listOfChunks.add(nounPhr);
+		listOfChunks.add(verbPhr);
+		listOfChunks.add(prepPhr);
+		listOfChunks.add(adjPhr);
+		listOfChunks.add(wholeSentence);
+
+		return listOfChunks;
+	}
+	
+	public SentencePairMatchResult assessRelevance(String para1, String para2)
+	{
+	
+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), 
+		sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
+
+		List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); //TODO  need to populate it!
+
+
+		ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
+		List<List<ParseTreeChunk>> res = md.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+		return new SentencePairMatchResult(res, origChunks1);
+
+	}
+	
+
+	public static void main(String[] args) throws Exception {
+		ParserPure2MatcherProcessor parser = ParserPure2MatcherProcessor.getInstance();
+		String text = "Its classy design and the Mercedes name make it a very cool vehicle to drive. ";
+
+		List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text);
+		System.out.println(res);
+		
+	//	System.exit(0);
+
+		
+		String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "
+			+ "The engine makes it a powerful car. "
+			+ "The strong engine gives it enough power. "
+			+ "The strong engine gives the car a lot of power.";
+		String phrase2 = "This car has a great engine. "
+			+ "This car has an amazingly good engine. "
+			+ "This car provides you a very good mileage.";
+		String sentence = "Not to worry with the 2cv.";
+
+
+		System.out.println(parser.assessRelevance(phrase1, phrase2).getMatchResult());
+
+
+		System.out.println(parser.formGroupedPhrasesFromChunksForSentence("Its classy design and the Mercedes name make it a very cool vehicle to drive. "));
+		System.out.println(parser.formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. "));
+		System.out.println(parser.formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement"));
+
+
+	}
+}

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/PhraseNode.java Wed Jan 18 20:43:55 2012
@@ -128,4 +128,16 @@ public class PhraseNode extends Syntacti
 			types.add(getType());
 		return types;
 	}
+	
+	@Override
+	public List<String> getOrderedLemmaList() {
+		List<String> lemmas = new ArrayList<String>();
+		if (children == null || children.isEmpty()) {
+			lemmas.add(getType()); // no children: this node's own type is the only entry
+			return lemmas;
+		}
+		for (SyntacticTreeNode child : children) // concatenate the children's lists in child order
+			lemmas.addAll(child.getOrderedLemmaList());
+		return lemmas;
+	}
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SentenceNode.java Wed Jan 18 20:43:55 2012
@@ -20,6 +20,8 @@ package opennlp.tools.textsimilarity.chu
 import java.util.ArrayList;
 import java.util.List;
 
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
 /**
  * Sentence node is the first clause node contained in the top node
  * 
@@ -69,5 +71,41 @@ public class SentenceNode extends Phrase
 		return types;
 	}
 	
+	/** Concatenates the lemma lists of this node's children in child order; empty when there are no children. */
+	@Override
+	public List<String> getOrderedLemmaList() {
+		List<String> lemmas = new ArrayList<String>();
+		if (getChildren() == null || getChildren().isEmpty())
+			return lemmas;
+		for (SyntacticTreeNode child : getChildren())
+			lemmas.addAll(child.getOrderedLemmaList());
+		return lemmas;
+	}
+	
+	/** Builds one ParseTreeChunk per direct child from the child's type, ordered POS list and ordered lemma list. */
+	public List<ParseTreeChunk> getParseTreeChunkList() {
+		List<ParseTreeChunk> chunkList = new ArrayList<ParseTreeChunk>();
+		if (getChildren() == null || getChildren().isEmpty())
+			return chunkList;
+		// Every child becomes a chunk; a former filter on child.getType().endsWith("P") is disabled.
+		for (SyntacticTreeNode child : getChildren()) {
+			ParseTreeChunk chunk = new ParseTreeChunk(child.getType(), child.getOrderedPOSList(), child.getOrderedLemmaList());
+			chunkList.add(chunk);
+		}
+		return chunkList;
+	}
+	
+	
 	
 }
+
+/*
+ * [[NP [PRP$-your NN-town NN-office CC-or NN-city NN-hall ], NP [PRP$-your NN-town NN-doesn NN-t ], 
+ * NP [DT-an NN-office ], NP [DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ], 
+ * NP [DT-a NNP-Selectman ], NP [PRP-them IN-that PRP-you ], NP [PRP-you ], NP [DT-a CD-1040 NN-tax NN-form ], NP [PRP-I ], 
+ * NP [DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ], NP [PRP$-my NNP-House WP-What MD-Can PRP-I ], 
+ * NP [WP-What MD-Can PRP-I ], NP [PRP-I ], NP [NNP-Pine NNP-Tree NNP-Legal ]], 
+ * 
+ * [VP [VBP-do RB-I VB-apply ], VP [VB-Go TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ], 
+ * VP [VBP-have DT-an NN-office ], VP [VB-ask DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ], VP [VB-Tell PRP-them IN-that PRP-you ], VP [VBP-need DT-a CD-1040 NN-tax NN-form ], VP [MD-Can VB-t VB-Pay DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ], VP [VB-Do NNP-Pine NNP-Tree NNP-Legal ]], [PP [TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ], PP [IN-on PRP$-my NNP-House WP-What MD-Can PRP-I ]], [], [SENTENCE [WRB-Where VBP-do RB-I VB-apply ], SENTENCE [VB-Go TO-to PRP$-your NN-town NN-office CC-or NN-city NN-hall ], SENTENCE [IN-If PRP$-your NN-town NN-doesn NN-t VBP-have DT-an NN-office VB-ask DT-the NN-town NN-clerk CC-or DT-a NNP-Selectman ], SENTENCE [VB-Tell PRP-them IN-that PRP-you VBP-need DT-a CD-1040 NN-tax NN-form ], SENTENCE [PRP-I MD-Can VB-t VB-Pay DT-the NNS-Taxes IN-on PRP$-my NNP-House WP-What MD-Can PRP-I VB-Do NNP-Pine NNP-Tree NNP-Legal ]]]
+*/

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/SyntacticTreeNode.java Wed Jan 18 20:43:55 2012
@@ -39,6 +39,8 @@ public abstract class SyntacticTreeNode 
 	public abstract String toStringIndented(int numTabs);
 	
 	public abstract List<String> getOrderedPOSList(); 
+	
+	public abstract List<String> getOrderedLemmaList(); // ordered lemma strings for this node; the visible overrides return a node's word or the concatenation of its children's lists
 
 	public SyntacticTreeNode(String type) {
 		this.type = type;
@@ -153,5 +155,7 @@ public abstract class SyntacticTreeNode 
 		}
 	}
 
+
+
 	
 }

Modified: incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java?rev=1233056&r1=1233055&r2=1233056&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java (original)
+++ incubator/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/WordNode.java Wed Jan 18 20:43:55 2012
@@ -82,4 +82,10 @@ public class WordNode extends SyntacticT
 		types.add(getType());
 		return types;
 	}
+	@Override
+	public List<String> getOrderedLemmaList() { // a word node yields exactly one entry: its own word
+		List<String> wordOnly = new ArrayList<String>();
+		wordOnly.add(getWord());
+		return wordOnly;
+	}
 }