You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2016/11/16 09:05:03 UTC
[4/5] opennlp-sandbox git commit: merge from bgalitsky's own git repo

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
index cb6f3e9..694abce 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
@@ -19,11 +19,11 @@ package opennlp.tools.parse_thicket.matching;
 
 import java.util.List;
 
-import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.PStemmer;
 
 public class LemmaFormManager {
 
-  public String matchLemmas(PorterStemmer ps, String lemma1, String lemma2,
+  public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
       String POS) {
     if (POS == null) {
       return null;
@@ -95,7 +95,7 @@ public class LemmaFormManager {
     // if (sim!=null && (lemmaMatch!=null && !lemmaMatch.equals("fail"))){
 
   }
-
+/*
   // all lemmas ending with # in ch1 and/or ch2 SHOULD occur in chunkToAdd
   public boolean mustOccurVerifier(ParseTreePath ch1, ParseTreePath ch2,
       ParseTreePath chunkToAdd) {
@@ -112,5 +112,5 @@ public class LemmaFormManager {
     }
     return true;
   }
-
+*/
 }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
index 0830276..8540ff2 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
@@ -1,26 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package opennlp.tools.parse_thicket.matching;
 
+import java.io.File;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-
-
 import opennlp.tools.parse_thicket.IGeneralizer;
-import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseCorefBuilderWithNER;
 import opennlp.tools.parse_thicket.ParseThicket;
 import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.textsimilarity.LemmaPair;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
-import opennlp.tools.textsimilarity.SentencePairMatchResult;
-import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
 public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
-	ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
-	ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();
+	public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/test/resources";
+	VerbNetProcessor proc = VerbNetProcessor.getInstance(resourceDir);
+
+	protected PhraseGroupGeneralizer pgGen = new PhraseGroupGeneralizer();
+
+	protected static ParseCorefBuilderWithNER ptBuilder = null;
+	
+	static {
+		synchronized (Matcher.class) {
+			ptBuilder = ParseCorefBuilderWithNER.getInstance();
+		}
+	}
+	
+	
 	PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
-	Map<String, ParseThicket> parseThicketHash = new HashMap<String, ParseThicket>();
+	protected Map<String, ParseThicket> parseThicketHash = new HashMap<String, ParseThicket>();
+
+
 	/**	   * The key function of similarity component which takes two portions of text
 	 * and does similarity assessment by finding the set of all maximum common
 	 * subtrees of the set of parse trees for each portion of text
@@ -31,11 +58,16 @@ public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
 	 *          text 2
 	 * @return the matching results structure, which includes the similarity score
 	 */
-	
-	public Matcher(){
-		
+	private static Matcher instance;
+
+	public synchronized static Matcher getInstance() {
+		if (instance == null)
+			instance = new Matcher();
+
+		return instance;
 	}
-	
+
+
 	public List<List<ParseTreeChunk>> assessRelevance(String para1, String para2) {
 		// first build PTs for each text
 		ParseThicket pt1 = ptBuilder.buildParseThicket(para1);
@@ -47,28 +79,60 @@ public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
 		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 
 				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
 
-		
-		List<List<ParseTreeChunk>> res = md
-				.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+
+		List<List<ParseTreeChunk>> res = pgGen.generalize(sent1GrpLst, sent2GrpLst);
+
 		return res;
 
 	}
-	
+
+
+	public List<List<ParseTreeChunk>> assessRelevance(List<List<ParseTreeChunk>> para0, String para2) {
+		// build the PT for the second text only; para0 is already grouped into phrases
+
+		ParseThicket pt2 = ptBuilder.buildParseThicket(para2);
+		// then build phrases and rst arcs
+		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
+		// group phrases by type
+		List<List<ParseTreeChunk>> sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
+
+
+		List<List<ParseTreeChunk>> res = pgGen.generalize(para0, sent2GrpLst);
+
+		return res;
+
+	}
+
+	public GeneralizationResult  assessRelevanceG(List<List<ParseTreeChunk>> para0, String para2) {
+		List<List<ParseTreeChunk>> res = assessRelevance( para0, para2);
+		return new GeneralizationResult(res);
+	}
+
+	public GeneralizationResult  assessRelevanceG(String para0, String para2) {
+		List<List<ParseTreeChunk>> res = assessRelevance( para0, para2);
+		return new GeneralizationResult(res);
+	}
+
+	public GeneralizationResult  assessRelevanceG(GeneralizationResult  para0, String para2) {
+		List<List<ParseTreeChunk>> res = assessRelevance( para0.getGen(), para2);
+		return new GeneralizationResult(res);
+	}
+
 	public List<List<ParseTreeChunk>> assessRelevanceCache(String para1, String para2) {
 		// first build PTs for each text
-		
+
 		ParseThicket pt1 = parseThicketHash.get(para1);
 		if (pt1==null){
-			 pt1=	ptBuilder.buildParseThicket(para1);
-			 parseThicketHash.put(para1, pt1);
+			pt1=	ptBuilder.buildParseThicket(para1);
+			parseThicketHash.put(para1, pt1);
 		}
-		
+
 		ParseThicket pt2 = parseThicketHash.get(para2);
 		if (pt2==null){
-			 pt2=	ptBuilder.buildParseThicket(para2);
-			 parseThicketHash.put(para2, pt2);
+			pt2=	ptBuilder.buildParseThicket(para2);
+			parseThicketHash.put(para2, pt2);
 		}
-		
+
 		// then build phrases and rst arcs
 		List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
 		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
@@ -76,31 +140,29 @@ public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
 		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 
 				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
 
-		
-		List<List<ParseTreeChunk>> res = md
-				.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
+
+		List<List<ParseTreeChunk>> res = pgGen.generalize(sent1GrpLst, sent2GrpLst);
 		return res;
 
 	}
-	
+
 	public List<List<ParseTreeChunk>> generalize(List<List<ParseTreeNode>> phrs1,
 			List<List<ParseTreeNode>> phrs2) {
 		// group phrases by type
-				List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 
-						sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 
+				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
 
-				
-				List<List<ParseTreeChunk>> res = md
-						.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
-				return res;
+
+		List<List<ParseTreeChunk>> res = pgGen.generalize(sent1GrpLst, sent2GrpLst);
+		return res;
 	}
-	private List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
+	protected List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
 			List<List<ParseTreeNode>> phrs) {
 		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
 		List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), 
 				pps = new ArrayList<ParseTreeChunk>();
 		for(List<ParseTreeNode> ps:phrs){
-			ParseTreeChunk ch = convertNodeListIntoChunk(ps);
+			ParseTreeChunk ch = new ParseTreeChunk(ps);
 			String ptype = ps.get(0).getPhraseType();
 			if (ptype.equals("NP")){
 				nps.add(ch);
@@ -122,16 +184,31 @@ public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
 		}
 		ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
 		ch.setMainPOS(ps.get(0).getPhraseType());
+		ch.setParseTreeNodes(ps);
 		return ch;
 	}
-	
+
 	// this function is the main entry point into the PT builder if rst arcs are required
 	public ParseThicket buildParseThicketFromTextWithRST(String para){
 		ParseThicket pt = ptBuilder.buildParseThicket(para);
-		phraseBuilder.buildPT2ptPhrases(pt);
+		List<List<ParseTreeNode>> phrs = phraseBuilder.buildPT2ptPhrases(pt);
+		pt.setPhrases(phrs);
 		return pt;	
 	}
 
+	// verify that all sections (NP, PRP and VP) are present
+	public boolean isCoveredByTemplate(List<List<ParseTreeChunk>> template, List<List<ParseTreeChunk>> sampleGen){
+		try {
+			if (template.size() == sampleGen.size() && sampleGen.get(0).size()>0  &&  sampleGen.get(1).size()>0  )
+				//template.get(0).get(0).getParseTreeNodes().size() == template.get(0).get(0).size())
+				return true;
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+
+		return false;
+	}
 
 	@Override
 	public List<List<List<ParseTreeNode>>> generalize(Object o1, Object o2) {
@@ -139,4 +216,48 @@ public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
 		return null;
 	}
 
+
+	public static void main(String[] args){
+		Matcher m = new Matcher();
+
+		m.buildParseThicketFromTextWithRST("Mary Poppins got her identification 8765");
+
+		List<List<ParseTreeChunk>> template = m.assessRelevance("John Doe send his California driver license 1234567", 
+				"John Travolta send her california license 4567456"
+				//"New York hid her US social number 666-66-6666");
+				);
+
+		System.out.println(template+"\n");
+		//in		
+		List<List<ParseTreeChunk>> res = m.assessRelevance(template, "Mary Jones send her Canada prisoner id number 666666666");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "Mary Stewart hid her Mexico cook id number 666666666");
+		System.out.println(res + " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "Robin mentioned her Peru fisher id  2345");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "Yesterday Peter Doe hid his Bolivia set id number 666666666");
+		System.out.println(res + " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "Robin mentioned her best Peru fisher man id  2345");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+		//out		
+		res = m.assessRelevance(template, "Spain hid her Canada driver id number 666666666");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "John Poppins hid her  prisoner id  666666666");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+
+		res = m.assessRelevance(template, "Microsoft announced its Windows Azure release number 666666666");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+		res = m.assessRelevance(template, "John Poppins hid her Google id  666666666");
+		System.out.println(res+ " => "+
+				m.isCoveredByTemplate(template, res));
+	}
 }
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
index 7612f26..5f07593 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
@@ -1,26 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package opennlp.tools.parse_thicket.matching;
 
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.logging.Logger;
 
 import opennlp.tools.parse_thicket.ParseThicket;
 import opennlp.tools.parse_thicket.ParseTreeNode;
 import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
 import opennlp.tools.parse_thicket.rhetoric_structure.RhetoricStructureArcsBuilder;
-
-import org.jgrapht.Graph;
-import org.jgrapht.graph.DefaultEdge;
-import org.jgrapht.graph.SimpleGraph;
-
-
 import edu.stanford.nlp.trees.Tree;
 
 public class PT2ThicketPhraseBuilder {
-	
+
 	RhetoricStructureArcsBuilder rstBuilder = new RhetoricStructureArcsBuilder();
-	
+	private static Logger log = Logger
+		      .getLogger("opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder");
+
 	/*
 	 * Building phrases takes a Parse Thicket and forms phrases for each sentence individually
 	 * Then based on built phrases and obtained arcs, it builds arcs for RST
@@ -29,108 +43,111 @@ public class PT2ThicketPhraseBuilder {
 
 	public List<List<ParseTreeNode>> buildPT2ptPhrases(ParseThicket pt ) {
 		List<List<ParseTreeNode>> phrasesAllSent = new ArrayList<List<ParseTreeNode>> ();
+		if (pt ==null) // parsing failed, return empty
+			return phrasesAllSent;
 		Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>();
 		// build regular phrases
 		for(int nSent=0; nSent<pt.getSentences().size(); nSent++){
-			
-			
 			List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent);
 			Tree ptree = pt.getSentences().get(nSent);
 			//ptree.pennPrint();
 			List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence);
-			System.out.println(phrases);
+			log.info(phrases.toString());
 			phrasesAllSent.addAll(phrases);
 			sentNumPhrases.put(nSent, phrases);
 
 		}
-		
+
 		// discover and add RST arcs
 		List<WordWordInterSentenceRelationArc> arcsRST =
 				rstBuilder.buildRSTArcsFromMarkersAndCorefs(pt.getArcs(), sentNumPhrases, pt);
-		
+
 		List<WordWordInterSentenceRelationArc> arcs = pt.getArcs();
 		arcs.addAll(arcsRST);
 		pt.setArcs(arcs);
 		
-		
+		if (pt.getArcs().size()>20){
+			log.info(pt.toString());
+		}
+
 		List<List<ParseTreeNode>> expandedPhrases = expandTowardsThicketPhrases(phrasesAllSent, pt.getArcs(), sentNumPhrases, pt);
 		return expandedPhrases;
 	}
 
-/* Take all phrases, all arcs and merge phrases into Thicket phrases.
- * Then add the set of generalized (Thicket) phrases to the input set of phrases
- * phrasesAllSent - list of lists of phrases for each sentence
- * sentNumPhrase - map , gives for each sentence id, the above list
- * arcs - arcs formed so far
- * pt - the built Parse Thicket
- */
-	private List<List<ParseTreeNode>> expandTowardsThicketPhrases(
+	/* Take all phrases, all arcs and merge phrases into Thicket phrases.
+	 * Then add the set of generalized (Thicket) phrases to the input set of phrases
+	 * phrasesAllSent - list of lists of phrases for each sentence
+	 * sentNumPhrases - map from sentence index to the list of phrases built for that sentence
+	 * arcs - arcs formed so far
+	 * pt - the built Parse Thicket
+	 */
+	protected List<List<ParseTreeNode>> expandTowardsThicketPhrases(
 			List<List<ParseTreeNode>> phrasesAllSent,
 			List<WordWordInterSentenceRelationArc> arcs,
 			Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases, 
 			ParseThicket pt ) {
 		List<List<ParseTreeNode>> thicketPhrasesAllSent = new ArrayList<List<ParseTreeNode>>();
-		
-		
-			for(int nSent=0; nSent<pt.getSentences().size(); nSent++){
-				for(int mSent=nSent+1; mSent<pt.getSentences().size(); mSent++){
-					// for given arc, find phrases connected by this arc and add to the list of phrases
-					for(WordWordInterSentenceRelationArc arc: arcs){
-						List<List<ParseTreeNode>> phrasesFrom = sentNumPhrases.get(nSent);
-						List<List<ParseTreeNode>> phrasesTo = sentNumPhrases.get(mSent);
-						int fromIndex = arc.getCodeFrom().getFirst();
-						int toIndex = arc.getCodeTo().getFirst();
-						if (nSent==fromIndex && mSent==toIndex){
-							int sentPosFrom = arc.getCodeFrom().getSecond();
-							int sentPosTo = arc.getCodeTo().getSecond();
-							// for the given arc arc, find phrases which are connected by it
-							List<ParseTreeNode> lFromFound = null, lToFound = null;
-							for(List<ParseTreeNode> lFrom: phrasesFrom){
-								if (lToFound!=null)
+
+
+		for(int nSent=0; nSent<pt.getSentences().size(); nSent++){
+			for(int mSent=nSent+1; mSent<pt.getSentences().size(); mSent++){
+				// for given arc, find phrases connected by this arc and add to the list of phrases
+				for(WordWordInterSentenceRelationArc arc: arcs){
+					List<List<ParseTreeNode>> phrasesFrom = sentNumPhrases.get(nSent);
+					List<List<ParseTreeNode>> phrasesTo = sentNumPhrases.get(mSent);
+					int fromIndex = arc.getCodeFrom().getFirst();
+					int toIndex = arc.getCodeTo().getFirst();
+					if (nSent==fromIndex && mSent==toIndex){
+						int sentPosFrom = arc.getCodeFrom().getSecond();
+						int sentPosTo = arc.getCodeTo().getSecond();
+						// for the given arc arc, find phrases which are connected by it
+						List<ParseTreeNode> lFromFound = null, lToFound = null;
+						for(List<ParseTreeNode> lFrom: phrasesFrom){
+							if (lToFound!=null)
+								break;
+							for(ParseTreeNode lFromP: lFrom){
+								if (lFromP.getId()!=null &&  lFromP.getId()==sentPosFrom){
+									lFromFound = lFrom;
 									break;
-								for(ParseTreeNode lFromP: lFrom){
-									if (lFromP.getId()!=null &&  lFromP.getId()==sentPosFrom){
-											lFromFound = lFrom;
-											break;
-										}
 								}
 							}
-							for(List<ParseTreeNode> lTo: phrasesTo){
-								if (lToFound!=null)
+						}
+						for(List<ParseTreeNode> lTo: phrasesTo){
+							if (lToFound!=null)
+								break;
+							for(ParseTreeNode lToP: lTo)
+								if (lToP.getId()!=null && lToP.getId()==sentPosTo){
+									lToFound = lTo;
 									break;
-								for(ParseTreeNode lToP: lTo)
-									if (lToP.getId()!=null && lToP.getId()==sentPosTo){
-										lToFound = lTo;
-										break;
-									}
-							}
-							// obtain a thicket phrase and add it to the list
-							if (lFromFound!=null && lToFound!=null){
-								
-								if (identicalSubPhrase(lFromFound, lToFound))
-									continue;
-								List<ParseTreeNode> appended = append(lFromFound, lToFound);
-								if (thicketPhrasesAllSent.contains(appended))
-									continue;
-								System.out.println("rel: "+arc);
-								System.out.println("From "+lFromFound);
-								System.out.println("TO "+lToFound);
-								thicketPhrasesAllSent.add(append(lFromFound, lToFound));	
-								//break;
-							}
+								}
+						}
+						// obtain a thicket phrase and add it to the list
+						if (lFromFound!=null && lToFound!=null){
+
+							if (identicalSubPhrase(lFromFound, lToFound))
+								continue;
+							List<ParseTreeNode> appended = append(lFromFound, lToFound);
+							if (thicketPhrasesAllSent.contains(appended))
+								continue;
+							log.info("rel: "+arc);
+							log.info("From "+lFromFound);
+							System.out.println("TO "+lToFound);
+							thicketPhrasesAllSent.add(append(lFromFound, lToFound));	
+							//break;
 						}
-						
 					}
+
 				}
 			}
-			phrasesAllSent.addAll(thicketPhrasesAllSent);
-			return phrasesAllSent;
+		}
+		phrasesAllSent.addAll(thicketPhrasesAllSent);
+		return phrasesAllSent;
 	}
 
-/* check that one phrase is subphrase of another by lemma (ignoring other node properties)
- * returns true if not found different word
- */
-	
+	/* check that one phrase is subphrase of another by lemma (ignoring other node properties)
+	 * returns true if no differing word is found
+	 */
+
 	private boolean identicalSubPhrase(List<ParseTreeNode> lFromFound,
 			List<ParseTreeNode> lToFound) {
 		for(int pos=0; pos<lFromFound.size()&& pos<lToFound.size(); pos++){
@@ -143,8 +160,17 @@ public class PT2ThicketPhraseBuilder {
 	private List<ParseTreeNode> append(List<ParseTreeNode> lFromFound,
 			List<ParseTreeNode> lToFound) {
 		List<ParseTreeNode> appendList = new ArrayList<ParseTreeNode>();
-		appendList.addAll(lFromFound);
-		appendList.addAll(lToFound);
+		if (lFromFound.get(0).getPhraseType().equals(lToFound.get(0).getPhraseType())){
+			appendList.addAll(lFromFound);
+			appendList.addAll(lToFound);
+		} else {
+			String pType = lFromFound.get(0).getPhraseType();
+			appendList.addAll(lFromFound);
+			for(ParseTreeNode p: lToFound){
+				p.setPhraseType(pType);
+				appendList.add(p);
+			}
+		}
 		return appendList;
 	}
 
@@ -159,10 +185,10 @@ public class PT2ThicketPhraseBuilder {
 	}
 
 
-	
 
-/*
- * 
+
+	/*
+	 * 
 [[<1>NP'Iran':NNP], [<2>VP'refuses':VBZ, <3>VP'to':TO, <4>VP'accept':VB, <5>VP'the':DT, <6>VP'UN':NNP, 
 <7>VP'proposal':NN, <8>VP'to':TO, <9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, <12>VP'over':IN, <13>VP'its':PRP$,
  <14>VP'work':NN, <15>VP'on':IN, <16>VP'nuclear':JJ, <17>VP'weapons':NNS], [<3>VP'to':TO, <4>VP'accept':VB, <5>VP'the':DT,
@@ -177,9 +203,9 @@ public class PT2ThicketPhraseBuilder {
    <14>PP'work':NN, <15>PP'on':IN, <16>PP'nuclear':JJ, <17>PP'weapons':NNS], [<13>NP'its':PRP$, <14>NP'work':NN, 
    <15>NP'on':IN, <16>NP'nuclear':JJ, <17>NP'weapons':NNS], [<13>NP'its':PRP$, <14>NP'work':NN],
  [<15>PP'on':IN, <16>PP'nuclear':JJ, <17>PP'weapons':NNS], [<16>NP'nuclear':JJ, <17>NP'weapons':NNS]]
- *  
- * 
- */
+	 *  
+	 * 
+	 */
 	private void navigateR(Tree t, List<ParseTreeNode> sentence,
 			List<List<ParseTreeNode>> phrases) {
 		if (!t.isPreTerminal()) {
@@ -191,7 +217,17 @@ public class PT2ThicketPhraseBuilder {
 					if (!nodes.isEmpty())
 						phrases.add(nodes);
 					if (nodes.size()>0 && nodes.get(0).getId()==null){
-							System.err.println("Failed alignment:"+nodes);
+						if (nodes.size()>1 && nodes.get(1)!=null && nodes.get(1).getId()!=null){
+							try {
+								ParseTreeNode n = nodes.get(0);
+								n.setId(nodes.get(1).getId()-1);
+								nodes.set(0, n);
+							} catch (Exception e) {
+								e.printStackTrace();
+							}
+						} else {
+							log.severe("Failed alignment:"+nodes);
+						}
 					}
 				}
 			}
@@ -204,22 +240,22 @@ public class PT2ThicketPhraseBuilder {
 			return ;
 		}
 	}
-	
-	
+
+
 	/* alignment of phrases extracted from tree against the sentence as a list of lemma-pos */
-	
+
 	private List<ParseTreeNode> assignIndexToNodes(List<ParseTreeNode> node,
 			List<ParseTreeNode> sentence) {
 		if (sentence==null || sentence.size()<1)
 			return node;
-		
+
 		List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();
-		
+
 		for(int i= 0; i<node.size(); i++){
 			String thisLemma = node.get(i).getWord();			
 			String thisPOS = node.get(i).getPos();
 			String nextLemma = null, nextPOS = null;
-			
+
 			if (i+1<node.size()){
 				nextLemma = node.get(i+1).getWord();
 				nextPOS = node.get(i+1).getPos();
@@ -231,20 +267,21 @@ public class PT2ThicketPhraseBuilder {
 					continue;
 				if (i+1<node.size() && j+1 < sentence.size() && nextLemma!=null 
 						&& ! (sentence.get(j+1).getWord().equals(nextLemma)
-					  && sentence.get(j+1).getPos().equals(nextPOS)))
+								&& sentence.get(j+1).getPos().equals(nextPOS)))
 					continue;
 				matchOccurred = true;
 				break;
 			}
-			
+
 			ParseTreeNode n = node.get(i);
 			if (matchOccurred){
 				n.setId(sentence.get(j).getId());
 				n.setNe(sentence.get(j).getNe());
+				n.setAttributes(sentence.get(j).getAttributes());
 			}
 			results.add(n);
 		}
-		
+
 		try {
 			if (results!=null && results.size()>1 && results.get(0)!=null && results.get(0).getId()!=null &&
 					results.get(1) !=null && results.get(1).getId()!=null &&  results.get(1).getId()>0){
@@ -313,53 +350,55 @@ public class PT2ThicketPhraseBuilder {
 			return nlist;
 		if (value.equals("ROOT")|| value.equals("S")) 
 			return nlist;
-		
+
 		String[] pos_value = value.split(" ");
 		ParseTreeNode node = null;
 		if (value.endsWith("P")){
 			node = new ParseTreeNode("", ""); 
-		    node.setPhraseType(value);
+			node.setPhraseType(value);
 		} else 
-		if (pos_value != null && pos_value.length==2){
-			node = new ParseTreeNode(pos_value[0], pos_value[1]);
-		} else {
-			node = new ParseTreeNode(value, "");
-		}
-			
+			if (pos_value != null && pos_value.length==2){
+				node = new ParseTreeNode(pos_value[0], pos_value[1]);
+			} else {
+				node = new ParseTreeNode(value, "");
+			}
+
 		nlist.add(node);
 		return nlist;
 	}
-	
+
 	private ParseTreeNode parsePhraseNode(String value) {
-		
+
 		if (value.equals("ROOT")|| value.equals("S")) 
 			return null;
-		
+
 		String[] pos_value = value.split(" ");
 		ParseTreeNode node = null;
 		if (value.endsWith("P")){
 			node = new ParseTreeNode("", ""); 
-		    node.setPhraseType(value);
+			node.setPhraseType(value);
 		} else 
-		if (pos_value != null && pos_value.length==2){
-			node = new ParseTreeNode(pos_value[0], pos_value[1]);
-		} else {
-			node = new ParseTreeNode(value, "");
-		}			
-		
+			if (pos_value != null && pos_value.length==2){
+				node = new ParseTreeNode(pos_value[0], pos_value[1]);
+			} else {
+				node = new ParseTreeNode(value, "");
+			}			
+
 		return node;
 	}
-	
+
 	public List<ParseTreeNode> parsePhrase(String value, String fullDump) {
-		
+
 		List<ParseTreeNode> nlist = new ArrayList<ParseTreeNode>(); 
 		if (value.equals("S")|| value.equals("ROOT"))
-				return nlist;
+			return nlist;
+		// first phrase type normalization
+		fullDump = fullDump.replace("NP-TMP", "NP");
 		
 		String flattened = fullDump.replace("(ROOT","").replace("(NP","").replace("(VP","").replace("(PP","")
 				.replace("(ADVP","").replace("(UCP","").replace("(ADJP","").replace("(SBAR","").
 				replace("(PRT", "").replace("(WHNP","").
-				 replace("))))",")").replace(")))",")").replace("))",")")
+				replace("))))",")").replace(")))",")").replace("))",")")
 				.replace("   ", " ").replace("  ", " ").replace("(S","")
 				.replace(") (","#").replace(")  (", "#");
 		String[] flattenedArr =  flattened.split("#");
@@ -373,9 +412,9 @@ public class PT2ThicketPhraseBuilder {
 		}
 		return nlist;
 	}
-	
-/* recursion example */
-	
+
+	/* recursion example */
+
 	private StringBuilder toStringBuilder(StringBuilder sb, Tree t) {
 		if (t.isLeaf()) {
 			if (t.label() != null) {
@@ -399,23 +438,40 @@ public class PT2ThicketPhraseBuilder {
 			return sb.append(')');
 		}
 	}
-	
+
 	public static void main(String[] args){
+		Matcher matcher = new Matcher();
+		String para = 
+				"Last Wednesday, world powers reached agreement with Iran on limiting Iranian nuclear activity in return for the lifting of sanctions. "
+		/*+
+						"The Israeli Prime Minister called the deal an historic mistake which would only make it easier for Iran to back its proxies in the Middle East. "+
+						"That position may have hardened after Iran's supreme leader Ayatollah Ali Khamenei said his country would continue its support for the people of Palestine after the deal. "+
+						"Saudi Arabia has officially said it supports the deal, although it is also thought to have similar concerns to Israel that the agreement legitimises Iran. "
+						*/
+						;
+		matcher.buildParseThicketFromTextWithRST(para);
+		
+		
 		PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
 		String line = "(NP (NNP Iran)) (VP (VBZ refuses) (S (VP (TO to) (VP (VB accept) (S (NP (DT the) " +
 				"(NNP UN) (NN proposal)) (VP (TO to) (VP (VB end) (NP (PRP$ its) (NN dispute))))))))";
-		
+
 		List<ParseTreeNode> res = phraseBuilder. parsePhrase("NP", line);
 		System.out.println(res);
-		
+
 
 		line = "(VP (VBP am) (NP (NP (DT a) (NNP US) (NN citizen)) (UCP (VP (VBG living) (ADVP (RB abroad))) (, ,) (CC and) (ADJP (JJ concerned) (PP (IN about) (NP (NP (DT the) (NN health) (NN reform) (NN regulation)) (PP (IN of) (NP (CD 2014)))))))))";
 		res = phraseBuilder. parsePhrase("VP", line);
 		System.out.println(res);
-				
+
 		line = "(VP (TO to) (VP (VB wait) (SBAR (IN till) (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ sick) (S (VP (TO to) (VP (VB buy) (NP (NN health) (NN insurance)))))))))))";
 		res = phraseBuilder. parsePhrase("VP", line);
 		System.out.println(res);
 	}
-  
+
 }
+/*
+ * The Ukrainian government, Western leaders and Nato all say there is clear evidence that Russia is helping the rebels in the eastern Donetsk and Luhansk regions with heavy weapons and soldiers. Independent experts echo that accusation.
+Moscow denies it, insisting that any Russians serving with the rebels are volunteers.
+
+*/

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java
deleted file mode 100644
index 21e7f52..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.parse_thicket.matching;
-
-import java.util.List;
-
-public class ParseTreeChunkListScorer {
-  // find the single expression with the highest score
-  public double getParseTreeChunkListScore(
-      List<List<ParseTreePath>> matchResult) {
-    double currScore = 0.0;
-    for (List<ParseTreePath> chunksGivenPhraseType : matchResult)
-      for (ParseTreePath chunk : chunksGivenPhraseType) {
-        Double score = getScore(chunk);
-        // System.out.println(chunk+ " => score >>> "+score);
-        if (score > currScore) {
-          currScore = score;
-        }
-      }
-    return currScore;
-  }
-
-  // get max score per phrase type and then sum up
-  public double getParseTreeChunkListScoreAggregPhraseType(
-      List<List<ParseTreePath>> matchResult) {
-    double currScoreTotal = 0.0;
-    for (List<ParseTreePath> chunksGivenPhraseType : matchResult) {
-      double currScorePT = 0.0;
-      for (ParseTreePath chunk : chunksGivenPhraseType) {
-        Double score = getScore(chunk);
-        // System.out.println(chunk+ " => score >>> "+score);
-        if (score > currScorePT) {
-          currScorePT = score;
-        }
-      }
-      // if substantial for given phrase type
-      if (currScorePT > 0.5) {
-        currScoreTotal += currScorePT;
-      }
-    }
-    return currScoreTotal;
-  }
-
-  // score is meaningful only for chunks which are results of generalization
-
-  public double getScore(ParseTreePath chunk) {
-    double score = 0.0;
-    int i = 0;
-    for (String l : chunk.getLemmas()) {
-      String pos = chunk.getPOSs().get(i);
-      if (l.equals("*")) {
-        if (pos.startsWith("CD")) { // number vs number gives high score
-                                    // although different numbers
-          score += 0.7;
-        } else if (pos.endsWith("_high")) { // if query modification adds 'high'
-          score += 1.0;
-        } else {
-          score += 0.1;
-        }
-      } else {
-
-        if (pos.startsWith("NN") || pos.startsWith("NP")
-            || pos.startsWith("CD") || pos.startsWith("RB")) {
-          score += 1.0;
-        } else if (pos.startsWith("VB") || pos.startsWith("JJ")) {
-          if (l.equals("get")) { // 'common' verbs are not that important
-            score += 0.3;
-          } else {
-            score += 0.5;
-          }
-        } else {
-          score += 0.3;
-        }
-      }
-      i++;
-
-    }
-    return score;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java
deleted file mode 100644
index d0bf61f..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.parse_thicket.matching;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.textsimilarity.LemmaPair;
-
-public class ParseTreePath {
-  private String mainPOS;
-
-  private List<String> lemmas;
-
-  private List<String> POSs;
-  //order number of a word in a sentence
-  private List<Integer> wordUniqueCodes;
-
-  private int startPos;
-
-  private int endPos;
-
-  private int size;
-
-  private ParseTreePathMatcher parseTreeMatcher;
-
-  private LemmaFormManager lemmaFormManager;
-
-  private GeneralizationListReducer generalizationListReducer;
-
-  public ParseTreePath() {
-  }
-
-  public ParseTreePath(List<String> lemmas, List<String> POSs, int startPos,
-      int endPos) {
-    this.lemmas = lemmas;
-    this.POSs = POSs;
-    this.startPos = startPos;
-    this.endPos = endPos;
-
-  }
-
-  // constructor which takes lemmas and POS as lists so that phrases can be
-  // conveniently specified.
-  // usage: stand-alone runs
-  public ParseTreePath(String mPOS, String[] lemmas, String[] POSss) {
-    this.mainPOS = mPOS;
-    this.lemmas = new ArrayList<String>();
-    for (String l : lemmas) {
-      this.lemmas.add(l);
-    }
-    if (mPOS.equals("SENTENCE")){
-    	for(int i=0; i<lemmas.length; i++){
-    		wordUniqueCodes.add(this.lemmas.get(i).hashCode());
-    	}
-    }
-    
-    this.POSs = new ArrayList<String>();
-    for (String p : POSss) {
-      this.POSs.add(p);
-    }
-  }
-
-  // constructor which takes lemmas and POS as lists so that phrases can be
-  // conveniently specified.
-  // usage: stand-alone runs
-  public ParseTreePath(String mPOS, List<String> lemmas, List<String> POSss) {
-    this.mainPOS = mPOS;
-    this.lemmas = lemmas;
-    this.POSs = POSss;
-
-  }
-
-  // Before:
-  // [0(S-At home we like to eat great pizza deals), 0(PP-At home), 0(IN-At),
-  // 3(NP-home), 3(NN-home), 8(NP-we),
-  // 8(PRP-we), 11(VP-like to eat great pizza deals), 11(VBP-like), 16(S-to eat
-  // great pizza deals), 16(VP-to eat great
-  // pizza deals),
-  // 16(TO-to), 19(VP-eat great pizza deals), 19(VB-eat), 23(NP-great pizza
-  // deals), 23(JJ-great), 29(NN-pizza),
-  // 35(NNS-deals)]
-
-  // After:
-  // [S [IN-At NP-home NP-we VBP-like ], PP [IN-At NP-home ], IN [IN-At ], NP
-  // [NP-home ], NN [NP-home ], NP [NP-we ],
-  // PRP [NP-we ], VP [VBP-like TO-to VB-eat JJ-great ], VBP [VBP-like ], S
-  // [TO-to VB-eat JJ-great NN-pizza ], VP
-  // [TO-to VB-eat JJ-great NN-pizza ], TO [TO-to ], VP [VB-eat JJ-great
-  // NN-pizza NNS-deals ],
-  // VB [VB-eat ], NP [JJ-great NN-pizza NNS-deals ], JJ [JJ-great ], NN
-  // [NN-pizza ], NNS [NNS-deals ]]
-
-  public List<ParseTreePath> buildChunks(List<LemmaPair> parseResults) {
-    List<ParseTreePath> chunksResults = new ArrayList<ParseTreePath>();
-    for (LemmaPair chunk : parseResults) {
-      String[] lemmasAr = chunk.getLemma().split(" ");
-      List<String> poss = new ArrayList<String>(), lems = new ArrayList<String>();
-      for (String lem : lemmasAr) {
-        lems.add(lem);
-        // now looking for POSs for individual word
-        for (LemmaPair chunkCur : parseResults) {
-          if (chunkCur.getLemma().equals(lem)
-              &&
-              // check that this is a proper word in proper position
-              chunkCur.getEndPos() <= chunk.getEndPos()
-              && chunkCur.getStartPos() >= chunk.getStartPos()) {
-            poss.add(chunkCur.getPOS());
-            break;
-          }
-        }
-      }
-      if (lems.size() != poss.size()) {
-        System.err.println("lems.size()!= poss.size()");
-      }
-      if (lems.size() < 2) { // single word phrase, nothing to match
-        continue;
-      }
-      ParseTreePath ch = new ParseTreePath(lems, poss, chunk.getStartPos(),
-          chunk.getEndPos());
-      ch.setMainPOS(chunk.getPOS());
-      chunksResults.add(ch);
-    }
-    return chunksResults;
-  }
-
-  public List<List<ParseTreePath>> matchTwoSentencesGivenPairLists(
-      List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) {
-
-    List<ParseTreePath> chunk1List = buildChunks(sent1Pairs);
-    List<ParseTreePath> chunk2List = buildChunks(sent2Pairs);
-
-    List<List<ParseTreePath>> sent1GrpLst = groupChunksAsParses(chunk1List);
-    List<List<ParseTreePath>> sent2GrpLst = groupChunksAsParses(chunk2List);
-
-    System.out.println("=== Grouped chunks 1 " + sent1GrpLst);
-    System.out.println("=== Grouped chunks 2 " + sent2GrpLst);
-
-    return matchTwoSentencesGroupedChunks(sent1GrpLst, sent2GrpLst);
-  }
-
-  // groups noun phrases, verb phrases, propos phrases etc. for separate match
-
-  public List<List<ParseTreePath>> groupChunksAsParses(
-      List<ParseTreePath> parseResults) {
-    List<ParseTreePath> np = new ArrayList<ParseTreePath>(), vp = new ArrayList<ParseTreePath>(), prp = new ArrayList<ParseTreePath>(), sbarp = new ArrayList<ParseTreePath>(), pp = new ArrayList<ParseTreePath>(), adjp = new ArrayList<ParseTreePath>(), whadvp = new ArrayList<ParseTreePath>(), restOfPhrasesTypes = new ArrayList<ParseTreePath>();
-    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
-    for (ParseTreePath ch : parseResults) {
-      String mainPos = ch.getMainPOS().toLowerCase();
-
-      if (mainPos.equals("s")) {
-        continue;
-      }
-      if (mainPos.equals("np")) {
-        np.add(ch);
-      } else if (mainPos.equals("vp")) {
-        vp.add(ch);
-      } else if (mainPos.equals("prp")) {
-        prp.add(ch);
-      } else if (mainPos.equals("pp")) {
-        pp.add(ch);
-      } else if (mainPos.equals("adjp")) {
-        adjp.add(ch);
-      } else if (mainPos.equals("whadvp")) {
-        whadvp.add(ch);
-      } else if (mainPos.equals("sbar")) {
-        sbarp.add(ch);
-      } else {
-        restOfPhrasesTypes.add(ch);
-      }
-
-    }
-    results.add(np);
-    results.add(vp);
-    results.add(prp);
-    results.add(pp);
-    results.add(adjp);
-    results.add(whadvp);
-    results.add(restOfPhrasesTypes);
-
-    return results;
-
-  }
-
-  // main function to generalize two expressions grouped by phrase types
-  // returns a list of generalizations for each phrase type with filtered
-  // sub-expressions
-  public List<List<ParseTreePath>> matchTwoSentencesGroupedChunks(
-      List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {
-    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
-    // first irerate through component
-    for (int comp = 0; comp < 2 && // just np & vp
-        comp < sent1.size() && comp < sent2.size(); comp++) {
-      List<ParseTreePath> resultComps = new ArrayList<ParseTreePath>();
-      // then iterate through each phrase in each component
-      for (ParseTreePath ch1 : sent1.get(comp)) {
-        for (ParseTreePath ch2 : sent2.get(comp)) { // simpler version
-          ParseTreePath chunkToAdd = parseTreeMatcher
-              .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
-                  ch1, ch2);
-
-          if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {
-            continue; // if the words which have to stay do not stay, proceed to
-                      // other elements
-          }
-          Boolean alreadyThere = false;
-          for (ParseTreePath chunk : resultComps) {
-            if (chunk.equalsTo(chunkToAdd)) {
-              alreadyThere = true;
-              break;
-            }
-
-            if (parseTreeMatcher
-                .generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
-                    chunkToAdd).equalsTo(chunkToAdd)) {
-              alreadyThere = true;
-              break;
-            }
-          }
-
-          if (!alreadyThere) {
-            resultComps.add(chunkToAdd);
-          }
-
-          List<ParseTreePath> resultCompsReduced = generalizationListReducer
-              .applyFilteringBySubsumption(resultComps);
-          // if (resultCompsReduced.size() != resultComps.size())
-          // System.out.println("reduction of gen list occurred");
-        }
-      }
-      results.add(resultComps);
-    }
-
-    return results;
-  }
-
-  public Boolean equals(ParseTreePath ch) {
-    List<String> lems = ch.getLemmas();
-    List<String> poss = ch.POSs;
-
-    if (this.lemmas.size() <= lems.size())
-      return false; // sub-chunk should be shorter than chunk
-
-    for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
-      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
-          poss.get(i))))
-        return false;
-    }
-    return true;
-  }
-
-  // 'this' is super - chunk of ch, ch is sub-chunk of 'this'
-  public Boolean isASubChunk(ParseTreePath ch) {
-    List<String> lems = ch.getLemmas();
-    List<String> poss = ch.POSs;
-
-    if (this.lemmas.size() < lems.size())
-      return false; // sub-chunk should be shorter than chunk
-
-    for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {
-      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
-          poss.get(i))))
-        return false;
-    }
-    return true;
-  }
-
-  public Boolean equalsTo(ParseTreePath ch) {
-    List<String> lems = ch.getLemmas();
-    List<String> poss = ch.POSs;
-    if (this.lemmas.size() != lems.size() || this.POSs.size() != poss.size())
-      return false;
-
-    for (int i = 0; i < lems.size(); i++) {
-      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(
-          poss.get(i))))
-        return false;
-    }
-
-    return true;
-  }
-
-  public String toString() {
-    String buf = " [";
-    if (mainPOS != null)
-      buf = mainPOS + " [";
-    for (int i = 0; i < lemmas.size() && i < POSs.size() // && i<=3
-    ; i++) {
-      buf += POSs.get(i) + "-" + lemmas.get(i) + " ";
-    }
-    return buf + "]";
-  }
-
-  public int compareTo(ParseTreePath o) {
-    if (this.size > o.size)
-      return -1;
-    else
-      return 1;
-
-  }
-
-  public String listToString(List<List<ParseTreePath>> chunks) {
-    StringBuffer buf = new StringBuffer();
-    if (chunks.get(0).size() > 0) {
-      buf.append(" np " + chunks.get(0).toString());
-    }
-    if (chunks.get(1).size() > 0) {
-      buf.append(" vp " + chunks.get(1).toString());
-    }
-    if (chunks.size() < 3) {
-      return buf.toString();
-    }
-    if (chunks.get(2).size() > 0) {
-      buf.append(" prp " + chunks.get(2).toString());
-    }
-    if (chunks.get(3).size() > 0) {
-      buf.append(" pp " + chunks.get(3).toString());
-    }
-    if (chunks.get(4).size() > 0) {
-      buf.append(" adjp " + chunks.get(4).toString());
-    }
-    if (chunks.get(5).size() > 0) {
-      buf.append(" whadvp " + chunks.get(5).toString());
-    }
-    /*
-     * if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp"))
-     * vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if
-     * (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp"))
-     * adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch);
-     */
-    return buf.toString();
-  }
-
-  public List<List<ParseTreePath>> obtainParseTreeChunkListByParsingList(
-      String toParse) {
-    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
-    // if (toParse.endsWith("]]]")){
-    // toParse = toParse.replace("[[","").replace("]]","");
-    // }
-    toParse = toParse.replace(" ]], [ [", "&");
-    String[] phraseTypeFragments = toParse.trim().split("&");
-    for (String toParseFragm : phraseTypeFragments) {
-      toParseFragm = toParseFragm.replace("],  [", "#");
-
-      List<ParseTreePath> resultsPhraseType = new ArrayList<ParseTreePath>();
-      String[] indivChunks = toParseFragm.trim().split("#");
-      for (String expr : indivChunks) {
-        List<String> lems = new ArrayList<String>(), poss = new ArrayList<String>();
-        expr = expr.replace("[", "").replace(" ]", "");
-        String[] pairs = expr.trim().split(" ");
-        for (String word : pairs) {
-          word = word.replace("]]", "").replace("]", "");
-          String[] pos_lem = word.split("-");
-          lems.add(pos_lem[1].trim());
-          poss.add(pos_lem[0].trim());
-        }
-        ParseTreePath ch = new ParseTreePath();
-        ch.setLemmas(lems);
-        ch.setPOSs(poss);
-        resultsPhraseType.add(ch);
-      }
-      results.add(resultsPhraseType);
-    }
-    System.out.println(results);
-    return results;
-
-    // 2.1 | Vietnam <b>embassy</b> <b>in</b> <b>Israel</b>: information on how
-    // to get your <b>visa</b> at Vietnam
-    // <b>embassy</b> <b>in</b> <b>Israel</b>. <b>...</b> <b>Spain</b>.
-    // Scotland. Sweden. Slovakia. Switzerland. T
-    // [Top of Page] <b>...</b>
-    // [[ [NN-* IN-in NP-israel ], [NP-* IN-in NP-israel ], [NP-* IN-* TO-* NN-*
-    // ], [NN-visa IN-* NN-* IN-in ]], [
-    // [VB-get NN-visa IN-* NN-* IN-in .-* ], [VBD-* IN-* NN-* NN-* .-* ], [VB-*
-    // NP-* ]]]
-
-  }
-
-  public void setMainPOS(String mainPOS) {
-    this.mainPOS = mainPOS;
-  }
-
-  public String getMainPOS() {
-    return mainPOS;
-  }
-
-  public List<String> getLemmas() {
-    return lemmas;
-  }
-
-  public void setLemmas(List<String> lemmas) {
-    this.lemmas = lemmas;
-  }
-
-  public List<String> getPOSs() {
-    return POSs;
-  }
-
-  public void setPOSs(List<String> pOSs) {
-    POSs = pOSs;
-  }
-
-  public ParseTreePathMatcher getParseTreeMatcher() {
-    return parseTreeMatcher;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java
deleted file mode 100644
index 539c61e..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.parse_thicket.matching;
-
-import java.util.Comparator;
-
-public class ParseTreePathComparable implements Comparator<ParseTreePath> {
-  public int compare(ParseTreePath ch1, ParseTreePath ch2) {
-    for (int i = 0; i < ch1.getLemmas().size() && i < ch2.getLemmas().size(); i++) {
-      if (!(ch1.getLemmas().get(i).equals(ch2.getLemmas().get(i)) && ch1
-          .getPOSs().get(i).equals(ch2.getPOSs().get(i))))
-        return -1;
-    }
-    return 0;
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java
deleted file mode 100644
index 7323a8e..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.parse_thicket.matching;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import opennlp.tools.textsimilarity.POSManager;
-
-public class ParseTreePathMatcher {
-
-  private static final int NUMBER_OF_ITERATIONS = 2;
-
-  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-  private POSManager posManager = new POSManager();
-  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
-
-  public ParseTreePathMatcher() {
-
-  }
-
-  public ParseTreePath generalizeTwoGroupedPhrasesOLD(ParseTreePath chunk1,
-      ParseTreePath chunk2) {
-    List<String> pos1 = chunk1.getPOSs();
-    List<String> pos2 = chunk1.getPOSs();
-
-    List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
-    int k1 = 0, k2 = 0;
-    Boolean incrFirst = true;
-    while (k1 < pos1.size() && k2 < pos2.size()) {
-      // first check if the same POS
-      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
-      if (sim != null) {
-        commonPOS.add(pos1.get(k1));
-        if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2
-            && chunk1.getLemmas().get(k1).equals(chunk2.getLemmas().get(k2))) {
-          commonLemmas.add(chunk1.getLemmas().get(k1));
-        } else {
-          commonLemmas.add("*");
-        }
-        k1++;
-        k2++;
-      } else if (incrFirst) {
-        k1++;
-      } else {
-        k2++;
-      }
-      incrFirst = !incrFirst;
-    }
-
-    ParseTreePath res = new ParseTreePath(commonLemmas, commonPOS, 0, 0);
-    // if (parseTreeChunkListScorer.getScore(res)> 0.6)
-    // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" + res);
-    return res;
-  }
-
-  // A for B => B have A
-  // transforms expr { A B C prep X Y }
-  // into {A B {X Y} C}
-  // should only be applied to a noun phrase
-  public ParseTreePath prepositionalNNSTransform(ParseTreePath ch) {
-    List<String> transfPOS = new ArrayList<String>(), transfLemmas = new ArrayList<String>();
-    if (!ch.getPOSs().contains("IN"))
-      return ch;
-    int indexIN = ch.getPOSs().lastIndexOf("IN");
-
-    if (indexIN < 2)// preposition is a first word - should not be in a noun
-                    // phrase
-      return ch;
-    String Word_IN = ch.getLemmas().get(indexIN);
-    if (!(Word_IN.equals("to") || Word_IN.equals("on") || Word_IN.equals("in")
-        || Word_IN.equals("of") || Word_IN.equals("with")
-        || Word_IN.equals("by") || Word_IN.equals("from")))
-      return ch;
-
-    List<String> toShiftAfterPartPOS = ch.getPOSs().subList(indexIN + 1,
-        ch.getPOSs().size());
-    List<String> toShiftAfterPartLemmas = ch.getLemmas().subList(indexIN + 1,
-        ch.getLemmas().size());
-
-    if (indexIN - 1 > 0)
-      transfPOS.addAll(ch.getPOSs().subList(0, indexIN - 1));
-    transfPOS.addAll(toShiftAfterPartPOS);
-    transfPOS.add(ch.getPOSs().get(indexIN - 1));
-
-    if (indexIN - 1 > 0)
-      transfLemmas.addAll(ch.getLemmas().subList(0, indexIN - 1));
-    transfLemmas.addAll(toShiftAfterPartLemmas);
-    transfLemmas.add(ch.getLemmas().get(indexIN - 1));
-
-    return new ParseTreePath(transfLemmas, transfPOS, 0, 0);
-  }
-
-  public ParseTreePath generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
-      ParseTreePath chunk1, ParseTreePath chunk2) {
-    ParseTreePath chRes1 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
-        chunk1, chunk2);
-    ParseTreePath chRes2 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
-        prepositionalNNSTransform(chunk1), chunk2);
-    ParseTreePath chRes3 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(
-        prepositionalNNSTransform(chunk2), chunk1);
-
-    ParseTreePath chRes = null;
-    if (parseTreeChunkListScorer.getScore(chRes1) > parseTreeChunkListScorer
-        .getScore(chRes2))
-      if (parseTreeChunkListScorer.getScore(chRes1) > parseTreeChunkListScorer
-          .getScore(chRes3))
-        chRes = chRes1;
-      else
-        chRes = chRes3;
-    else if (parseTreeChunkListScorer.getScore(chRes2) > parseTreeChunkListScorer
-        .getScore(chRes3))
-      chRes = chRes2;
-    else
-      chRes = chRes3;
-
-    return chRes;
-  }
-
-  public ParseTreePath generalizeTwoGroupedPhrasesRandomSelectHighestScore(
-      ParseTreePath chunk1, ParseTreePath chunk2) {
-    List<String> pos1 = chunk1.getPOSs();
-    List<String> pos2 = chunk2.getPOSs();
-    // Map <ParseTreeChunk, Double> scoredResults = new HashMap <ParseTreeChunk,
-    // Double> ();
-    int timesRepetitiveRun = NUMBER_OF_ITERATIONS;
-
-    Double globalScore = -1.0;
-    ParseTreePath result = null;
-
-    for (int timesRun = 0; timesRun < timesRepetitiveRun; timesRun++) {
-      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
-      int k1 = 0, k2 = 0;
-      Double score = 0.0;
-      while (k1 < pos1.size() && k2 < pos2.size()) {
-        // first check if the same POS
-        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
-        String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1
-            .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);
-        // if (LemmaFormManager.acceptableLemmaAndPOS(sim, lemmaMatch)){
-        if ((sim != null)
-            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
-                .equals("fail")))) {
-          // if (sim!=null){ // && (lemmaMatch!=null &&
-          // !lemmaMatch.equals("fail"))){
-          commonPOS.add(pos1.get(k1));
-          if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2
-              && lemmaMatch != null) {
-            commonLemmas.add(lemmaMatch);
-
-          } else {
-            commonLemmas.add("*");
-
-          }
-          k1++;
-          k2++;
-        } else if (Math.random() > 0.5) {
-          k1++;
-        } else {
-          k2++;
-        }
-
-      }
-      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,
-          0, 0);
-      score = parseTreeChunkListScorer.getScore(currResult);
-      if (score > globalScore) {
-        // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" +
-        // result+" score = "+ score +"\n\n");
-        result = currResult;
-        globalScore = score;
-      }
-    }
-
-    for (int timesRun = 0; timesRun < timesRepetitiveRun; timesRun++) {
-      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
-      int k1 = pos1.size() - 1, k2 = pos2.size() - 1;
-      Double score = 0.0;
-      while (k1 >= 0 && k2 >= 0) {
-        // first check if the same POS
-        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
-        String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1
-            .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);
-        // if (acceptableLemmaAndPOS(sim, lemmaMatch)){
-        if ((sim != null)
-            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
-                .equals("fail")))) {
-          commonPOS.add(pos1.get(k1));
-          if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2
-              && lemmaMatch != null) {
-            commonLemmas.add(lemmaMatch);
-          } else {
-            commonLemmas.add("*");
-
-          }
-          k1--;
-          k2--;
-        } else if (Math.random() > 0.5) {
-          k1--;
-        } else {
-          k2--;
-        }
-
-      }
-      Collections.reverse(commonLemmas);
-      Collections.reverse(commonPOS);
-
-      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,
-          0, 0);
-      score = parseTreeChunkListScorer.getScore(currResult);
-      if (score > globalScore) {
-        // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" +
-        // currResult+" score = "+ score +"\n\n");
-        result = currResult;
-        globalScore = score;
-      }
-    }
-
-    // // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" + result
-    // +" score = " +
-    // // parseTreeChunkListScorer.getScore(result)+"\n\n");
-    return result;
-  }
-
-  public Boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {
-    if (sim == null) {
-      return false;
-    }
-
-    if (lemmaMatch != null && !lemmaMatch.equals("fail")) {
-      return false;
-    }
-    // even if lemmaMatch==null
-    return true;
-    // if (sim!=null && (lemmaMatch!=null && !lemmaMatch.equals("fail"))){
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
deleted file mode 100644
index fc32380..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.parse_thicket.matching;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.stemmer.PorterStemmer;
-import opennlp.tools.textsimilarity.POSManager;
-
-
-public class ParseTreePathMatcherDeterministic {
-
-  private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();
-
-  private LemmaFormManager lemmaFormManager = new LemmaFormManager();
-
-  private POSManager posManager = new POSManager();
-
-  /**
-   * key matching function which takes two phrases, aligns them and finds a set
-   * of maximum common sub-phrase
-   * 
-   * @param chunk1
-   * @param chunk2
-   * @return
-   */
-
-  public List<ParseTreePath> generalizeTwoGroupedPhrasesDeterministic(
-      ParseTreePath chunk1, ParseTreePath chunk2) {
-    List<String> pos1 = chunk1.getPOSs();
-    List<String> pos2 = chunk2.getPOSs();
-    List<String> lem1 = chunk1.getLemmas();
-    List<String> lem2 = chunk2.getLemmas();
-
-    List<String> lem1stem = new ArrayList<String>();
-    List<String> lem2stem = new ArrayList<String>();
-
-    PorterStemmer ps = new PorterStemmer();
-    for (String word : lem1) {
-      try {
-        lem1stem.add(ps.stem(word.toLowerCase()).toString());
-      } catch (Exception e) {
-        // e.printStackTrace();
-
-        if (word.length() > 2)
-          System.err.println("Unable to stem: " + word);
-      }
-    }
-    try {
-      for (String word : lem2) {
-        lem2stem.add(ps.stem(word.toLowerCase()).toString());
-      }
-    } catch (Exception e) {
-      System.err.println("problem processing word " + lem2.toString());
-    }
-
-    List<String> overlap = new ArrayList(lem1stem);
-    overlap.retainAll(lem2stem);
-
-    if (overlap == null || overlap.size() < 1)
-      return null;
-
-    List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
-    for (String word : overlap) {
-      Integer i1 = lem1stem.indexOf(word);
-      Integer i2 = lem2stem.indexOf(word);
-      occur1.add(i1);
-      occur2.add(i2);
-    }
-
-    // now we search for plausible sublists of overlaps
-    // if at some position correspondence is inverse (one of two position
-    // decreases instead of increases)
-    // then we terminate current alignment accum and start a new one
-    List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
-    // starts from 1, not 0
-    List<int[]> accum = new ArrayList<int[]>();
-    accum.add(new int[] { occur1.get(0), occur2.get(0) });
-    for (int i = 1; i < occur1.size(); i++) {
-
-      if (occur1.get(i) > occur1.get(i - 1)
-          && occur2.get(i) > occur2.get(i - 1))
-        accum.add(new int[] { occur1.get(i), occur2.get(i) });
-      else {
-        overlapsPlaus.add(accum);
-        accum = new ArrayList<int[]>();
-        accum.add(new int[] { occur1.get(i), occur2.get(i) });
-      }
-    }
-    if (accum.size() > 0) {
-      overlapsPlaus.add(accum);
-    }
-
-    List<ParseTreePath> results = new ArrayList<ParseTreePath>();
-    for (List<int[]> occur : overlapsPlaus) {
-      List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
-      for (int[] column : occur) {
-        occr1.add(column[0]);
-        occr2.add(column[1]);
-      }
-
-      int ov1 = 0, ov2 = 0; // iterators over common words;
-      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
-      // we start two words before first word
-      int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
-      // if (k1<0) k1=0; if (k2<0) k2=0;
-      Boolean bReachedCommonWord = false;
-      while (k1 < 0 || k2 < 0) {
-        k1++;
-        k2++;
-      }
-      int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
-      while (k1 <= k1max && k2 <= k2max) {
-        // first check if the same POS
-        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
-        String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
-            lem2.get(k2), sim);
-        if ((sim != null)
-            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
-                .equals("fail")))) {
-          commonPOS.add(pos1.get(k1));
-          if (lemmaMatch != null) {
-            commonLemmas.add(lemmaMatch);
-            // System.out.println("Added "+lemmaMatch);
-            if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
-              bReachedCommonWord = true; // now we can have different increment
-                                         // opera
-            else {
-              if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
-                  && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
-                ov1++;
-                ov2++;
-                bReachedCommonWord = true;
-              }
-              // else
-              // System.err.println("Next match reached '"+lemmaMatch+
-              // "' | k1 - k2: "+k1 + " "+k2 +
-              // "| occur index ov1-ov2 "+
-              // ov1+" "+ov2+
-              // "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "
-              // +
-              // occr1.get(ov1) + " "+ occr2.get(ov1));
-            }
-          } else {
-            commonLemmas.add("*");
-          } // the same parts of speech, proceed to the next word in both
-            // expressions
-          k1++;
-          k2++;
-
-        } else if (!bReachedCommonWord) {
-          k1++;
-          k2++;
-        } // still searching
-        else {
-          // different parts of speech, jump to the next identified common word
-          ov1++;
-          ov2++;
-          if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
-            break;
-          // now trying to find
-          int kk1 = occr1.get(ov1) - 2, // new positions of iterators
-          kk2 = occr2.get(ov2) - 2;
-          int countMove = 0;
-          while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is
-                                                                    // behind
-                                                                    // current
-                                                                    // position,
-                                                                    // synchroneously
-                                                                    // move
-                                                                    // towards
-                                                                    // right
-            kk1++;
-            kk2++;
-            countMove++;
-          }
-          k1 = kk1;
-          k2 = kk2;
-
-          if (k1 > k1max)
-            k1 = k1max;
-          if (k2 > k2max)
-            k2 = k2max;
-          bReachedCommonWord = false;
-        }
-      }
-      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,
-          0, 0);
-      results.add(currResult);
-    }
-
-    return results;
-  }
-
-  /**
-   * main function to generalize two expressions grouped by phrase types returns
-   * a list of generalizations for each phrase type with filtered
-   * sub-expressions
-   * 
-   * @param sent1
-   * @param sent2
-   * @return List<List<ParseTreeChunk>> list of list of POS-words pairs for each
-   *         resultant matched / overlapped phrase
-   */
-  public List<List<ParseTreePath>> matchTwoSentencesGroupedChunksDeterministic(
-      List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {
-    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();
-    // first iterate through component
-    for (int comp = 0; comp < 2 && // just np & vp
-        comp < sent1.size() && comp < sent2.size(); comp++) {
-      List<ParseTreePath> resultComps = new ArrayList<ParseTreePath>();
-      // then iterate through each phrase in each component
-      for (ParseTreePath ch1 : sent1.get(comp)) {
-        for (ParseTreePath ch2 : sent2.get(comp)) { // simpler version
-          List<ParseTreePath> chunkToAdd = generalizeTwoGroupedPhrasesDeterministic(
-              ch1, ch2);
-
-          if (chunkToAdd == null)
-            chunkToAdd = new ArrayList<ParseTreePath>();
-          // System.out.println("ch1 = "+
-          // ch1.toString()+" | ch2="+ch2.toString()
-          // +"\n result = "+chunkToAdd.toString() + "\n");
-          /*
-           * List<ParseTreeChunk> chunkToAdd1 =
-           * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic
-           * ( ParseTreeMatcher.prepositionalNNSTransform(ch1), ch2); if
-           * (chunkToAdd1!=null) chunkToAdd.addAll(chunkToAdd1);
-           * List<ParseTreeChunk> chunkToAdd2 =
-           * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic
-           * ( ParseTreeMatcher.prepositionalNNSTransform(ch2), ch1); if
-           * (chunkToAdd2!=null) chunkToAdd.addAll(chunkToAdd2);
-           */
-
-          // For generalized match not with orig sentences but with templates
-          // if (!LemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd))
-          // continue; // if the words which have to stay do not stay, proceed
-          // to other elements
-          Boolean alreadyThere = false;
-          for (ParseTreePath chunk : resultComps) {
-            if (chunkToAdd.contains(chunk)) {
-              alreadyThere = true;
-              break;
-            }
-
-            // }
-          }
-
-          if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) {
-            resultComps.addAll(chunkToAdd);
-          }
-
-        }
-      }
-      List<ParseTreePath> resultCompsRed = generalizationListReducer
-          .applyFilteringBySubsumption(resultComps);
-
-      resultComps = resultCompsRed;
-      results.add(resultComps);
-    }
-
-    return results;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
index fb97716..6b72e47 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.parse_thicket.parse_thicket2graph;
 
 import java.util.ArrayList;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
index bad6403..d19d7db 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.parse_thicket.parse_thicket2graph;
 
 import java.io.PrintWriter;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
index 9620499..6f9c3ea 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.parse_thicket.parse_thicket2graph;
 
 import java.util.List;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
index d34d974..71c1fa3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
@@ -1,27 +1,20 @@
-/* ==========================================
- * JGraphT : a free Java graph-theory library
- * ==========================================
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
  *
- * Project Info:  http://jgrapht.sourceforge.net/
- * Project Creator:  Barak Naveh (http://sourceforge.net/users/barak_naveh)
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * (C) Copyright 2003-2008, by Barak Naveh and Contributors.
- *
- * This library is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- * License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software Foundation,
- * Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
+ 
 /* ----------------------
  * JGraphAdapterDemo.java
  * ----------------------

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
index ecba4b5..d7f3e75 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
@@ -1,45 +1,84 @@
-package opennlp.tools.parse_thicket.pattern_structure;
-
-import java.util.*;
-import java.io.*;
-
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-
-public class PhraseConcept {
-	int position;
-	//Set<Integer> intent;
-	List<List<ParseTreeChunk>> intent;
-	Set<Integer> parents;
-	public PhraseConcept() {
-		position = -1;
-		intent = new ArrayList<List<ParseTreeChunk>>();
-		parents = new HashSet<Integer>();
-	}
-	public void setPosition( int newPosition ){
-	       position = newPosition;
-	}
-	public void setIntent( List<List<ParseTreeChunk>> newIntent ){
-	       intent.clear();
-	       intent.addAll(newIntent);
-	}
-	public void setParents( Set<Integer> newParents ){
-	       //parents = newParents;
-		parents.clear();
-		parents.addAll(newParents);
-	}
-	public void printConcept() {
-		System.out.println("Concept position:" + position);
-		System.out.println("Concept intent:" + intent);
-		System.out.println("Concept parents:" + parents);
-	}
-	 public static void main(String []args) {
-		 PhraseConcept c = new PhraseConcept();
-		 c.printConcept();
-		 c.setPosition(10);
-		 c.printConcept();
-		 //List<List<ParseTreeChunk>> test = new List<List<ParseTreeChunk>>();
-		 //c.setIntent(test);
-		 c.printConcept();
-
-	 }
-}
\ No newline at end of file
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.fca.FormalConcept;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+/** Concept holding an intent (lists of parse tree chunks) plus integer sets for extent, parents and childs. */
+public class PhraseConcept {
+	int position;
+	public List<List<ParseTreeChunk>> intent;
+	Set<Integer> parents;
+	Set<Integer> childs;
+	Set<Integer> extent;
+	// Stability values start at 0 and are only printed by this class; nothing in this class computes them.
+	double intLogStabilityBottom = 0;
+	double intLogStabilityUp = 0;
+	/** Creates a concept at position -1 with empty intent, parents, extent and childs. */
+	public PhraseConcept() {
+		position = -1;
+		intent = new ArrayList<List<ParseTreeChunk>>();
+		parents = new HashSet<Integer>();
+		extent = new HashSet<Integer>();
+		childs = new HashSet<Integer>();
+	}
+	/** Sets the position of this concept. */
+	public void setPosition( int newPosition ){
+		position = newPosition;
+	}
+	/** Replaces the current intent with the elements of newIntent (the list itself is not retained). */
+	public void setIntent( List<List<ParseTreeChunk>> newIntent ){
+		intent.clear();
+		intent.addAll(newIntent);
+	}
+	/** Replaces the current parents with the elements of newParents (copied, so the caller's set is not aliased). */
+	public void setParents( Set<Integer> newParents ){
+		parents.clear();
+		parents.addAll(newParents);
+	}
+	/** Prints position, intent and parents to standard output. */
+	public void printConcept() {
+		System.out.println("Concept position:" + position);
+		System.out.println("Concept intent:" + intent);
+		System.out.println("Concept parents:" + parents);
+	}
+	/** Prints position, intent, extent, parents, childs and both stability values to standard output. */
+	public void printConceptExtended() {
+		System.out.println("Concept position:" + position);
+		System.out.println("Concept intent:" + intent);
+		System.out.println("Concept extent:" + extent);
+		System.out.println("Concept parents:" + parents);
+		System.out.println("Concept children:" + childs);
+		System.out.println("log stab: ["+ intLogStabilityBottom + "; "+intLogStabilityUp+"]");
+	}
+	/** Adds every element of ext to the extent; existing extent elements are kept. */
+	public void addExtents(LinkedHashSet<Integer> ext){
+		extent.addAll(ext);
+	}
+	/** Demo: prints a fresh concept, sets its position to 10, then prints it two more times. */
+	public static void main(String []args) {
+		PhraseConcept c = new PhraseConcept();
+		c.printConcept();
+		c.setPosition(10);
+		c.printConcept();
+		c.printConcept();
+	}
+}