Posted to commits@opennlp.apache.org by bg...@apache.org on 2016/11/16 09:11:44 UTC

[48/51] [partial] opennlp-sandbox git commit: merge from bgalitsky's own git repo

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
new file mode 100644
index 0000000..cea7187
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.jsmlearning;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.collections.ListUtils;
+
+import edu.stanford.nlp.util.StringUtils;
+import opennlp.tools.parse_thicket.pattern_structure.LinguisticPatternStructure;
+
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class JSMLearnerOnLatticeWithDeduction extends JSMLearnerOnLatticeBase{
+	List<JSMDecision> accumulatedJSMResults = new ArrayList<JSMDecision>();
+
+
+
+	public JSMDecision buildLearningModel(List<String> posTexts, List<String> negTexts, 
+			String unknown, String[] separationKeywords){
+		psPos = new LinguisticPatternStructure(0,0); psNeg = new LinguisticPatternStructure(0,0);
+		if (separationKeywords!=null){ // re-sort by occurrence of separation keyword
+			Pair<List<String>, List<String>> pair = reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords );
+			posTexts = pair.getFirst(); negTexts = 	pair.getSecond();
+		}
+
+		List<List<List<ParseTreeChunk>>> lingRepsPos = new ArrayList<List<List<ParseTreeChunk>>>(),
+				lingRepsNeg = new ArrayList<List<List<ParseTreeChunk>>>();
+		for(String text: posTexts)
+			lingRepsPos.add(chunk_maker.formGroupedPhrasesFromChunksForPara(text));
+
+		for(String text: negTexts)
+			lingRepsNeg.add(chunk_maker.formGroupedPhrasesFromChunksForPara(text));
+
+		LinkedHashSet<Integer> obj = null;
+		int i=0;
+		for(List<List<ParseTreeChunk>> chunk: lingRepsPos){
+			obj = new LinkedHashSet<Integer>();
+			obj.add(i);
+			psPos.AddIntent(chunk, obj, 0);
+			i++;
+		}
+		i=0;
+		for(List<List<ParseTreeChunk>> chunk: lingRepsNeg){
+			obj = new LinkedHashSet<Integer>();
+			obj.add(i);
+			psNeg.AddIntent(chunk, obj, 0);
+			i++;
+		}
+
+		List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown);
+		List<List<List<ParseTreeChunk>>> posIntersections = new ArrayList<List<List<ParseTreeChunk>>>(), 
+				negIntersections = new ArrayList<List<List<ParseTreeChunk>>>();
+		List<List<ParseTreeChunk>> intersection = null;
+		for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){
+			if (psPos.conceptList.get(iConcept).intent!=null && psPos.conceptList.get(iConcept).intent.size()>0){
+				intersection =  computeIntersectionWithIntentExtendedByDeduction(psPos, iConcept, chunksUnknown);
+				if (reduceList(intersection).size()>0)
+					posIntersections.add(reduceList(intersection));
+			}
+		}
+		for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){
+			if (psNeg.conceptList.get(iConcept).intent!=null && psNeg.conceptList.get(iConcept).intent.size()>0){				
+				intersection = computeIntersectionWithIntentExtendedByDeduction(psNeg, iConcept, chunksUnknown);
+				if (reduceList(intersection).size()>0)
+					negIntersections.add(reduceList(intersection));
+			}
+		}
+
+		Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair = 
+				removeInconsistenciesFromPosNegIntersections( posIntersections, 
+						negIntersections);
+
+		posIntersections = pair.getFirst();
+		negIntersections = pair.getSecond();
+
+		List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = new ArrayList<List<List<ParseTreeChunk>>>(), 
+				negIntersectionsUnderPos = new ArrayList<List<List<ParseTreeChunk>>>();
+
+		for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){
+			for(int iConceptJ = 0; iConceptJ<negIntersections.size(); iConceptJ++){
+				intersection = md
+						.matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, negIntersections.get(iConceptJ));
+				if (reduceList(intersection).size()>0)
+					posIntersectionsUnderNeg.add(reduceList(intersection));
+			}
+		}
+
+		for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){
+			for(int iConceptJ = 0; iConceptJ<posIntersections.size(); iConceptJ++){
+				intersection = md
+						.matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, posIntersections.get(iConceptJ));
+				if (reduceList(intersection).size()>0)
+					negIntersectionsUnderPos.add(reduceList(intersection));
+			}
+		}
+
+		List<ParseTreeChunk>posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg);
+		List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
+
+		// compute both set differences from the original lists: the second subtraction
+		// must not observe the result of the first
+		List<ParseTreeChunk> posMinusNeg = subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst);
+		negIntersectionsUnderPosLst = subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst);
+		posIntersectionsUnderNegLst = posMinusNeg;
+
+		System.out.println("Pos - neg inters = "+posIntersectionsUnderNegLst);
+		System.out.println("Neg - pos inters = "+negIntersectionsUnderPosLst);
+
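+		// JSM-style vote: classify the unknown text as positive iff more positive-side
+		// hypotheses than negative-side hypotheses survived the subtraction step above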
+		boolean bPositiveClass = posIntersectionsUnderNegLst.size() > negIntersectionsUnderPosLst.size(); // equivalent to the ratio test, but safe when the negative list is empty
+
+		JSMDecision decision = new JSMDecision("keywordClassName", bPositiveClass, 
+				posIntersections , negIntersections, 
+				posIntersectionsUnderNeg,
+				negIntersectionsUnderPos, separationKeywords);
+
+		accumulatedJSMResults.add(decision);
+
+		return decision;
+
+	}
+
+	private List<List<ParseTreeChunk>> computeIntersectionWithIntentExtendedByDeduction(
+			LinguisticPatternStructure psPos, int iConcept,
+			List<List<ParseTreeChunk>> chunksUnknown) {
+
+		List<List<ParseTreeChunk>> intent = psPos.conceptList.get(iConcept).intent,
+				intentExtendedByDeduction = new ArrayList<List<ParseTreeChunk>>();
+
+		for(List<ParseTreeChunk> group: intent){
+			List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>();
+			for(ParseTreeChunk ch: group){
+				newGroup.add(ch);
+				List<String> lemmas = ch.getLemmas();
+				List<List<List<ParseTreeChunk>>> clausesBodiesToAdd = findClausesForListOfLemmas(lemmas);
+				if (clausesBodiesToAdd!=null && clausesBodiesToAdd.size()>0)
+					intentExtendedByDeduction.add(flattenParseTreeChunkLst(clausesBodiesToAdd));
+			}
+			intentExtendedByDeduction.add(newGroup);
+		}
+		return md
+				.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction, chunksUnknown);
+	}
+    
+	// For the list of words in a phrase, identify whether it includes a separation word/multiword
+	// and, if so, return the respective clause body (to be added to the existing intent by the caller)
+	private List<List<List<ParseTreeChunk>>> findClausesForListOfLemmas(
+			List<String> lemmas) {
+		for(JSMDecision dec: accumulatedJSMResults){
+			String[] sepKeywords = dec.getSeparationKeywords();
+			// if all separation keywords occur in this phrase
+			if (ListUtils.intersection(lemmas, Arrays.asList(sepKeywords)).size() == sepKeywords.length){
+				return dec.getPosHypotheses();
+			}
+		}
+		return null;
+	}
+
+	public Pair<List<String>, List<String>> reGroupByOccurrenceOfSeparationKeyword(List<String> posTexts, List<String> negTexts, String[] keywords){
+		List<String> posTextsNew = new ArrayList<String>(), negTextsNew = new ArrayList<String>();
+		for(String posText: posTexts){
+			boolean multiwordOccurs = true;
+			for(String keyword: keywords){
+				if (!StringUtils.find(posText, keyword)){
+					multiwordOccurs = false;
+					break; // one missing keyword is enough to reject
+				}
+			}
+			if (multiwordOccurs)
+				posTextsNew.add(posText);
+			else
+				negTextsNew.add(posText);
+		}
+		for(String negText: negTexts){
+			boolean multiwordOccurs = true;
+			for(String keyword: keywords){
+				if (!StringUtils.find(negText, keyword)){
+					multiwordOccurs = false;
+					break;
+				}
+			}
+			if (multiwordOccurs)
+				posTextsNew.add(negText);
+			else
+				negTextsNew.add(negText);
+		}
+
+		return new Pair<List<String>, List<String>>(posTextsNew, negTextsNew);
+	}
+
+	
+
+	public static void main (String[] args) {
+
+		String[] posArr = new String[] {"I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. ",
+				"To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. ",
+				"To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. ",
+		"I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense."};
+
+		String[] negArr = new String[] {"I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. ",
+				"I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. ",
+				"I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. ",			
+		"I showed  my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income.  "};	
+
+		String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. I subtract my tax from my income";
+		JSMLearnerOnLatticeWithDeduction jsm = new JSMLearnerOnLatticeWithDeduction();
+		JSMDecision dec1 =  // may be determined by 'subtract'
+				jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"subtract"});
+		JSMDecision dec2 = // may be determined by ...
+				jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"business"});
+		JSMDecision dec3 = // may be determined by ...
+				jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"property"});
+		// Finally, do prediction
+		JSMDecision dec = // may be determined by ...
+				jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"property"});
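+		// minimal inspection of the final decision (relies only on toString()):
+		System.out.println(dec);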
+	}
+}
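
For reference, a minimal sketch of how the separation-keyword regrouping above behaves; the method and the Pair accessors are taken from this diff, while the texts are made up for illustration:

    JSMLearnerOnLatticeWithDeduction jsm = new JSMLearnerOnLatticeWithDeduction();
    Pair<List<String>, List<String>> regrouped = jsm.reGroupByOccurrenceOfSeparationKeyword(
            Arrays.asList("we subtract expenses", "we add income"),   // posTexts
            Arrays.asList("subtract rental costs"),                   // negTexts
            new String[]{"subtract"});
    // regrouped.getFirst()  -> texts containing every keyword: [we subtract expenses, subtract rental costs]
    // regrouped.getSecond() -> the remaining texts:            [we add income]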

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefBuilderWithNER.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefBuilderWithNER.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefBuilderWithNER.java
new file mode 100644
index 0000000..3841592
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefBuilderWithNER.java
@@ -0,0 +1,156 @@
+package opennlp.tools.parse_thicket;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.stanford.nlp.dcoref.CorefChain;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
+import edu.stanford.nlp.ie.AbstractSequenceClassifier;
+import edu.stanford.nlp.ie.crf.CRFClassifier;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeCoreAnnotations;
+import edu.stanford.nlp.util.CoreMap;
+import opennlp.tools.parse_thicket.ArcType;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+
+public class ParseCorefBuilderWithNER extends ParseCorefsBuilder {
+
+	private static ParseCorefBuilderWithNER instanceNER;
+
+	public synchronized static ParseCorefBuilderWithNER  getInstance() {
+		if (instanceNER == null)
+			instanceNER = new ParseCorefBuilderWithNER ();
+
+		return instanceNER;
+	}
+
+
+	AbstractSequenceClassifier<CoreLabel> classifier = null;
+
+	ParseCorefBuilderWithNER() {
+		super();
+		classifier = CRFClassifier.getDefaultClassifier();
+	}
+
+	public ParseThicket buildParseThicket(String text){
+		List<Tree> ptTrees = new ArrayList<Tree>();
+		// all numbering from 1, not 0
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
+		List<Float> sentimentProfile = new ArrayList<Float>();
+
+		annotation = new Annotation(text);
+		try {
+			pipeline.annotate(annotation);
+			List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
+			List<List<CoreLabel>> nerClassesText = classifier.classify(text);
+			
+
+			int nSent = 0;
+			if (sentences != null && sentences.size() > 0) 
+				for(CoreMap sentence: sentences){
+					List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
+					if (nSent>=nerClassesText.size())
+						break;
+					List<CoreLabel> nerClassesSent = nerClassesText.get(nSent);
+
+					// traversing the words in the current sentence
+					// a CoreLabel is a CoreMap with additional token-specific methods
+					Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
+					List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
+					int count=1;
+					for (CoreLabel token: coreLabelList ) {
+						if (count-1>=nerClassesSent.size())
+							break;
+						CoreLabel classNerWord = nerClassesSent.get(count-1);
+						// this is the text of the token
+						String lemma = token.get(TextAnnotation.class);
+						// this is the POS tag of the token
+						String pos = token.get(PartOfSpeechAnnotation.class);
+						// this is the NER label of the token
+						String ne = token.get(NamedEntityTagAnnotation.class);     
+
+
+						ParseTreeNode p = new ParseTreeNode(lemma, pos, ne, count);
+						String ner = classNerWord.get(CoreAnnotations.AnswerAnnotation.class);
+						if (!ner.equals("O")){
+							Map<String, Object> nerMap = new HashMap<String, Object>();
+							nerMap.put("ner", ner);
+							p.setAttributes(nerMap);
+						}
+						nodes.add(p);
+						count++;
+					}	
+					nSent++;
+					nodesThicket.add(nodes);
+					Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
+					// now sentiment for given sentence
+					Tree sentimentTree = sentence.get(SentimentAnnotatedTree.class);
+					float sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
+					sentimentProfile.add(sentiment);
+					ptTrees.add(tree);
+				}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+
+
+		// now coreferences
+		Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
+		List<CorefChain> chains = new ArrayList<CorefChain>(corefs.values());
+		for(CorefChain c: chains){
+			//System.out.println(c);
+			List<CorefMention> mentions = c.getMentionsInTextualOrder();
+			//System.out.println(mentions);
+			if (mentions.size()>1)
+				for(int i=0; i<mentions.size(); i++){
+					for(int j=i+1; j<mentions.size(); j++){
+						CorefMention mi = mentions.get(i), mj=mentions.get(j);
+
+
+						int niSentence = mi.position.get(0);
+						int niWord = mi.startIndex;
+						int njSentence = mj.position.get(0);
+						int njWord = mj.startIndex;
+
+						ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);
+
+						WordWordInterSentenceRelationArc arc = 
+								new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord), 
+										new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan, 
+										arcType);
+						arcs.add(arc);
+					}
+				}
+		}
+		List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
+		arcs.addAll(arcsCA);
+
+		ParseThicket result = new ParseThicket(ptTrees, arcs);
+		result.setSentimentProfile(sentimentProfile);
+		result.setNodesThicket(nodesThicket);
+		return result;
+	}
+
+	public static void main(String[] args){
+		new ParseCorefBuilderWithNER ().buildParseThicket("No one knows yet what General Prayuth's real intentions are. He has good reason to worry about resistance. "
+				+ "The pro-government Red-Shirt movement is far better organised than eight years ago, and could still be financed by former Prime Minister Thaksin Shinawatra's deep pockets.");
+	}
+
+}
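
A hedged usage sketch for the class above; note that getSentimentProfile() is an assumption (only the setter setSentimentProfile appears in this diff), while getArcs() is used elsewhere in this commit:

    ParseCorefBuilderWithNER builder = ParseCorefBuilderWithNER.getInstance();
    ParseThicket pt = builder.buildParseThicket(
            "No one knows yet what General Prayuth's real intentions are. He has good reason to worry about resistance.");
    // per-sentence sentiment classes filled in by buildParseThicket above
    System.out.println(pt.getSentimentProfile()); // assumed getter
    System.out.println(pt.getArcs());             // coreference + communicative-action arcs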

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/VerbNetProcessor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/VerbNetProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/VerbNetProcessor.java
new file mode 100644
index 0000000..166d4ff
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/VerbNetProcessor.java
@@ -0,0 +1,267 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import edu.mit.jverbnet.data.Frame;
+import edu.mit.jverbnet.data.Frame.FrameBuilder;
+import edu.mit.jverbnet.data.FrameType;
+import edu.mit.jverbnet.data.IFrame;
+import edu.mit.jverbnet.data.IMember;
+import edu.mit.jverbnet.data.IThematicRole;
+import edu.mit.jverbnet.data.IVerbClass;
+import edu.mit.jverbnet.data.IWordnetKey;
+import edu.mit.jverbnet.data.VerbClass;
+import edu.mit.jverbnet.index.IVerbIndex;
+import edu.mit.jverbnet.index.VerbIndex;
+
+public class VerbNetProcessor implements IGeneralizer<Map<String, List<String>>> {
+
+	static VerbNetProcessor instance;
+	static private String pathToVerbnet = null; //new File( "." ).getCanonicalPath()+"/src/test/resources";
+	public static VerbNetProcessor getInstance(String resourceDir) {
+		if (resourceDir==null)
+			try {
+				resourceDir = new File( "." ).getCanonicalPath()+"/src/test/resources";
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+		pathToVerbnet = resourceDir + "/new_vn";
+		if (instance == null)
+			instance = new VerbNetProcessor();
+
+		return instance;
+	}	
+
+	IVerbIndex index = null;
+
+	private VerbNetProcessor() {
+		try {
+			URL url = new URL("file", null, pathToVerbnet);
+			index = new VerbIndex(url);
+			index.open();
+		} catch (MalformedURLException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+
+	public IVerbClass getVerbNetForAVerb_____New(String verb){
+		Iterator<IVerbClass> iter = index.iterator();
+		while(iter.hasNext()){
+			IVerbClass v = iter.next();
+
+			if (v.getID().startsWith(verb))
+				return v;
+		}
+		iter = index.iterator();
+		while(iter.hasNext()){
+			IVerbClass v = iter.next();
+			if (!v.getMembers().isEmpty()){
+				for(IMember m: v.getMembers()) {
+					if (m.getName().equals(verb)){
+						return v;
+					}
+				}
+			}
+		}
+		return null;
+	}
+
+
+
+	public IVerbClass getVerbNetForAVerb(String verb){
+		Iterator<IVerbClass> iter = index.iterator();
+		while(iter.hasNext()){
+			IVerbClass v = iter.next();
+
+			if (v.getID().startsWith(verb))
+				return v;
+
+			if (!v.getMembers().isEmpty() && v.getMembers().get(0).getName().equals(verb)){
+				return v;
+			}
+		}
+		return null;
+	}
+
+	public List<Map<String, List<String>>> generalize(Object o1, Object o2) {
+		IVerbClass v1, v2;
+		if ((o1 instanceof String) && (o2 instanceof String)){
+			v1 = getVerbNetForAVerb((String) o1);
+			v2 = getVerbNetForAVerb((String) o2);
+			return generalize(v1, v2);
+		} else {
+			v1 = (IVerbClass) o1;
+			v2 = (IVerbClass) o2;
+		}
+		List<Map<String, List<String>>> resList = new ArrayList<Map<String, List<String>>>();
+
+		if (v1 == null || v2 == null) // not found
+			return resList;
+
+		// lists for results
+		List<String> roles = new ArrayList<String>();
+
+		List<IThematicRole> roles1 = v1.getThematicRoles(), roles2 = v2.getThematicRoles();
+		Map<String, List<String>> results = new HashMap<String, List<String>>();
+
+		for(int i=0; i< roles1.size()&& i< roles2.size(); i++){
+			if (roles1.get(i).getType().equals(roles2.get(i).getType())){
+				roles.add(roles1.get(i).getType().toString());
+			} else 
+				roles.add("*");
+		}
+
+		List<IFrame> frames1 = v1.getFrames(), frames2 = v2.getFrames();
+		List<String> patterns1 = new ArrayList<String>(), patterns2 = new ArrayList<String>();
+		for(int i=0; i< frames1.size(); i++){
+			patterns1.add(frames1.get(i).getPrimaryType().getID());
+		}
+		for(int i=0; i< frames2.size(); i++){
+			patterns2.add(frames2.get(i).getPrimaryType().getID());
+		}
+		patterns2.retainAll(patterns1);
+		results.put("phrStr", patterns2);
+
+		List<String> patternsWord1 = new ArrayList<String>(), patternsWord2 = new ArrayList<String>();
+		for(int i=0; i< frames1.size(); i++){
+			try {
+				if (frames1.get(i).getSecondaryType()!=null && frames1.get(i).getSecondaryType().getID()!=null)
+					patternsWord1.add(frames1.get(i).getSecondaryType().getID());
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		}
+		for(int i=0; i< frames2.size(); i++){
+			try {
+				if (frames2.get(i).getSecondaryType()!=null && frames2.get(i).getSecondaryType().getID()!=null)
+					patternsWord2.add(frames2.get(i).getSecondaryType().getID());
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		}
+		patternsWord2.retainAll(patternsWord1);
+		results.put("phrDescr", patternsWord2);
+
+		results.put("roles", roles);
+
+		resList.add(results);
+		return resList;
+	}
+
+	// takes a verb and forms its verbnet parameters 
+	// abandon  (leave-51.2 leave-51.2 ) (NP V NP.source ) (Transitivebasically, locative preposition drop of "from" ) (
+	public StringBuilder buildTreeRepresentationForTreeKernelLearning(String verb){
+		StringBuilder sb = new StringBuilder(1000);
+		IVerbClass v;
+		v = getVerbNetForAVerb(verb);
+		if (v==null) // for some reason this verb is not in the vocabulary
+			return null;
+		sb.append(verb + "  (" );
+		List<IThematicRole> roles = v.getThematicRoles();
+
+		for(int i=0; i< roles.size(); i++){
+			sb.append(roles.get(i).getVerbClass().getID().replace(".", "")+" ");
+		}
+		sb.append( ") (" );
+
+		List<IFrame> frames = v.getFrames();
+		for(int i=0; i< frames.size(); i++){
+			sb.append(frames.get(i).getPrimaryType().getID().replace(".", "-")+" ");
+		}
+		sb.append( ") (" );
+		for(int i=0; i< frames.size(); i++){
+			sb.append(frames.get(i).getSecondaryType().getID().
+					replace(".", "").replace(",", " ").replace("\"", "-").replace("/", "-").replace("(","").replace(")","")+" ");
+		}
+		sb.append( ") " );
+
+		if (v.getParent()!=null && v.getParent().getThematicRoles()!=null){
+			sb.append( "(" );
+			for(int i=0; i<v.getParent().getThematicRoles().size(); i++){
+				sb.append(v.getParent().getThematicRoles().get(i).getType()+" ");
+			}
+			sb.append( ")" );
+		}
+		return sb;
+	}
+
+	public void testIndex () throws Exception {
+		Iterator<IVerbClass> iter = index.iterator();
+		while(iter.hasNext()){
+			IVerbClass v = iter.next();
+			System.out.println(v.getID() + " +> " + v.getFrames().get(0).getVerbClass().getID() + "  \n ===> " + v.getMembers().get(0).getName()  );
+			List<IThematicRole> roles = v.getThematicRoles();
+			for (IThematicRole r: roles){
+				System.out.println(r.getType());
+			}
+
+			List<IFrame> frames = v.getFrames();
+			for(IFrame f: frames){
+				try {
+					System.out.println(f.getPrimaryType().getID() + " => " + f.getXTag() + " >> "+ f.getSecondaryType().getID() +  " : " + f.getExamples().get(0));
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+		}
+		IVerbClass verb0 = index.getVerb("hit-18.1");
+		// look up a verb class and print out some info
+		IVerbClass verb = index.getRootVerb("hit-18.1");
+		IMember member = verb.getMembers().get(0);
+		Set<IWordnetKey> keys = member.getWordnetTypes().keySet();
+		IFrame frame = verb.getFrames().get(0);
+		FrameType type = frame.getPrimaryType();
+		String example = frame.getExamples().get(0);
+		System.out.println("id: " + verb.getID());
+		System.out.println("first wordnet keys: " + keys);
+		System.out.println("first frame type: " + type.getID());
+		System.out.println("first example: " + example);
+	}
+
+	public static void main(String[] args){
+		String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/test/resources";
+		VerbNetProcessor proc = VerbNetProcessor.getInstance(resourceDir);
+		/*	try {
+				proc.testIndex();
+			} catch (Exception e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		 */	
+
+		System.out.println(proc.buildTreeRepresentationForTreeKernelLearning("abandon"));
+		System.out.println(proc.buildTreeRepresentationForTreeKernelLearning("earn"));
+		
+		List<Map<String, List<String>>> res = proc.generalize("marry", "engage");
+		System.out.println (res);
+
+		res = proc.generalize("assume", "alert");
+		System.out.println (res);
+
+		res = proc.generalize("alert", "warn");
+		System.out.println (res);
+	}
+
+}
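
The generalize(...) method above returns its result as one map under the keys "roles", "phrStr" and "phrDescr"; a minimal sketch of consuming it (the verbs are illustrative):

    VerbNetProcessor proc = VerbNetProcessor.getInstance(null); // null falls back to ./src/test/resources
    List<Map<String, List<String>>> res = proc.generalize("marry", "engage");
    if (!res.isEmpty()) {
        Map<String, List<String>> gen = res.get(0);
        System.out.println("common thematic roles:   " + gen.get("roles"));    // "*" marks a role mismatch
        System.out.println("shared primary frames:   " + gen.get("phrStr"));
        System.out.println("shared secondary frames: " + gen.get("phrDescr"));
    }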

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ExternalRSTImporter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ExternalRSTImporter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ExternalRSTImporter.java
new file mode 100644
index 0000000..955bcb8
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ExternalRSTImporter.java
@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.external_rst;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+import edu.stanford.nlp.trees.Tree;
+import opennlp.tools.parse_thicket.ArcType;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
+
+public class ExternalRSTImporter extends PT2ThicketPhraseBuilder{
+	private StringDistanceMeasurer strDistProc = new StringDistanceMeasurer ();
+	private String resourceDir = null; 
+	
+	public ExternalRSTImporter(){
+		 try {
+			resourceDir = new File( "." ).getCanonicalPath()+"/src/test/resources";
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public List<RstNode>  buildArrayOfRSTnodes(ParseThicket pt, String jotyDumpFileName){
+		String dump=null;
+		try {
+			dump = FileUtils.readFileToString(new File(jotyDumpFileName), Charset.defaultCharset().toString());
+		} catch (IOException e) {
+			e.printStackTrace();
+			return null;
+		}
+		List<RstNode> nodes = new ArrayList<RstNode>(); 
+		String[] lines = dump.split("\n");
+		int startOfDim = StringUtils.lastIndexOf(lines[0], " ");
+		String dimStr = lines[0].substring(startOfDim).replace(")", "").trim();
+		int dim = Integer.parseInt(dimStr);
+		Integer[][] rstArcsIndices = new Integer[dim][dim];
+		for(int i=1; i< lines.length; i++){
+			RstNode node = new RstNode(lines[i]);
+			nodes.add(node);
+		}
+		return nodes;
+	}
+
+	private Map<String, Integer > phraseRstIndex = new HashMap<String, Integer >();
+	private Map<Integer, List<ParseTreeNode> > rstIndexPhrase = new HashMap<Integer, List<ParseTreeNode> > ();
+
+	public List<WordWordInterSentenceRelationArc> buildRSTArcsFromRSTparser( List<RstNode> rstNodes,
+			List<WordWordInterSentenceRelationArc> arcs,
+			Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap, 
+			ParseThicket pt ) {
+		List<WordWordInterSentenceRelationArc> arcsRST = new ArrayList<WordWordInterSentenceRelationArc>();		
+
+		for(int nSentFrom=0; nSentFrom<pt.getSentences().size(); nSentFrom++){
+			for(int nSentTo=nSentFrom+1; nSentTo<pt.getSentences().size(); nSentTo++){
+
+				// label all phrases with EDU
+				List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);
+				for(List<ParseTreeNode> p: phrasesFrom ){
+					Integer rstIndex = findBestRstNodeTextForAPhrase(p, rstNodes);
+					if (rstIndex!=null){
+						phraseRstIndex.put(p.toString(), rstIndex );
+						rstIndexPhrase.put(rstIndex , p);
+					}
+				}
+				List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);
+				for(List<ParseTreeNode> p: phrasesTo ){
+					Integer rstIndex = findBestRstNodeTextForAPhrase(p, rstNodes);
+					if (rstIndex!=null){
+						phraseRstIndex.put(p.toString(), rstIndex );
+						rstIndexPhrase.put(rstIndex , p);
+					}
+				}
+			}
+		}	// now, for each pair of phrases, discover the common ancestor in the RST tree which connects their sentences
+		
+		for( int nSentFrom=0; nSentFrom<pt.getSentences().size(); nSentFrom++){
+			for(int nSentTo=nSentFrom+1; nSentTo<pt.getSentences().size(); nSentTo++){
+				System.out.println("Sent from # = "+nSentFrom + " -- " + "Sent to # = "+nSentTo);
+				
+				List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);
+				List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);
+				for(List<ParseTreeNode> vpFrom: phrasesFrom){
+					for(List<ParseTreeNode> vpTo: phrasesTo){
+						System.out.println("Computing arc between phrases "+ vpFrom + " => " + vpTo);
+						// get two RST nodes 
+						Integer rstNodeFrom = phraseRstIndex.get(vpFrom.toString());
+						Integer rstNodeTo = phraseRstIndex.get(vpTo.toString());
+						if (rstNodeFrom==null || rstNodeTo==null ||  rstNodeFrom >= rstNodeTo)
+							continue;
+						System.out.println("Finding RST path for phrases "+ vpFrom + "' and '"+vpTo);
+						System.out.println("Sent from # = "+nSentFrom + " -- " + "Sent to # = "+nSentTo);
+
+						Integer commonAncestorIndex = findCommonAncestor(rstNodeFrom , rstNodeTo, rstNodes);
+						if (commonAncestorIndex!=null){
+							// and figure out if they can be properly connected by an arc, by navigating RST tree
+							ArcType arcType = new ArcType("rst", rstNodes.get(rstNodeTo).getRel2par(), 0, 0);
+							WordWordInterSentenceRelationArc arcRST = 
+									new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(nSentFrom, vpFrom.get(0).getId()), 
+											new Pair<Integer, Integer>(nSentTo,  vpFrom.get(0).getId()), "", "", arcType);
+							arcsRST.add(arcRST);
+
+						}
+					}
+				} 
+			}
+		}
+
+		return arcsRST;
+	}
+
+	private Integer findAncestorForRSTnode(Integer rstNodeFrom, List<RstNode> rstNodes){
+		RstNode initNode = rstNodes.get(rstNodeFrom);
+		if (initNode.level==null)
+			return null;
+
+		try {
+			int initLevel = initNode.level;
+			int iter=1; // start with moving one step up
+			while (rstNodeFrom-iter>=0) {
+				Integer currLevel= rstNodes.get(rstNodeFrom-iter).level;
+				if ( currLevel!=null && currLevel< initLevel ) // found ancestor
+					return rstNodeFrom-iter;
+				iter++;
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return null;
+	}
+
+	private Integer findCommonAncestor(Integer rstNodeFrom, Integer rstNodeTo,
+			List<RstNode> rstNodes) {
+		List<Integer> ancestorsFrom = new ArrayList<Integer>() , ancestorsTo = new ArrayList<Integer>();
+		ancestorsFrom.add(rstNodeFrom);  ancestorsTo.add(rstNodeTo);
+		int curLevel = rstNodes.get(rstNodeTo).level;
+		Integer rstNodeFromCurrent = rstNodeFrom,  rstNodeToCurrent = rstNodeTo; 
+
+		while(curLevel>0){
+			if (rstNodeFromCurrent !=null) {
+				rstNodeFromCurrent = findAncestorForRSTnode(rstNodeFromCurrent, rstNodes);
+			}
+			if (rstNodeToCurrent != null){
+				rstNodeToCurrent = findAncestorForRSTnode(rstNodeToCurrent, rstNodes);
+			}
+			if (rstNodeFromCurrent !=null) {
+				ancestorsFrom.add(rstNodeFromCurrent);  
+			}
+			if (rstNodeToCurrent != null)
+				ancestorsTo.add(rstNodeToCurrent);
+
+			List<Integer> ancestorsFromCurr = new ArrayList<Integer>(ancestorsFrom);
+			ancestorsFromCurr.retainAll(ancestorsTo);
+			if (! ancestorsFromCurr.isEmpty()){
+				System.out.println("Found comm ancestor "+rstNodes.get(ancestorsFromCurr.get(0)).toString() + " id =  "+ancestorsFromCurr.get(0) + 
+						" for two RST nodes | id = "+rstNodeFrom + "'"+
+						rstNodes.get(rstNodeFrom).toString() + "' and | id = "+ rstNodeTo + "'"+ rstNodes.get(rstNodeTo).toString()+"'");
+				String rel2par =  rstNodes.get(ancestorsFromCurr.get(0)).rel2par;
+				// if common ancestor is trivial, return null and do not form a link
+				if (rel2par==null) // || rel2par.equals("span"))
+					return null;
+				else
+					return ancestorsFromCurr.get(0);
+			}
+			curLevel--;
+		}
+		return null;
+	}
+
+	private Integer findBestRstNodeTextForAPhrase(List<ParseTreeNode> ps,
+			List<RstNode> rstNodes) {
+		// first get the phrase string
+		String phraseStr="";
+		for(ParseTreeNode n: ps){
+			phraseStr+=" "+n.getWord();
+		}
+		phraseStr= phraseStr.trim();
+		if (phraseStr.length()<10){
+			return null;
+		}
+		// now look for the closest EDU text among all RST nodes
+		double rMin = -10000d; Integer index = -1;
+		int count =0;
+		for(RstNode r: rstNodes){
+			if (r.getText()==null || r.getText().length()<10){
+				count++;
+				continue;
+			}
+			double res =  strDistProc.measureStringDistanceNoStemming(phraseStr, r.getText());
+			if (res > rMin){
+				rMin=res;
+				index = count;
+			}
+			count++;
+		}
+		if (index==-1)
+			return null;
+		System.out.println("Found RST node "+ rstNodes.get(index) +" for phrase ="+phraseStr);
+		return index;
+	}
+
+	/*
+	 * Building phrases takes a Parse Thicket and forms phrases for each sentence individually.
+	 * Then, based on the built phrases and the obtained arcs, it builds arcs for RST.
+	 * Finally, based on all formed arcs, it extends the phrases with thicket phrases.
+	 */
+
+	public List<WordWordInterSentenceRelationArc> buildPT2ptPhrases(ParseThicket pt, String externalRSTresultFilename ) {
+		List<List<ParseTreeNode>> phrasesAllSent = new ArrayList<List<ParseTreeNode>> ();
+		Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>();
+		// build regular phrases
+		for(int nSent=0; nSent<pt.getSentences().size(); nSent++){
+			List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent);
+			Tree ptree = pt.getSentences().get(nSent);
+			//ptree.pennPrint();
+			List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence);
+			System.out.println(phrases);
+			phrasesAllSent.addAll(phrases);
+			sentNumPhrases.put(nSent, phrases);
+
+		}
+		// TODO: code to run joty suite
+		List<RstNode> rstNodes = new ExternalRSTImporter().buildArrayOfRSTnodes(null, resourceDir+externalRSTresultFilename );
+
+		// discover and add RST arcs
+		List<WordWordInterSentenceRelationArc> arcsRST = buildRSTArcsFromRSTparser(  rstNodes, null, sentNumPhrases, pt );
+		System.out.println(arcsRST);
+		return arcsRST;
+
+	}
+
+
+
+}
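
A minimal sketch of driving the importer above against an existing Joty-style dump (the .dis path is hypothetical; the ParseThicket argument is unused by buildArrayOfRSTnodes, so null is passed, as buildPT2ptPhrases itself does):

    ExternalRSTImporter importer = new ExternalRSTImporter();
    List<RstNode> nodes = importer.buildArrayOfRSTnodes(null, "/tmp/sample_doc.dis"); // hypothetical path
    if (nodes != null)
        for (RstNode n : nodes)
            System.out.println(n);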

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/MatcherExternalRST.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/MatcherExternalRST.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/MatcherExternalRST.java
new file mode 100644
index 0000000..215ee35
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/MatcherExternalRST.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.external_rst;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.parse_thicket.ParseCorefBuilderWithNER;
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.matching.GeneralizationResult;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.parse_thicket.matching.PhraseGroupGeneralizer;
+import opennlp.tools.textsimilarity.LemmaPair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+public class MatcherExternalRST extends Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{
+
+	ParseCorefBuilderWithNERandRST ptBuilderRST = new ParseCorefBuilderWithNERandRST();
+	PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
+
+	private String externRSTpath;
+
+	public List<List<ParseTreeChunk>> assessRelevance(String para1, String para2) {
+		// first build PTs for each text
+		ParseThicket pt1 = ptBuilderRST.buildParseThicket(para1);
+		ParseThicket pt2 = ptBuilderRST.buildParseThicket(para2);
+		// then build phrases and rst arcs
+		List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
+		// group phrases by type
+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 
+				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
+
+		
+		List<List<ParseTreeChunk>> res = pgGen.generalize(sent1GrpLst, sent2GrpLst);
+				
+		return res;
+
+	}
+	
+	// this function is the main entry point into the PT builder if RST arcs are required
+	public ParseThicket buildParseThicketFromTextWithRST(String para){
+		ParseThicket pt = ptBuilderRST.buildParseThicket(para);
+		if (pt==null) // check before building phrases to avoid an NPE in the phrase builder
+			return null;
+		List<List<ParseTreeNode>> phrs = phraseBuilder.buildPT2ptPhrases(pt);
+		pt.setPhrases(phrs);
+		return pt;
+	}
+	
+	public List<List<ParseTreeChunk>> assessRelevance(List<List<ParseTreeChunk>> para0, String para2) {
+		// first build PTs for each text
+	
+		ParseThicket pt2 = ptBuilder.buildParseThicket(para2);
+		// then build phrases and rst arcs
+		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);
+		// group phrases by type
+		List<List<ParseTreeChunk>> sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);
+		List<List<ParseTreeChunk>> res = pgGen.generalize(para0, sent2GrpLst);
+				
+		return res;
+
+	}
+	
+	
+	public static void main(String[] args){
+		//MatcherExternalRST m = new MatcherExternalRST();
+		
+		
+	}
+}
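
Since main() above is left empty, here is a hedged sketch of the intended call path (the paragraphs are made up; assessRelevance is defined in this class):

    MatcherExternalRST m = new MatcherExternalRST();
    List<List<ParseTreeChunk>> match = m.assessRelevance(
            "I rent an office for my business and deduct the rental expense.",
            "Rental expense for office space is subtracted from business revenue.");
    System.out.println(match); // generalization of the two paragraphs, grouped by phrase type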

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/PT2ThicketPhraseBuilderExtrnlRST.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/PT2ThicketPhraseBuilderExtrnlRST.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/PT2ThicketPhraseBuilderExtrnlRST.java
new file mode 100644
index 0000000..b915543
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/PT2ThicketPhraseBuilderExtrnlRST.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.external_rst;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+import opennlp.tools.parse_thicket.kernel_interface.TreeKernelRunner;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.parse_thicket.rhetoric_structure.RhetoricStructureArcsBuilder;
+
+import org.apache.commons.io.FileUtils;
+import org.jgrapht.Graph;
+import org.jgrapht.graph.DefaultEdge;
+import org.jgrapht.graph.SimpleGraph;
+
+import edu.stanford.nlp.trees.Tree;
+
+public class PT2ThicketPhraseBuilderExtrnlRST extends PT2ThicketPhraseBuilder{
+
+	private RhetoricStructureArcsBuilder rstBuilder = new RhetoricStructureArcsBuilder();
+	private ExternalRSTImporter externalRstBuilder = new ExternalRSTImporter();
+	private static Logger log = Logger
+		      .getLogger("opennlp.tools.parse_thicket.external_rst.PT2ThicketPhraseBuilderExtrnlRST");
+
+
+	/*
+	 * Building phrases takes a Parse Thicket and forms phrases for each sentence individually.
+	 * Then, based on the built phrases and the obtained arcs, it builds arcs for RST.
+	 * Finally, based on all formed arcs, it extends the phrases with thicket phrases.
+	 */
+
+	public List<List<ParseTreeNode>> buildPT2ptPhrases(ParseThicket pt, String text, String externRSTpath) {
+		List<List<ParseTreeNode>> phrasesAllSent = new ArrayList<List<ParseTreeNode>> ();
+		Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>();
+		// build regular phrases
+		for(int nSent=0; nSent<pt.getSentences().size(); nSent++){
+			List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent);
+			Tree ptree = pt.getSentences().get(nSent);
+			//ptree.pennPrint();
+			List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence);
+			System.out.println(phrases);
+			phrasesAllSent.addAll(phrases);
+			sentNumPhrases.put(nSent, phrases);
+
+		}
+		String filename = "default.txt";
+		
+		try {
+			filename = text.split("\n")[0]+".txt";
+			FileUtils.writeStringToFile(new File(externRSTpath+"/"+filename), text, "utf-8");
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		// discover and add RST arcs
+		List<WordWordInterSentenceRelationArc> arcsRST =
+				rstBuilder.buildRSTArcsFromMarkersAndCorefs(pt.getArcs(), sentNumPhrases, pt);
+		String[] commandLine1 = new String[]{"python", "Discourse_Segmenter.py",  filename}, commandLine2=
+				new String[]{"python", "Discourse_Parser.py", "tmp.edu"};
+		new TreeKernelRunner().runEXE(commandLine1, externRSTpath);
+		new TreeKernelRunner().runEXE(commandLine2, externRSTpath);
+		
+		// TODO: code to run joty suite
+		List<RstNode> rstNodes = new ExternalRSTImporter().buildArrayOfRSTnodes(null, externRSTpath+"/tmp_doc.dis");
+	
+		// discover and add RST arcs
+		List<WordWordInterSentenceRelationArc> arcsRSTexternal = externalRstBuilder.buildRSTArcsFromRSTparser(  rstNodes, null, sentNumPhrases, pt );
+		System.out.println(arcsRSTexternal);
+
+
+		List<WordWordInterSentenceRelationArc> arcs = pt.getArcs();
+		arcs.addAll(arcsRST);
+		arcs.addAll(arcsRSTexternal);
+		pt.setArcs(arcs);
+
+
+		List<List<ParseTreeNode>> expandedPhrases = expandTowardsThicketPhrases(phrasesAllSent, pt.getArcs(), sentNumPhrases, pt);
+		return expandedPhrases;
+	}
+
+	
+
+}
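
For orientation, the external-RST step in buildPT2ptPhrases above implies roughly this layout under externRSTpath; this is the contract suggested by the code, not a documented interface:

    // externRSTpath/
    //   Discourse_Segmenter.py   run as: python Discourse_Segmenter.py <first-line-of-text>.txt
    //   Discourse_Parser.py      run as: python Discourse_Parser.py tmp.edu
    //   tmp_doc.dis              parser output consumed by buildArrayOfRSTnodes(...)
    PT2ThicketPhraseBuilderExtrnlRST builder = new PT2ThicketPhraseBuilderExtrnlRST();
    // given a ParseThicket pt and its source text (e.g. from ParseCorefBuilderWithNERandRST):
    // List<List<ParseTreeNode>> phrases = builder.buildPT2ptPhrases(pt, text, "/path/to/joty"); // hypothetical path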

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseCorefBuilderWithNERandRST.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseCorefBuilderWithNERandRST.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseCorefBuilderWithNERandRST.java
new file mode 100644
index 0000000..9fe9524
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseCorefBuilderWithNERandRST.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.external_rst;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import edu.arizona.sista.discourse.rstparser.DiscourseTree;
+import edu.arizona.sista.processors.CorefMention;
+import edu.arizona.sista.processors.Document;
+import edu.arizona.sista.processors.Processor;
+import edu.arizona.sista.processors.Sentence;
+import edu.arizona.sista.processors.corenlp.CoreNLPProcessor;
+import edu.arizona.sista.struct.DirectedGraphEdgeIterator;
+import edu.stanford.nlp.ie.AbstractSequenceClassifier;
+import edu.stanford.nlp.ie.crf.CRFClassifier;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.trees.Tree;
+import opennlp.tools.parse_thicket.ArcType;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
+import scala.Option;
+
+public class ParseCorefBuilderWithNERandRST {	
+	public Processor proc = null;
+	CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
+	private static Logger log = Logger
+		      .getLogger("opennlp.tools.parse_thicket.external_rst.ParseCorefBuilderWithNERandRST");
+
+
+	AbstractSequenceClassifier<CoreLabel> classifier = null;
+
+	ParseCorefBuilderWithNERandRST() {
+		super();
+		classifier = CRFClassifier.getDefaultClassifier();
+		proc = new CoreNLPProcessor(true, true, 100);
+	}
+
+	public ParseThicketWithDiscourseTree buildParseThicket(String text){
+		List<Tree> ptTrees = new ArrayList<Tree>();
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
+
+		Document doc = null;
+		try {
+			doc = proc.annotate(text, false);
+		} catch (IllegalArgumentException iae) {
+			log.severe("failed to parse text: "+text);
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+        // failed to parse - skip this text
+		if (doc==null)
+			return null;
+		//java.lang.IllegalArgumentException
+		for (Sentence sentence: doc.sentences()) {
+			List<ParseTreeNode> sentenceNodes = new ArrayList<ParseTreeNode>();
+			String[] tokens= sentence.words();
+			for(int i=0; i< tokens.length; i++){
+				//sentence.startOffsets(), " "));
+				//sentence.endOffsets(), " "));
+				ParseTreeNode p = new ParseTreeNode(sentence.words()[i], sentence.tags().get()[i]);
+				p.setId(i+1);
+				if(sentence.entities().isDefined()){
+					p.setNe(sentence.entities().get()[i]);
+				}
+				if(sentence.norms().isDefined()){
+					//p.setNormalizedWord(sentence.norms().get()[i]);
+					p.setNormalizedWord(sentence.lemmas().get()[i]);
+				}
+				sentenceNodes.add(p);
+			}
+
+			if(sentence.dependencies().isDefined()) {
+				int i=0;
+				DirectedGraphEdgeIterator<String> iterator = new
+						DirectedGraphEdgeIterator<String>(sentence.dependencies().get());
+				while(iterator.hasNext()) {
+					scala.Tuple3<Object, Object, String> dep = iterator.next();
+					//System.out.println(" head:" + dep._1() + " modifier:" + dep._2() + " label:" + dep._3());
+					if (i>sentenceNodes.size()-1)
+						break;
+					ParseTreeNode p = sentenceNodes.get(i);
+					p.setHead(dep._1().toString());
+					p.setModifier(dep._2().toString());
+					p.setLabel(dep._3());
+					sentenceNodes.set(i, p);
+					i++;
+				}
+			}
+			if(sentence.syntacticTree().isDefined()) {
+				Tree tree = Tree.valueOf(sentence.syntacticTree().get().toString());
+				ptTrees.add(tree);
+				//tree.pennPrint();
+			}
+			nodesThicket.add(sentenceNodes);
+		}
+
+		if(doc.coreferenceChains().isDefined()) {
+			// these are scala.collection Iterator and Iterable (not Java!)
+			scala.collection.Iterator<scala.collection.Iterable<CorefMention>> chains = doc.coreferenceChains().get().getChains().iterator();
+			while(chains.hasNext()) {
+				scala.collection.Iterator<CorefMention> chain = chains.next().iterator();
+				//System.out.println("Found one coreference chain containing the following mentions:");
+				int numInChain = 0;
+				int[] niSentence = new int[4], niWord = new int[4], startOffset = new int[4], endOffset = new int[4];
+
+				while(chain.hasNext()) {
+					CorefMention mention = chain.next();
+					// note that all these offsets start at 0 too
+					niSentence[numInChain] = mention.sentenceIndex();
+					niWord[numInChain] = mention.headIndex();
+					startOffset[numInChain] = mention.startOffset();
+					endOffset[numInChain] = mention.endOffset();
+					if (numInChain>=4-1)
+						break;
+					numInChain++;
+					//" headIndex:" + mention.headIndex() +
+					//" startTokenOffset:" + mention.startOffset() +
+					//" endTokenOffset:" + mention.endOffset());
+				}
+				if (numInChain>0) { // more than a single mention
+					for(int i=0; i<numInChain; i++){
+						ArcType arcType = new ArcType("coref-", "", 0, 0);
+
+						WordWordInterSentenceRelationArc arc = 
+								new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence[i],niWord[i]), 
+										new Pair<Integer, Integer>(niSentence[i+1],niWord[i+1]), 
+									    startOffset[i]+"", startOffset[i+1]+"",
+	      	    					  arcType);
+						arcs.add(arc);
+					}
+				}
+			}
+		}
+
+
+		List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
+		arcs.addAll(arcsCA);
+		ParseThicketWithDiscourseTree result = new ParseThicketWithDiscourseTree(ptTrees, arcs);
+
+		if(doc.discourseTree().isDefined()) {
+			Option<DiscourseTree> discourseTree = doc.discourseTree();
+
+			//scala.collection.immutable.List<DiscourseTree> scList = discourseTree.toList();
+			scala.collection.Iterator<DiscourseTree> iterator = discourseTree.iterator();
+			while(iterator.hasNext()) {
+				DiscourseTree dt = iterator.next();
+				result.setDt(dt);
+				List<WordWordInterSentenceRelationArc> rstArcs = new ArrayList<WordWordInterSentenceRelationArc>();
+				navigateDiscourseTree(dt, rstArcs, nodesThicket );
+				arcs.addAll(rstArcs);
+				System.out.println(dt);
+				System.out.println("first EDU = "+dt.firstEDU() + "| dt.firstSentence() = "+ dt.firstSentence() + 
+						" \n| last EDU = "+dt.lastEDU() + "| dt.lastSentence() = "+ dt.lastSentence() + 
+						" \n| dt.tokenCount() = " + dt.tokenCount() + "| dt.firstToken " + dt.firstToken() + 
+						" | dt.lastToken() "+ dt.lastToken() + "\n kind =" + dt.kind() + " | text = "+ dt.rawText());
+			}
+		}
+
+		result.setOrigText(text);
+		result.setNodesThicket(nodesThicket);
+		
+		result.setDtDump(); // sets the DT representation for TK learning
+		return result;
+	}
+
+	public List<WordWordInterSentenceRelationArc> buildCAarcs(
+			List<List<ParseTreeNode>> nodesThicket) {
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+
+		for(int sentI=0; sentI<nodesThicket.size(); sentI++){
+			for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
+				List<ParseTreeNode> sentenceI = nodesThicket.get(sentI), 
+						sentenceJ = nodesThicket.get(sentJ);
+				Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
+				Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
+				if (caI==null || caJ==null)
+					continue;
+				int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
+				int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
+				Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);
+
+				ArcType arcType = new ArcType("ca",
+						caGen.getFirst() + printNumArray(caGen.getSecond()), 0, 0);
+				WordWordInterSentenceRelationArc arc = 
+						new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(sentI,indexCA1), 
+								new Pair<Integer, Integer>(sentJ,indexCA2), caI.getFirst(), caJ.getFirst(), 
+								arcType);
+				arcs.add(arc);
+			}
+		}
+
+		return arcs;
+	}
+
+	private String printNumArray(Integer[] arr){
+		StringBuilder buf = new StringBuilder();
+		for(Integer i: arr){
+			buf.append(i).append(' ');
+		}
+		return buf.toString();
+	}
+
+	// Creates arc objects in 'arcs' from the discourse tree dt, using the list of sentences 'nodesThicket'
+	// to identify the words connected by these arcs.
+	private void navigateDiscourseTree(DiscourseTree dt, List<WordWordInterSentenceRelationArc> arcs,  List<List<ParseTreeNode>> nodesThicket  ) {
+		if (dt.isTerminal()) {
+			return;
+		} else {
+			ArcType arcType = new ArcType("rst",
+					dt.relationLabel() + "=>" + dt.kind(),
+					// encodes relation direction: 0 for LeftToRight, -1 otherwise
+					Boolean.compare(dt.relationDirection().equals("LeftToRight"), true), 0);
+			String lemmaFrom = nodesThicket.get(dt.firstSentence()).get(dt.firstToken().copy$default$2()).getWord();
+			String lemmaTo = nodesThicket.get(dt.lastSentence()).get(dt.lastToken().copy$default$2()-1).getWord();
+			
+			WordWordInterSentenceRelationArc arc = 
+					new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(dt.firstToken().copy$default$1(), dt.firstToken().copy$default$2()), 
+							new Pair<Integer, Integer>(dt.lastToken().copy$default$1(), dt.lastToken().copy$default$2()), 
+							lemmaFrom,lemmaTo, 
+							arcType);
+			System.out.println(arc);
+			arcs.add(arc);
+			DiscourseTree[] kids = dt.children();
+			if (kids != null) {
+				for (DiscourseTree kid : kids) {
+					navigateDiscourseTree(kid, arcs, nodesThicket);
+				}
+			}
+		}
+	}
+
+	public static void main(String[] args){
+		ParseCorefBuilderWithNERandRST builder = new ParseCorefBuilderWithNERandRST ();
+		String text = "I thought I d tell you a little about what I like to write. And I like to immerse myself in my topics. I just like to dive right in and become sort of a human guinea pig. And I see my life as a series of experiments. So , I work for Esquire magazine , and a couple of years ago I wrote an article called  My Outsourced Life ,  where I hired a team of people in Bangalore , India , to live my life for me. "
+		+ "So they answered my emails. They answered my phone. ";
+		
+		ParseThicket pt = builder.buildParseThicket(text);
+		pt = builder.buildParseThicket(
+				"Dutch accident investigators say that evidence points to pro-Russian rebels as being responsible for shooting down plane. The report indicates where the missile was fired from and identifies who was in control of the territory and pins the downing of the plane on the pro-Russian rebels. "+
+						"However, the Investigative Committee of the Russian Federation believes that the plane was hit by a missile from the air which was not produced in Russia. "+
+						"At the same time, rebels deny that they controlled the territory from which the missile was supposedly fired."
+				);
+	}
+
+}
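
A minimal usage sketch for the builder above (an illustration, not part of the commit; it assumes ParseCorefBuilderWithNERandRST resides in opennlp.tools.parse_thicket.external_rst next to ParseThicketWithDiscourseTree below, and that the Scala discourse parser and CoreNLP dependencies are on the classpath):

    import opennlp.tools.parse_thicket.ParseThicket;
    import opennlp.tools.parse_thicket.external_rst.ParseCorefBuilderWithNERandRST;

    public class ParseThicketBuilderSketch {
        public static void main(String[] args) {
            ParseCorefBuilderWithNERandRST builder = new ParseCorefBuilderWithNERandRST();
            // coreference, communicative-action ("ca") and RST arcs are all
            // accumulated while the thicket is being built
            ParseThicket pt = builder.buildParseThicket(
                    "They answered my emails. They answered my phone.");
            System.out.println(pt);
        }
    }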

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseThicketWithDiscourseTree.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseThicketWithDiscourseTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseThicketWithDiscourseTree.java
new file mode 100644
index 0000000..44c843c
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/ParseThicketWithDiscourseTree.java
@@ -0,0 +1,284 @@
+package opennlp.tools.parse_thicket.external_rst;
+
+import java.util.List;
+
+import edu.arizona.sista.discourse.rstparser.DiscourseTree;
+import edu.stanford.nlp.trees.Tree;
+import opennlp.tools.parse_thicket.Pair;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
+import opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree;
+
+/*
+ * A subclass of ParseThicket focused on the Discourse Tree (DT).
+ * It produces representations of the discourse tree suitable for tree kernel learning.
+ */
+
+public class ParseThicketWithDiscourseTree extends ParseThicket {
+	private DiscourseTree dt;
+	private String dtDump;
+	private String dtDumpWithPOS;
+	private String dtDumpWithEmbeddedTrees;
+	private String dtDumpWithVerbNet;
+
+	private TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree();
+	private VerbNetProcessor verbBuilder = VerbNetProcessor.getInstance(null);
+
+	public DiscourseTree getDt() {
+		return dt;
+	}
+	// sets the highest-level DT (subsequent calls do not overwrite it)
+	public void setDt(DiscourseTree dt) {
+		if (this.dt==null)
+			this.dt = dt;
+	}
+
+	public ParseThicketWithDiscourseTree(List<Tree> ptTrees, List<WordWordInterSentenceRelationArc> barcs) {
+		super(ptTrees, barcs);
+	}
+
+	public void setDtDump(){
+		StringBuilder sb = new StringBuilder(100000);
+		StringBuilder res = toStringBuilderDTWithPOSSeq(sb, this.dt);
+		dtDumpWithPOS = res.toString();
+
+		sb = new StringBuilder(100000);
+		res = toStringBuilderDT(sb, this.dt);
+		dtDump = res.toString();
+
+		sb = new StringBuilder(100000);
+		res = toStringBuilderDTWithEmbeddedTrees(sb, this.dt);
+		dtDumpWithEmbeddedTrees = res.toString();
+
+		sb = new StringBuilder(100000);
+		res = toStringBuilderDTWithVerbNet(sb, this.dt);
+		dtDumpWithVerbNet = res.toString();
+	}
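+	// Note: the four dumps above differ only in how each terminal EDU is rendered:
+	// raw text, lemma-POS pairs, embedded parse subtrees, or VerbNet-enriched verbs.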
+	// basic representation of the discourse tree
+	private StringBuilder toStringBuilderDT(StringBuilder sb, DiscourseTree dt) {
+		if (dt.isTerminal()) {
+			if (dt.relationLabel() != null) {
+				sb.append(dt.relationLabel());
+				// the scala-side printer is used to extract the EDU text
+				scala.collection.mutable.StringBuilder sbs = new scala.collection.mutable.StringBuilder(100);
+				dt.print(sbs, 0, false, true);
+				String text = sbs.replaceAllLiterally("Nucleus TEXT:", "(");
+				text = text.substring(0, text.length()-1) + ")";
+				sb.append(text);
+			}
+			return sb;
+		} else {
+			sb.append('(');
+			if (dt.relationLabel() != null) {
+				sb.append(dt.relationLabel());
+			}
+			DiscourseTree[] kids = dt.children();
+			if (kids != null) {
+				for (DiscourseTree kid : kids) {
+					sb.append(' ');
+					toStringBuilderDT(sb, kid);
+				}
+			}
+			return sb.append(')');
+		}
+	}
+
+	private StringBuilder toStringBuilderDTWithPOSSeq(StringBuilder sb, DiscourseTree dt) {
+		if (dt.isTerminal()) {
+			if (dt.relationLabel() != null && dt.relationLabel().length()>2) {
+				sb.append(dt.relationLabel());
+				// a different StringBuilder type is required by the scala-side printer
+				scala.collection.mutable.StringBuilder sbs = new scala.collection.mutable.StringBuilder(100);
+				dt.print(sbs, 0, false, true);
+				String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
+				String textDump = substituteTextWithPOStext(text, this.getNodesThicket().get(dt.firstToken().copy$default$1()));
+				sb.append(textDump);
+			}
+			return sb;
+		} else {
+			sb.append('(');
+			if (dt.relationLabel() != null) {
+				sb.append(dt.relationLabel());
+			}
+			DiscourseTree[] kids = dt.children();
+			if (kids != null) {
+				for (DiscourseTree kid : kids) {
+					sb.append(' ');
+					toStringBuilderDTWithPOSSeq(sb, kid);
+				}
+			}
+			return sb.append(')');
+		}
+	}
+
+	private String substituteTextWithPOStext(String text, List<ParseTreeNode> list) {
+		String[] tokens = text.split(" ");
+		// slide a window of tokens.length over the sentence nodes to align the EDU text;
+		// the loop bound keeps subList() within the list
+		for(int offset = 0; offset + tokens.length <= list.size(); offset++){
+			List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
+			int count = 0;
+			boolean bMatch = true; // assume a match until a mismatch is found
+			for(ParseTreeNode n: subList){
+				if (!n.getWord().equals(tokens[count])){
+					bMatch = false;
+					break;
+				} else
+					count++;
+				if (count > 3) // three matching tokens are enough for alignment
+					break;
+			}
+			if (bMatch){
+				return ParseTreeNode.toTreeRepresentationString(subList);
+			}
+		}
+		return null;
+	}
+
+	private StringBuilder toStringBuilderDTWithEmbeddedTrees(StringBuilder sb, DiscourseTree dt) {
+		if (dt.isTerminal()) {
+			if (dt.relationLabel() != null && dt.relationLabel().length()>2) {
+				sb.append(dt.relationLabel());
+				scala.collection.mutable.StringBuilder sbs = new scala.collection.mutable.StringBuilder(100);
+				dt.print(sbs, 0, false, true);
+				String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
+				substituteTextWithParseTree(sb, text, this.getSentenceTrees().get(dt.firstToken().copy$default$1()));
+			}
+			return sb;
+		} else {
+			sb.append('(');
+			if (dt.relationLabel() != null) {
+				sb.append(dt.relationLabel());
+			}
+			DiscourseTree[] kids = dt.children();
+			if (kids != null) {
+				for (DiscourseTree kid : kids) {
+					sb.append(' ');
+					toStringBuilderDTWithEmbeddedTrees(sb, kid);
+				}
+			}
+			return sb.append(')');
+		}
+	}
+	private void substituteTextWithParseTree(StringBuilder sb, String text, Tree sentenceTree) {
+		String[] tokens = text.split(" ");
+		List<Tree> foundTrees;
+		if (tokens.length > 1){
+			foundTrees = extender.getASubtreeWithRootAsNodeForWord1(sentenceTree, sentenceTree,
+					new String[]{tokens[0], tokens[1]});
+		} else {
+			foundTrees = extender.getASubtreeWithRootAsNodeForWord1(sentenceTree, sentenceTree,
+					new String[]{tokens[0]});
+		}
+		if (foundTrees == null || foundTrees.size() < 1)
+			return;
+		extender.toStringBuilder(sb, foundTrees.get(0));
+	}
+
+	private StringBuilder toStringBuilderDTWithVerbNet(StringBuilder sb, DiscourseTree dt) {
+		if (dt.isTerminal()) {
+			if (dt.relationLabel() != null && dt.relationLabel().length()>2) {
+				sb.append(dt.relationLabel());
+				scala.collection.mutable.StringBuilder sbs = new scala.collection.mutable.StringBuilder(100);
+				dt.print(sbs, 0, false, true);
+				String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
+				String textDump;
+				if (text.split(" ").length < 100) // if not too long, use the more informative substitution, including VerbNet
+					textDump = substituteTextWithPOStextVerbNet(text, this.getNodesThicket().get(dt.firstToken().copy$default$1()));
+				else // otherwise just lemma-POS chains
+					textDump = substituteTextWithPOStext(text, this.getNodesThicket().get(dt.firstToken().copy$default$1()));
+				sb.append(textDump);
+			}
+			return sb;
+		} else {
+			sb.append('(');
+			if (dt.relationLabel() != null) {
+				sb.append(dt.relationLabel());
+			}
+			DiscourseTree[] kids = dt.children();
+			if (kids != null) {
+				for (DiscourseTree kid : kids) {
+					sb.append(' ');
+					toStringBuilderDTWithVerbNet(sb, kid);
+				}
+			}
+			return sb.append(')');
+		}
+	}
+
+	// substitutes a lemma-POS pair for each plain lemma;
+	// for verbs, provides more detailed VerbNet information
+	private String substituteTextWithPOStextVerbNet(String text, List<ParseTreeNode> list) {
+		String[] tokens = text.split(" ");
+		// the loop bound keeps subList() within the list
+		for(int offset = 0; offset + tokens.length <= list.size(); offset++){
+			List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
+			int count = 0;
+			boolean bMatch = true; // assume a match until a mismatch is found
+			for(ParseTreeNode n: subList){
+				if (!n.getWord().equals(tokens[count])){
+					bMatch = false;
+					break;
+				} else
+					count++;
+				if (count > 3) // three matching tokens are enough for alignment
+					break;
+			}
+			// alignment found: build the lemma-POS (and VerbNet, for verbs) representation
+			if (bMatch){
+				StringBuilder buf = new StringBuilder();
+				for(ParseTreeNode ch: subList){
+					try {
+						String pos = ch.getPos();
+						if (pos.startsWith(".") || pos.startsWith(",") || pos.startsWith(";") || pos.startsWith("!"))
+							continue;
+						if (pos.startsWith("VB") && ch.getNormalizedWord()!=null){ // more detail for verbs
+							StringBuilder verbRepr = verbBuilder
+									.buildTreeRepresentationForTreeKernelLearning(ch.getNormalizedWord());
+							if (verbRepr!=null)
+								buf.append(" (").append(verbRepr).append(") ");
+							else
+								buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
+						} else { // other than verb
+							buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
+						}
+					} catch (Exception e) {
+						e.printStackTrace();
+					}
+				}
+				return buf.toString().trim();
+			}
+		}
+		return null;
+	}
+
+	public String getDtDump() {
+		return this.dtDump;
+	}
+	public String getDtDumpWithPOS() {
+		return this.dtDumpWithPOS;
+	}
+
+	public String getDtDumpWithEmbeddedTrees() {
+		return this.dtDumpWithEmbeddedTrees;
+	}
+	
+	public String getDtDumpWithVerbNet() {
+		return this.dtDumpWithVerbNet;
+	}
+}
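
The dump strings built above are intended as input for tree kernel (TK) learning. A sketch of how a dump might be serialized for an SVM-light-TK style learner (an assumption about the downstream toolkit; 'thicket' and 'isPositive' are hypothetical names, and |BT| ... |ET| are the tree delimiters that toolkit family expects):

    ParseThicketWithDiscourseTree thicket = /* built by ParseCorefBuilderWithNERandRST */ null;
    boolean isPositive = true;
    String trainingLine = (isPositive ? "+1" : "-1")
            + " |BT| " + thicket.getDtDump() + " |ET|";
    // getDtDumpWithPOS(), getDtDumpWithEmbeddedTrees() and getDtDumpWithVerbNet()
    // can be emitted the same way to compare representations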

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java
new file mode 100644
index 0000000..61e8f13
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.external_rst;
+
+import org.apache.commons.lang.StringUtils;
+
+import opennlp.tools.similarity.apps.utils.Pair;
+
+public class RstNode {
+	Boolean isNucleus;
+	Pair<Integer, Integer> span;
+	Integer leaf;
+	String rel2par;
+	String text;
+	Integer level;
+	
+	public Boolean getIsNucleus() {
+		return isNucleus;
+	}
+	public void setIsNucleus(Boolean isNucleus) {
+		this.isNucleus = isNucleus;
+	}
+	public Pair<Integer, Integer> getSpan() {
+		return span;
+	}
+	public void setSpan(Pair<Integer, Integer> span) {
+		this.span = span;
+	}
+	public Integer getLeaf() {
+		return leaf;
+	}
+	public void setLeaf(Integer leaf) {
+		this.leaf = leaf;
+	}
+	public String getRel2par() {
+		return rel2par;
+	}
+	public void setRel2par(String rel2par) {
+		this.rel2par = rel2par;
+	}
+	public String getText() {
+		return text;
+	}
+	public void setText(String text) {
+		this.text = text;
+	}
+	
+	@Override
+	public String toString() {
+		String ret = "";
+		if (isNucleus!=null && isNucleus)
+			ret+="Nucleus ";
+		if (span!=null)
+			ret+="["+span.getFirst()+" "+ span.getSecond()+"]";
+		ret += " >> "+ rel2par;
+		if (text!=null)
+			ret+= " >> "+text;
+		return ret;
+	}
+	public RstNode(String line) {
+		if (StringUtils.trim(line).startsWith(")"))
+			return; // closing-only lines carry no node information
+
+		level = line.indexOf("(");
+		line = line.substring(line.indexOf("(")+2);
+		
+		isNucleus = line.substring(0, line.indexOf("(")).contains("Nucleus");
+		line = line.substring(line.indexOf("(")+1);
+		if (line.startsWith("span")){
+			line = line.substring(5);
+			try {
+				span = new Pair<Integer, Integer>();
+				String[] spanStr = line.substring(0, line.indexOf(")")).split(" "); 
+				span.setFirst(Integer.parseInt(spanStr[0]));
+				span.setSecond(Integer.parseInt(spanStr[1]));
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+			
+		} else if (line.startsWith("leaf")){
+			try {
+				String leafStr = line.substring(5, line.indexOf(")"));
+				leaf = Integer.parseInt(leafStr);
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+			
+		} else System.err.println("Problem parsing RST results: '" + line + "'");
+		
+		line = line.substring(line.indexOf("rel2par")+8);
+		rel2par = line.substring(0, line.indexOf(")")).trim();
+		
+		text = StringUtils.substringBetween(line, "_!", "_!)");
+	}
+
+	 public static void main(String[] args){
+		 RstNode n1 = new RstNode("        ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) )"),
+		 n2 = new RstNode("       )"),
+		 n3 = new RstNode("          ( Satellite (span 15 16) (rel2par Explanation)");
+		 // print the parsed nodes so this smoke test produces visible output
+		 System.out.println(n1);
+		 System.out.println(n2);
+		 System.out.println(n3);
+	 }
+
+}
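
A sketch of how RstNode consumes .dis-style output lines (the input format follows the examples in main() above; toString() renders whatever fields were recognized):

    String[] disLines = {
            "          ( Satellite (span 15 16) (rel2par Explanation)",
            "        ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) )"
    };
    for (String line : disLines)
        System.out.println(new RstNode(line));
    // prints, e.g., "[15 16] >> Explanation" for the first line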

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java
new file mode 100644
index 0000000..b41cd46
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.kernel_interface;
+
+import java.util.Stack;
+
+public class BracesProcessor {
+	private static final int MIN_BRACES_CNT = 5;
+
+	private static final char L_PAREN    = '(';
+	private static final char R_PAREN    = ')';
+	private static final char L_BRACE    = '{';
+	private static final char R_BRACE    = '}';
+	private static final char L_BRACKET  = '[';
+	private static final char R_BRACKET  = ']';
+	private Stack<Character> stackIncremental = new Stack<Character>();
+	private int count = 0;
+	private Boolean balancedSoFar = true;
+
+	public Boolean getBalancedBracesResult(){
+		if (balancedSoFar)
+			return (stackIncremental.isEmpty() && count> MIN_BRACES_CNT);
+		else 
+			return false;
+	}
+
+	public void analyzeBalancedBracesAddPortionIncremental(String s) {
+		for (int i = 0; i < s.length(); i++) {
+			char c = s.charAt(i);
+			if (c == L_PAREN) {
+				stackIncremental.push(L_PAREN);
+				count++;
+			} else if (c == L_BRACE) {
+				stackIncremental.push(L_BRACE);
+				count++;
+			} else if (c == L_BRACKET) {
+				stackIncremental.push(L_BRACKET);
+				count++;
+			} else if (c == R_PAREN) {
+				// the short-circuit prevents popping an empty stack
+				if (stackIncremental.isEmpty() || stackIncremental.pop() != L_PAREN)
+					balancedSoFar = false;
+			} else if (c == R_BRACE) {
+				if (stackIncremental.isEmpty() || stackIncremental.pop() != L_BRACE)
+					balancedSoFar = false;
+			} else if (c == R_BRACKET) {
+				if (stackIncremental.isEmpty() || stackIncremental.pop() != L_BRACKET)
+					balancedSoFar = false;
+			}
+			// all other characters are ignored
+		}
+	}
+
+	public static boolean isBalanced(String s) {
+		Stack<Character> stack = new Stack<Character>();
+		for (int i = 0; i < s.length(); i++) {
+			char c = s.charAt(i);
+			if (c == L_PAREN) {
+				stack.push(L_PAREN);
+			} else if (c == L_BRACE) {
+				stack.push(L_BRACE);
+			} else if (c == L_BRACKET) {
+				stack.push(L_BRACKET);
+			} else if (c == R_PAREN) {
+				if (stack.isEmpty() || stack.pop() != L_PAREN) return false;
+			} else if (c == R_BRACE) {
+				if (stack.isEmpty() || stack.pop() != L_BRACE) return false;
+			} else if (c == R_BRACKET) {
+				if (stack.isEmpty() || stack.pop() != L_BRACKET) return false;
+			}
+			// all other characters are ignored
+		}
+		return stack.isEmpty();
+	}
+
+	public static boolean checkParentesis(String str)
+	{
+		if (str.isEmpty())
+			return true;
+
+		Stack<Character> stack = new Stack<Character>();
+		for (int i = 0; i < str.length(); i++)
+		{
+			char current = str.charAt(i);
+			if (current == '{' || current == '(' || current == '[')
+			{
+				stack.push(current);
+			}
+
+			if (current == '}' || current == ')' || current == ']')
+			{
+				if (stack.isEmpty())
+					return false;
+
+				char last = stack.peek();
+				// match each closing brace against the most recent opener
+				if ((current == '}' && last == '{')
+						|| (current == ')' && last == '(')
+						|| (current == ']' && last == '['))
+					stack.pop();
+				else
+					return false;
+			}
+
+		}
+
+		return stack.isEmpty();
+	}
+
+	public static boolean isParenthesisMatch(String str) {
+		Stack<Character> stack = new Stack<Character>();
+		for (int i = 0; i < str.length(); i++) {
+			char c = str.charAt(i);
+			if (c == '(' || c == '{') {
+				stack.push(c);
+			} else if (c == ')') {
+				if (stack.empty() || stack.pop() != '(')
+					return false;
+			} else if (c == '}') {
+				if (stack.empty() || stack.pop() != '{')
+					return false;
+			}
+		}
+		return stack.empty();
+	}
+
+
+}
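
A usage sketch for the incremental checker above (chunk contents are illustrative; note that getBalancedBracesResult() requires strictly more than MIN_BRACES_CNT opening braces in addition to balance):

    BracesProcessor checker = new BracesProcessor();
    String[] chunks = { "int f(int[] a) {", "return (a[0]) + (1);", "}" };
    for (String chunk : chunks)
        checker.analyzeBalancedBracesAddPortionIncremental(chunk);
    System.out.println(checker.getBalancedBracesResult()); // true: balanced, with 6 > 5 opening braces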

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/DescriptiveParagraphFromDocExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/DescriptiveParagraphFromDocExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/DescriptiveParagraphFromDocExtractor.java
new file mode 100644
index 0000000..1c07719
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/DescriptiveParagraphFromDocExtractor.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.kernel_interface;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+
+public class DescriptiveParagraphFromDocExtractor {
+	protected static Tika tika = new Tika();
+	private static final int MIN_PARA_LENGTH = 200,
+			MIN_NUM_WORDS = 15,
+			MAX_PARA_LENGTH = 500,
+			TEXT_PORTION_FOR_ANALYSIS = 20000,
+			MAX_PARA_OUTPUT = 20;
+	public static String getFirstParagraphFromFile(File f) {
+
+		String text = "";
+		try {
+			try {
+				text = tika.parseToString(f);
+			} catch (TikaException e) {
+				e.printStackTrace();
+			}
+			if (text.length()>TEXT_PORTION_FOR_ANALYSIS)
+				text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
+			float avgSentSizeThr = (float)MIN_PARA_LENGTH/4f;
+			String[] portions = text.split("\\.\\n");
+			for(String p: portions){
+				float avgSentSize = (float)p.length()/(float)p.split("\\n\\n").length;
+
+				if (p.length()> MIN_PARA_LENGTH && p.split(" ").length>MIN_NUM_WORDS &&
+						avgSentSize > avgSentSizeThr &&  p.length() < MAX_PARA_LENGTH){
+					return normalizePara(p);
+				}
+			}
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		// if no suitable paragraph is found, return the beginning of the text
+		if (text.length()>150)
+			text = text.substring(0, 150);
+		return text;
+	}
+
+	public static List<String> getLongParagraphsFromFile(File f) {
+		List<String> results = new ArrayList<String>();
+		String text = "";
+		try {
+			try {
+				text = tika.parseToString(f);
+			} catch (TikaException e) {
+				e.printStackTrace();
+			}
+			if (text.length()>TEXT_PORTION_FOR_ANALYSIS)
+				text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
+			float avgSentSizeThr = (float)MIN_PARA_LENGTH/4f;
+			String[] portions = text.split("\\.\\n");
+			if (portions.length<2)
+				portions = text.split("\\n\\n");
+			if (portions.length<2)
+				portions = text.split("\\n \\n");
+			if (portions.length<2){
+				// fall back to sentence-level splitting and re-assemble sentences
+				// into portions of roughly MAX_PARA_LENGTH characters
+				String[] sentences = text.split("\\.\\s+");
+				List<String> portionsLst = new ArrayList<String>();
+				int totalChars = 0;
+				String buffer = "";
+				for(String sent: sentences){
+					buffer += sent + ". ";
+					totalChars += sent.length();
+					if (totalChars>MAX_PARA_LENGTH){
+						portionsLst.add(buffer);
+						buffer = "";
+						totalChars = 0;
+					}
+				}
+				if (buffer.length()>0) // do not lose the trailing portion
+					portionsLst.add(buffer);
+				portions = portionsLst.toArray(new String[0]);
+			}
+			for(String p: portions){
+				try {
+					float avgSentSize = (float)p.length()/(float)p.split("\\n\\n").length;
+
+					if (p.length()> MIN_PARA_LENGTH && p.split(" ").length>MIN_NUM_WORDS &&
+							avgSentSize > avgSentSizeThr) {  
+						if (p.length() < MAX_PARA_LENGTH){
+							results.add(normalizePara(p)); 
+						}
+						else { // truncate at the last '.' within the length limit
+							String pReduced = p;
+							if (p.length()>= MAX_PARA_LENGTH+80)
+								pReduced = p.substring(0, MAX_PARA_LENGTH+80);
+							int indexPeriod = pReduced.lastIndexOf('.');
+							if (indexPeriod>-1){
+								pReduced = pReduced.substring(0, indexPeriod);
+							}
+							results.add(normalizePara(pReduced));
+						}
+						if (results.size()>MAX_PARA_OUTPUT)
+							break;
+					}
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+			if (results.size()<1){
+				if (text.length()>= MAX_PARA_LENGTH+80)
+					text = text.substring(0, MAX_PARA_LENGTH+80);
+				results.add(text);
+			}
+
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		if (results.size()<1){
+			System.err.println("Failed to extract text from "+f.getName());
+		}
+
+		return results;
+	}
+
+	private static String normalizePara(String p){
+		p = p.replaceAll("\\n", " ").replaceAll("\\.\\.", " ").replaceAll("  ", " ");
+		p = p.replaceAll("[^A-Za-z0-9 _\\.,\\!]", "");
+		return p;
+	}
+
+	public static void main(String args[]){
+		List<String> results = getLongParagraphsFromFile(new File(
+				"/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/design_doc_posNeg/pos/2IP40 Detail Design Document.pdf"
+				//+ " Online Screening Tool - Delottie.pdf"
+				));
+		System.out.println(results);
+
+		String res = getFirstParagraphFromFile(new File("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
+				+ "design_doc/2004Schalk_BCI2000Implementation.pdf"));
+		System.out.println(res);
+		results = getLongParagraphsFromFile(new File("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
+				+ "design_doc/2004Schalk_BCI2000Implementation.pdf"));
+		System.out.println(results);
+
+	}
+}
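
A usage sketch for the extractor above (the file path is hypothetical; Apache Tika must be on the classpath to parse PDF/Office formats):

    import java.io.File;
    import java.util.List;

    File doc = new File("design_doc.pdf"); // hypothetical input document
    List<String> paragraphs =
            DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(doc);
    for (String para : paragraphs)
        System.out.println(para);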