You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2016/11/16 09:11:41 UTC

[45/51] [partial] opennlp-sandbox git commit: merge from bgalitsky's own git repo

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
new file mode 100644
index 0000000..b766c7c
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class NamedEntityExtractor {
+	protected static Matcher matcher;
+	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
+	protected ArrayList<File> queue = new ArrayList<File>();
+	protected static PT2ThicketPhraseBuilder phraseBuilder;
+	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
+	String resourceDirSentimentList = null;
+	Set<String> sentimentVcb = new HashSet<String> ();
+
+	static {
+		synchronized (NamedEntityExtractor.class) {
+			matcher = new Matcher();
+			phraseBuilder = new PT2ThicketPhraseBuilder();
+		}
+	}
+
+	public NamedEntityExtractor(){
+		try {
+			resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		List<String[]> sentimentList=null;
+		sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
+		for(String[] line: sentimentList){
+			sentimentVcb.add(line[0]);
+		}
+	}
+
+	protected boolean isSentimentWord(String word){
+		if (sentimentVcb.contains(word))
+			return true;
+		else
+			return false;		
+	}
+
+	public EntityExtractionResult extractEntities(String para){
+		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
+		List<String> extractedNERsWords = new ArrayList<String>();
+		List<List<ParseTreeNode>> extractedSentimentPhrases = 
+				new ArrayList<List<ParseTreeNode>>();
+		EntityExtractionResult result = new EntityExtractionResult();
+
+		ParseThicket pt = null;
+
+		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
+		pt = matcher.buildParseThicketFromTextWithRST(para);
+		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+		for(List<ParseTreeNode> sentence: nodeList){
+			//System.out.println("   Processing sentence: "+ sentence);
+			boolean bInsideNER = false; 
+			String currentPhrase = "";
+			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
+			for(ParseTreeNode word: sentence){
+				if (isNERforPhraseExtraction(word)){
+					//System.out.println("++Found word ="+word + " | NER="+ word.getNe());
+					if (bInsideNER){
+						currentPhrase += " "+word.getWord();
+						currentPhraseNode.add(word);
+					} else {
+						bInsideNER=true;
+						currentPhrase = word.getWord();
+						currentPhraseNode.add(word);
+					}
+				} else {
+					if (bInsideNER){
+						if (currentPhrase.indexOf(' ')>-1) // at least two tokens
+							extractedNERsWords.add(currentPhrase);
+							extractedNERs.add(currentPhraseNode);
+						currentPhrase = "";
+						bInsideNER=false;
+					} else {
+						// do nothing, continue scan
+					}
+				}
+			}
+			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
+				extractedNERs.add(currentPhraseNode);
+				extractedNERsWords.add(currentPhrase);
+			}
+
+			Set<String> foundSentimentWords = new HashSet<String>();
+			// now we extract phrases
+			List<List<ParseTreeNode>> phrases = pt.getPhrases();
+			for(List<ParseTreeNode> phrase: phrases){
+				// find a noun phrase under sentiment
+				try {
+					for(int i = phrase.size()-1; i>-1; i--){
+						ParseTreeNode word = phrase.get(i);
+						if ((isSentimentWord(word.getWord()) ||
+								sVocab.isSentimentWord(word.getWord()) && !foundSentimentWords.contains(word.getWord()) )){
+							foundSentimentWords.add(word.getWord());
+							System.out.println("Sentim = " + word.getWord() + " | Found opinionated phrase "+phrase.toString());
+							if (phrase.size()>1 && phrase.size()<7)
+								extractedSentimentPhrases.add(phrase);			
+							break;
+						}
+					}
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+
+		} 
+		
+		extractedSentimentPhrases = reduceExtractedPhrases(extractedSentimentPhrases);
+		
+		result.setExtractedNER(extractedNERs);
+		result.setExtractedNERWords(extractedNERsWords);
+		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+		return result;
+	}
+
+	private List<List<ParseTreeNode>> reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
+	    List<Integer> idsToDelete = new ArrayList<Integer>();
+		for(int i = 0; i<extractedSentimentPhrases.size(); i++){
+			for(int j = i+1; j<extractedSentimentPhrases.size(); j++){
+				String phrStr1 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(i));
+				String phrStr2 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(j));
+				if (phrStr1 .indexOf(phrStr2 )>-1)
+					idsToDelete.add(j);
+			}
+		}
+		List<List<ParseTreeNode>> resultPhrases = new ArrayList<List<ParseTreeNode>>();
+		for(int i = 0; i<extractedSentimentPhrases.size(); i++){
+			if (!idsToDelete.contains(i))
+				resultPhrases .add(extractedSentimentPhrases.get(i));
+		}
+	    return resultPhrases ;
+    }
+
+	private boolean isNERforPhraseExtraction(ParseTreeNode word){
+		if ((word.getNe().equals("ORGANIZATION") ||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
+				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")  ))
+			return true;
+
+		return false;
+
+	}
+
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
new file mode 100644
index 0000000..cb04154
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
@@ -0,0 +1,96 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+
+public class PersonExtractor extends NamedEntityExtractor {
+	private boolean isNERforPhraseExtraction(ParseTreeNode word){
+		if ((word.getNe().equals("PERSON") ) &&
+				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")  ))
+			return true;
+
+		return false;
+
+	}
+	
+	public EntityExtractionResult extractEntities(String para){
+		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
+		List<String> extractedNERsWords = new ArrayList<String>();
+		List<List<ParseTreeNode>> extractedSentimentPhrases = 
+				new ArrayList<List<ParseTreeNode>>();
+		EntityExtractionResult result = new EntityExtractionResult();
+
+		ParseThicket pt = null;
+
+		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
+		pt = matcher.buildParseThicketFromTextWithRST(para);
+		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+		for(List<ParseTreeNode> sentence: nodeList){
+			System.out.println("   Processing sentence: "+ sentence);
+			boolean bInsideNER = false; 
+			String currentPhrase = "";
+			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
+			for(ParseTreeNode word: sentence){
+				if (isNERforPhraseExtraction(word)){
+					System.out.println("++Found word ="+word + " | NER="+ word.getNe());
+					if (bInsideNER){
+						currentPhrase += " "+word.getWord();
+						currentPhraseNode.add(word);
+					} else {
+						bInsideNER=true;
+						currentPhrase = word.getWord();
+						currentPhraseNode.add(word);
+					}
+				} else {
+					if (bInsideNER){
+						if (currentPhrase.indexOf(' ')>-1) // at least two tokens
+							extractedNERsWords.add(currentPhrase);
+							extractedNERs.add(currentPhraseNode);
+						currentPhrase = "";
+						bInsideNER=false;
+					} else {
+						// do nothing, continue scan
+					}
+				}
+			}
+			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
+				extractedNERs.add(currentPhraseNode);
+				extractedNERsWords.add(currentPhrase);
+			}
+
+			Set<String> foundSentimentWords = new HashSet<String>();
+			// now we extract phrases
+			List<List<ParseTreeNode>> phrases = phraseBuilder.buildPT2ptPhrases(pt);
+			for(List<ParseTreeNode> phrase: phrases){
+				// find a noun phrase under sentiment
+				try {
+					for(int i = phrase.size()-1; i>-1; i--){
+						ParseTreeNode word = phrase.get(i);
+						if ((isSentimentWord(word.getWord()) ||
+								sVocab.isSentimentWord(word.getWord()) && !foundSentimentWords.contains(word.getWord()) )){
+							foundSentimentWords.add(word.getWord());
+							System.out.println("Found opinionated phrase "+phrase.toString());
+							extractedSentimentPhrases.add(phrase);			
+							break;
+						}
+					}
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+
+		} 
+		result.setExtractedNER(extractedNERs);
+		result.setExtractedNERWords(extractedNERsWords);
+		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+		return result;
+	}
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
new file mode 100644
index 0000000..86cd2dc
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class SentencePhraseGivenAWordGetter {
+	protected static Matcher matcher;
+	protected ArrayList<File> queue = new ArrayList<File>();
+	protected static PT2ThicketPhraseBuilder phraseBuilder;
+
+
+	static {
+		synchronized (SentencePhraseGivenAWordGetter.class) {
+			matcher = new Matcher();
+			phraseBuilder = new PT2ThicketPhraseBuilder();
+		}
+	}
+
+	public SentencePhraseGivenAWordGetter(){
+	}
+
+	public EntityExtractionResult extractEntities(String para, String keyword){
+		List<List<ParseTreeNode>> extractedPhrases = new ArrayList<List<ParseTreeNode>>();
+
+		EntityExtractionResult result = new EntityExtractionResult();
+
+		ParseThicket pt =  matcher.buildParseThicketFromTextWithRST(para);
+
+		List<List<ParseTreeNode>> phrases = pt.getPhrases();
+		for(List<ParseTreeNode> phrase: phrases){
+			// find a noun phrase under sentiment
+			try {
+				for(int i = 0; i<phrase.size(); i++){
+					ParseTreeNode word = phrase.get(i);
+					if (word.getWord().toLowerCase().equals(keyword.toLowerCase())){
+						extractedPhrases.add(phrase);		
+						break;
+					}
+				}
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		}
+
+		result.setExtractedSentimentPhrases(extractedPhrases);
+		return result;
+	}
+
+
+	public static void main(String[] args){
+		SentencePhraseGivenAWordGetter self = new SentencePhraseGivenAWordGetter();
+		EntityExtractionResult result = self.extractEntities("However i put a foam panel inside the main case if i do not have my headphones or an iPad to brace the mac book", 
+				"panel");
+		System.out.println(result.getExtractedSentimentPhrases());
+	}
+}
+
+
+/*
+ 3 phrases are given as a result
+ * 
+[[<2>SBAR'i':FW, <3>SBAR'put':VBD, <4>SBAR'a':DT, <5>SBAR'foam':NN, <6>SBAR'panel':NN, <7>SBAR'inside':IN, <8>SBAR'the':DT, <9>SBAR'main':JJ, <10>SBAR'case':NN, <11>SBAR'if':IN, <12>SBAR'i':FW, 
+<13>SBAR'do':VBP, <14>SBAR'not':RB, <15>SBAR'have':VB, <16>SBAR'my':PRP$, <17>SBAR'headphones':NNS, <18>SBAR'or':CC, <19>SBAR'an':DT, <20>SBAR'iPad':NN, <21>SBAR'to':TO, 
+<22>SBAR'brace':VB, <23>SBAR'the':DT, <24>SBAR'mac':NN, <25>SBAR'book':NN], 
+
+[<3>VP'put':VBD, <4>VP'a':DT, <5>VP'foam':NN, <6>VP'panel':NN, <7>VP'inside':IN, <8>VP'the':DT, <9>VP'main':JJ, <10>VP'case':NN, <11>VP'if':IN, <12>VP'i':FW, <13>VP'do':VBP, 
+<14>VP'not':RB, <15>VP'have':VB, <16>VP'my':PRP$, <17>VP'headphones':NNS, <18>VP'or':CC, <19>VP'an':DT, <20>VP'iPad':NN, <21>VP'to':TO, <22>VP'brace':VB, <23>VP'the':DT, 
+<24>VP'mac':NN, <25>VP'book':NN], 
+
+[<4>NP'a':DT, <5>NP'foam':NN, <6>NP'panel':NN]]
+
+*/

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
new file mode 100644
index 0000000..1efe428
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
@@ -0,0 +1,41 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import edu.stanford.nlp.ling.CoreAnnotation;
+
+import edu.stanford.nlp.trees.Tree;
+
+/**
+ * Annotations specific to the Sentiment project.  In case there are
+ * other projects that use the same RNN machinery, including the RNN
+ * core annotations, this lets a sentence have a tree attached where
+ * that tree specifically has the sentiment annotations.
+ *
+ * @author John Bauer
+ */
+public class SentimentCoreAnnotations {
+
+  /**
+   * A tree which contains the annotations used for the Sentiment
+   * task.  After forwardPropagate has been called, the Tree will have
+   * prediction, etc. attached to it.
+   */
+  public static class SentimentAnnotatedTree implements CoreAnnotation<Tree> {
+    @Override
+    public Class<Tree> getType() {
+      return Tree.class;
+    }
+  }
+
+
+  /**
+   * The final label given for a sentence.  Set by the
+   * SentimentAnnotator and used by various forms of text output.
+   */
+  public static class SentimentClass implements CoreAnnotation<String> {
+    @Override
+    public Class<String> getType() {
+      return String.class;
+    }
+  }
+}
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
new file mode 100755
index 0000000..ad0f791
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.stemmer.PStemmer;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+
/**
 * Singleton holding named stop-word lists loaded from "*.vcb" files in
 * {@code resourceDir/maps}. Each file's first non-empty line is the list
 * name; every following line is an entry. Also offers heuristics for
 * filtering "likes" strings built on those lists.
 */
public class StopList {
    private static StopList m_StopList = null;
    // Map from list name (first line of each .vcb file) to its word set.
    private static Hashtable<String, HashSet<String>> m_stopHash = new Hashtable<String, HashSet<String>>();
    public static final Log logger = LogFactory.getLog(StopList.class);
    private static final String DEFAULT_STOPLIST = "STANDARD";
    // Base directory for resources; must be set (via getInstance(dir)) before loading.
    public static String resourceDir =null;
    private static PStemmer stemmer = new PStemmer();

    static {
        // NOTE(review): this runs at class load, before any caller can set
        // resourceDir, so it reads "null/maps" and only logs a warning;
        // getInstance() then loads again. Confirm this double-load is intended.
        synchronized (StopList.class) {
            try {
                LoadStopList();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    /**
     * Get the StopList singleton instance.
     * 
     * @return The StopList
     */
    static public synchronized StopList getInstance() {

        if (m_StopList == null) {
            m_StopList = new StopList();

            try {
                m_StopList.LoadStopList();
            } catch (Exception e) {
                // best-effort: an unreadable resource dir leaves the lists empty
            }
        }
        return m_StopList;
    }

    /**
     * Get the StopList singleton, first pointing the loader at the given
     * resource directory. Note the directory only takes effect on the first
     * call (subsequent calls return the already-built singleton).
     */
    static public synchronized StopList getInstance(String dir) {
        resourceDir = dir;
        if (m_StopList == null) {
            m_StopList = new StopList();

            try {
                m_StopList.LoadStopList();
            } catch (Exception e) {
                // best-effort: an unreadable resource dir leaves the lists empty
            }
        }
        return m_StopList;
    }

    // Scans resourceDir/maps for *.vcb files and loads each into m_stopHash.
    private static void LoadStopList() throws IOException {

        File dir = new File(resourceDir + "/maps");
        String[] children = dir.list();
        if (children == null) {
            System.err.println("Problem reading Stop Lists!");
        } else {
            for (int i = 0; i < children.length; i++) {
                String fn = children[i];
                if (fn.endsWith(".vcb")) {
                    String fileName = resourceDir + "/maps/" + fn;
                    File f = new File(fileName);
                    loadStopListFile(f);
                }
            }
        }
    }

    // Loads one .vcb file: first non-empty line names the list, the rest are entries.
    private static void loadStopListFile(File f) throws FileNotFoundException {

        FileReader fileReader = new FileReader(f);
        BufferedReader in = new BufferedReader(fileReader);

        String str = new String();
        boolean fLine = true; // true until the list-name line has been seen
        HashSet<String> t = new HashSet<String>();
        String listName = "";

        try {
            while ((str = in.readLine()) != null) {
                if (fLine && str.length() > 0) {
                    fLine = false;
                    listName = str;
                } else {
                    t.add(str);
                }
            }
        } catch (IOException ioe) {
            // partial reads are tolerated; whatever was read so far is kept
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
                if (fileReader != null) {
                    fileReader.close();
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }

        if (listName.length() > 0) {
            // A reload replaces the whole set for this list name.
            HashSet<String> l = m_stopHash.get(listName);
            if (l != null) {
                synchronized (l) {
                    m_stopHash.put(listName, t);
                }
            } else {
                m_stopHash.put(listName, t);
            }
        }
    }

    /**
     * Is the given word in the stop words list? Uses the defaut "STANDARD"
     * stoplist
     * 
     * @param str
     *            The word to check
     * @return is a stop word
     */
    public static boolean isStopWord(String str) {
        boolean retVal = false;
        if (m_stopHash.containsKey(DEFAULT_STOPLIST))
            retVal = m_stopHash.get(DEFAULT_STOPLIST).contains(str);
        return retVal;
    }

    /** Checks the FIRST_NAMES list; entries are stored upper-cased. */
    public static boolean isFirstName(String str) {
        boolean retVal = false;
        if (m_stopHash.containsKey("FIRST_NAMES"))
            retVal = m_stopHash.get("FIRST_NAMES").contains(str.toUpperCase());
        return retVal;
    }

    /**
     * Returns a uniformly random entry of the FIRST_NAMES list, lower-cased.
     * NOTE(review): throws NPE if FIRST_NAMES was never loaded — confirm
     * callers only use this after a successful load.
     */
    public String getRandomFirstName() {
        HashSet<String> firstNames = m_stopHash.get("FIRST_NAMES");
        int indexRand = (int) (Math.random() * new Float(firstNames.size()));
        Iterator iter = firstNames.iterator();
        for (int i = 0; i < indexRand; i++) {
            iter.next();
        }
        return ((String) iter.next()).toLowerCase();
    }

    /**
     * True when the stemmed, lower-cased word occurs in the ENG_DICT list.
     * A null word is treated as common (returns true).
     */
    public static boolean isCommonWord(String str) {
        if (str == null)
            return true;
        String stemmed="";
		try {
			stemmed = stemmer.stem(str).toLowerCase();
		} catch (Exception e) {
			// stemming exceptions are not informative, just ignore this word
			//e.printStackTrace();
		}

        boolean retVal = false;
        if (m_stopHash.containsKey("ENG_DICT"))
            retVal = m_stopHash.get("ENG_DICT").contains(stemmed);
        return retVal;
    }

    /**
     * True when the lower-cased word occurs in the fREQUENTEVENTNAMEWORDS
     * list (key spelled as in the resource file). Null is treated as common.
     */
    public boolean isCommonEventWord(String str) {
        if (str == null)
            return true;
        boolean retVal = false;

        try {
            String stemmed = str.toLowerCase();

            if (m_stopHash.containsKey("fREQUENTEVENTNAMEWORDS"))
                retVal = m_stopHash.get("fREQUENTEVENTNAMEWORDS").contains(
                        stemmed);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return retVal;
    }

    /**
     * Is the given word in the stop words list provided?
     * 
     * @param str
     *            The word to check
     * @param stop_list
     *            the name of the stoplist to check against
     * @return is a stop word
     */
    public static boolean isStopWord(String str, String stop_list) {
        boolean retVal = false;
        if (m_stopHash.containsKey(stop_list))
            retVal = m_stopHash.get(stop_list).contains(str);
        return retVal;
    }

    /** Instance-style alias for {@link #isStopWord(String)}. */
    public boolean isStopWordAll(String str) {
        return isStopWord(str);
    }

    /** Returns the raw word set for a named list (null if not loaded). */
    public HashSet<String> getStopListMap(String name) {
        return m_stopHash.get(name);
    }

    /**
     * Splits user "likes" into kept phrases and potential categories using
     * the loaded lists. Returns a two-element list: [0] = kept likes
     * (stop-words stripped, de-duplicated), [1] = potential categories.
     * Inputs of fewer than 6 likes are returned unfiltered.
     */
    public static List<List<String>> preFilterCommonEnglishExpressions(
            List<String> userLikes) {
        List<List<String>> results = new ArrayList<List<String>>();

        List<String> resultUserLikes = new ArrayList<String>(), potentialCategs = new ArrayList<String>();
        if (userLikes.size() < 6) {// too short, do not filter
            results.add(userLikes);
            results.add(potentialCategs);
            return results;

        }

        for (String like : userLikes) {
            like = like.toLowerCase();
            if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
                logger.info("removed isAlphanumeric " + like);
                continue;
            }

            if (StringUtils.isNumeric(like)) {
                logger.info("removed isNumericSpace " + like);
                continue;
            }

            if (like.length() < 4) {
                logger.info("removed too short likes " + like);
                continue;
            }
            // bStop is computed but unused below — the rebuilt phrase in buf
            // already excludes stop words.
            boolean existFirstName = false, allWordsCommonEnglish = true, bStop = false;
            String[] comps = like.split(" ");
            StringBuffer buf = new StringBuffer();
            for (String word : comps) {
                boolean isCommon = isCommonWord(word);
                boolean isName = isFirstName(word);
                if (!isCommon)
                    allWordsCommonEnglish = false;
                if (isName)
                    existFirstName = true;
                if (isStopWord(word) || word.length() < 3)
                    bStop = true;
                else
                    buf.append(word + " ");
            } // / does not have to include stop word
            if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
                logger.info("moved to category:  NoFirstName+AllCommonEng+ShorterThan3 "
                        + like);

                continue;
            }
            // NOTE(review): unreachable — comps.length == 1 implies the
            // "< 3" branch above already fired. Confirm intended ordering.
            if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
                logger.info("moved to category: NoFirstName+AllCommonEng+Short1word "
                        + like);
                potentialCategs.add(like);
                continue;
            }

            if (existFirstName && comps.length == 1) {
                logger.info("removed : only first name, no last name " + like);

                continue;
            }

            resultUserLikes.add(buf.toString().trim());

        }

        // De-duplicate while discarding order.
        resultUserLikes = new ArrayList<String>(new HashSet<String>(
                resultUserLikes));
        if (resultUserLikes.size() > 1) {
            results.add(resultUserLikes);
            results.add(potentialCategs);
            return results;
        }

        else {// do not do reduction
            results.add(userLikes);
            results.add(potentialCategs);
            return results;
        }
    }

    /**
     * Applies the same heuristics as preFilterCommonEnglishExpressions to a
     * single like, returning whether it should be kept.
     */
    public static boolean isAcceptableIndividualLikes(String like) {
        StopList finder = StopList.getInstance();
        like = like.toLowerCase();
        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
            logger.info("removed isAlphanumeric " + like);
            return false;
        }

        if (StringUtils.isNumeric(like)) {
            logger.info("removed isNumericSpace " + like);
            return false;
        }

        if (like.length() < 4) {
            logger.info("removed too short likes " + like);
            return false;
        }
        // bStop and buf are computed but unused in this boolean variant.
        boolean existFirstName = false, allWordsCommonEnglish = true, bStop = false;
        String[] comps = like.split(" ");
        StringBuffer buf = new StringBuffer();
        for (String word : comps) {
            boolean isCommon = finder.isCommonWord(word);
            boolean isName = finder.isFirstName(word);
            if (!isCommon)
                allWordsCommonEnglish = false;
            if (isName)
                existFirstName = true;
            if (finder.isStopWord(word) || word.length() < 3)
                bStop = true;
            else
                buf.append(word + " ");
        } // / does not have to include stop word
        if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
            logger.info("  NoFirstName+AllCommonEng+ShorterThan3 " + like);

            return false;
        }
        if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
            logger.info(" NoFirstName+AllCommonEng+Short1word " + like);

            return false;
        }

        if (existFirstName && comps.length == 1) {
            logger.info("removed : only first name, no last name " + like);

            return false;
        }

        return true;
    }

    // Ad-hoc manual exercise of the lists against a developer-local resource path.
    @SuppressWarnings("all")
    public static void main(String[] args) {

        StopList list = StopList
                .getInstance("/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/");
        Boolean b = list.isCommonWord("demonstration");

        String fname = list.getRandomFirstName();

        b = list.isCommonEventWord("tour");
        b = list.isCommonEventWord("dance");
        b = list.isCommonEventWord("salsa");
        b = list.isCommonEventWord("center");
        b = list.isCommonEventWord("family");

      

        b = isAcceptableIndividualLikes("forest glen");
        b = isAcceptableIndividualLikes("drive");
        b = isAcceptableIndividualLikes("house");
        b = isAcceptableIndividualLikes("Timothy Kloug");
        b = isAcceptableIndividualLikes("Mamma Mia");

    }
}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
new file mode 100644
index 0000000..f4d56aa
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
@@ -0,0 +1,117 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class TopicAsOpinionMinerRunner {
+	private List<File> queue;
+	private final static String reviewSource = "/Users/bgalitsky/Documents/solr/example/exampledocs/publication_page0.json";
+	NamedEntityExtractor neExtractor = new NamedEntityExtractor();
+	Set<String> allPhrases = new HashSet<String>();
+	
+	public void processJSONfileWithReviews(){
+		List<String[]> report = new ArrayList<String[]>();
+		report.add(new String[] { "text", "phrases of potential interest list" , });
+
+		
+		String content=null;
+		try {
+			content = FileUtils.readFileToString(new File(reviewSource));
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		String[] texts = StringUtils.substringsBetween(content, "summary\":\"", "\"");
+		for(String text: texts){
+			report.clear();
+			EntityExtractionResult result = neExtractor.extractEntities(text);
+			//report.add(new String[]{text});
+			allPhrases.addAll(result.extractedNERWords);
+			allPhrases = new HashSet<String>(allPhrases);
+			for(String p: allPhrases){
+				report.add(new String[]{p});
+			}
+			/*
+			String[] phrases = (String[])result.extractedNERWords.toArray(new String[0]);
+			if (phrases!=null && phrases.length>0)
+				report.add(phrases);
+			*/
+			/*report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
+			List<String> stringPhrases = new ArrayList<String>(),
+					nodePhrases = new ArrayList<String>();
+			for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){
+				String buf = "", nodeBuf="";
+				for(ParseTreeNode ch: chList){
+					buf+=ch.getWord()+ " ";
+					nodeBuf+=ch.toString()+ " ";
+				}
+				stringPhrases.add(buf.trim());
+				nodePhrases.add(nodeBuf.trim());
+			}
+			report.add((String[])stringPhrases.toArray(new String[0]));
+			report.add((String[])nodePhrases.toArray(new String[0]));
+			*/
+			
+			ProfileReaderWriter.writeReport(report, "phrasesExtracted3.csv");
+		}
+	}
+
+	private void addFiles(File file) {
+
+		if (!file.exists()) {
+			System.out.println(file + " does not exist.");
+
+			if (file.isDirectory()) {
+				for (File f : file.listFiles()) {
+					if (f.getName().startsWith("."))
+						continue;
+					addFiles(f);
+					System.out.println(f.getName());
+				}
+			} else {
+				queue.add(file);
+
+			}
+		}
+	}
+	
+	public static void main(String[] args){
+		TopicAsOpinionMinerRunner runner = new TopicAsOpinionMinerRunner();
+		runner.processJSONfileWithReviews();
+
+	}
+}
+
+/*
+	public void processDirectory(String path){
+		List<String[]> report = new ArrayList<String[]>();
+		report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" });
+
+		List<String> allNamedEntities = new ArrayList<String>();
+
+		addFiles(new File(path));
+		for(File f: queue){
+			List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst();
+			List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond();
+			report.add(new String[]{ f.getName(), entities.toString(),  opinions.toString()});	
+			ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv");
+
+			allNamedEntities.addAll(entities);
+
+			allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities ));
+
+
+		}
+		ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv");
+	} 
+} */

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
new file mode 100644
index 0000000..a704f22
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.Matcher;
+
+public class TopicPhraseExtractor {
+	Matcher matcher = new Matcher();
+
+	// sentiment vocabulary for phrase under the focus of sentiment
+	SentimentVocab sVocab = SentimentVocab.getInstance();
+	//This is used to create an XML with phrases. The same class for acro  & phrases
+
+	public EntityExtractionResult extractEntities(String para){
+		EntityExtractionResult result = new EntityExtractionResult();
+		List<String> extractedNerPhrasesStr = new ArrayList<String>(), 
+				extractedNerExactStr = new ArrayList<String>(),
+				extractedSentimentPhrasesStr = 
+				new ArrayList<String>(), extractedNONSentimentPhrasesStr = 
+				new ArrayList<String>(), extractedNerPhraseTags = new ArrayList<String>();
+		// no need to change to extract more/less phrases
+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
+
+		List<List<ParseTreeNode>> extractedSentimentPhrases = new ArrayList<List<ParseTreeNode>>(), 
+				extractedNONSentimentPhrases = new ArrayList<List<ParseTreeNode>>(),
+				extractedNerPhrases = new ArrayList<List<ParseTreeNode>>(),
+						extractedNerExactPhrases= new ArrayList<List<ParseTreeNode>>();
+		//TODO document examples / cases for each rule
+		// now we extract phrases
+		List<List<ParseTreeNode>> phrases = pt.getPhrases();
+		List<Float> sentimentProfile = pt.getSentimentProfile();
+		for(List<ParseTreeNode> phrase: phrases){
+
+			// find a noun phrase under sentiment
+			boolean bAccept = true, bNER = false;
+
+			String phraseStr = asString(phrase);
+
+
+			if (!phrase.get(0).getPhraseType().equals("NP") && !phrase.get(0).getPhraseType().equals("VP") )	
+				bAccept = false;
+
+			boolean bSentiment = false;
+			for(ParseTreeNode word: phrase){
+				if (sVocab.isSentimentWord(word.getWord())){
+					bSentiment=true;
+					break;
+				}
+			}
+
+			String nerTagConfirmed = null;
+			for(ParseTreeNode word: phrase){
+				// no Named Entity
+				String nerTag = isNERforPhraseExtraction(word);
+				if (nerTag!=null){
+					bNER = true;
+					nerTagConfirmed = nerTag;
+				}
+
+				// no numbers nor prepositions
+				if (word.getPos().startsWith("CD") || word.getPos().indexOf("PRP")>-1 )
+					bAccept = false;
+			}
+			if (!bAccept)
+				continue;
+			// was 7 -> 2
+			if (phrase.size()>7 || phrase.size()<2)
+				bAccept = false;
+
+			if (phrase.get(0).getPos().equals("DT") && phrase.size()<3)
+				bAccept = false;
+			if (!bAccept)
+				continue;
+
+			String cleanedPhraseStr = cleanPhraseString(phraseStr);
+			if (cleanedPhraseStr==null)
+				bAccept = false;
+
+			if (bAccept){
+				if (bNER){
+					extractedNerPhrases.add(phrase);
+					extractedNerPhrasesStr.add(phraseStr);
+					extractedNerPhraseTags.add(nerTagConfirmed );
+					// forming exact NER
+					List<ParseTreeNode> phraseNER_exact = new ArrayList<ParseTreeNode>();
+					String nerExactStr = "";
+					for(ParseTreeNode word: phrase){
+						String ner = isNERforPhraseExtraction(word);
+						if (ner!=null && ner.equals(nerTagConfirmed)){
+							phraseNER_exact.add(word);
+							nerExactStr+=" "+word.getWord();
+						}
+					}
+					nerExactStr.trim();
+					extractedNerExactPhrases.add(phraseNER_exact);
+					extractedNerExactStr.add(nerExactStr);
+				}
+				else if (bSentiment) {
+					extractedSentimentPhrasesStr.add(cleanedPhraseStr);					
+					extractedSentimentPhrases.add(phrase);
+				} else {
+					extractedNONSentimentPhrasesStr.add(cleanedPhraseStr);					
+					extractedNONSentimentPhrases.add(phrase);
+				}
+			}
+		} 
+
+		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+		result.setExtractedSentimentPhrasesStr(extractedSentimentPhrasesStr);
+
+		result.setExtractedNONSentimentPhrases(extractedNONSentimentPhrases);
+		result.setExtractedNONSentimentPhrasesStr(extractedNONSentimentPhrasesStr);
+		
+		result.setExtractedNerPhrases(extractedNerPhrases);
+		result.setExtractedNerPhrasesStr(extractedNerPhrasesStr);
+		result.setExtractedNerPhraseTags(extractedNerPhraseTags);
+		
+		result.setExtractedNerExactPhrases(extractedNerExactPhrases);
+		result.setExtractedNerExactStr(extractedNerExactStr);
+
+		result.setSentimentProfile(sentimentProfile );
+
+		return result;
+	}
+
+
+
+
+
+
+	private String cleanPhraseString(String phraseStr) {
+		String p = phraseStr.toLowerCase();
+
+		if (p.startsWith("*") || p.startsWith("&") || p.startsWith("$"))
+			return null;
+
+		if (p.startsWith("this ") || p.startsWith("other "))
+			return null;
+
+		if (p.startsWith("a "))
+			p = p.substring(2, p.length());
+		if (p.startsWith("the "))
+			p = p.substring(4, p.length());
+		if (p.startsWith(", "))
+			p = p.substring(2, p.length());
+
+		return p;
+	}
+
+	private String asString(List<ParseTreeNode> phrase) {
+		String buf = "";
+		for(ParseTreeNode p: phrase)
+			buf+=p.getWord()+" ";
+		return buf.trim();
+	}
+
+	private String isNERforPhraseExtraction(ParseTreeNode word){
+		if (word.getNe() == null)
+			return null;
+		
+
+		if (!(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+				word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")))
+			return null;
+				
+
+		if (word.getNe().equals("ORGANIZATION"))
+				return "ORGANIZATION";
+		if(word.getNe().equals("LOCATION"))
+			return "LOCATION";
+					
+		if(word.getNe().equals("PERSON") ) 
+			return "PERSON";
+		
+		if(word.getNe().equals("MONEY") ) 
+			return "MONEY";
+		if(word.getNe().equals("DATE") ) 
+			return "DATE";
+		if(word.getNe().equals("TIME") ) 
+			return "TIME";
+
+		return null;
+
+	}
+}
+
+/*
+ * Naïve sentiment prediction systems work just by looking at words in isolation, giving positive points for positive words and negative points for negative words and then summing up these points. That way, the order of words is ignored and important information is lost. The deep learning model of (Socher et al 2013) builds a representation of whole sentences based on the sentence structure. It computes the sentiment based on how words compose the meaning of longer phrases. However, in most applications just taking individual sentences into account does not give accurate results, and rhetoric information needs to be taken into account to determine the overall sentiment of a paragraph and then back to the individual sentence level.
+ */
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
new file mode 100644
index 0000000..6de3180
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import au.com.bytecode.opencsv.CSVWriter;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class TwitterEngineRunner {
+	private List<File> queue;
+	private final static String twSource = "/Users/bgalitsky/Documents/workspace/TwitterMiner/data/TwitterArtistsDynamicsTot12_07.csv";
+	TwitterFilter neExtractor = new TwitterFilter();
+	private static int iWind = 80;
+
+	public void processTweetFile(int nRun){
+		List<String[]> report = new ArrayList<String[]>(), ful_less =  new ArrayList<String[]>();
+		List<String> meaningLESS = new ArrayList<String>(), meaningFUL = new ArrayList<String>();
+		report.add(new String[] { "text", "phrases of potential interest list" , });
+
+		List<String[]> texts = ProfileReaderWriter.readProfiles(twSource);
+		int offset = iWind*nRun;
+		
+		//for(int i=offset; i< offset+iWind; i++){
+			
+		//	String[] text = texts.get(i);
+		for(String[] text: texts){
+			List<String> textDeduped = new ArrayList<String>(new HashSet<String>(Arrays.asList(text)));
+			EntityExtractionResult result = null;
+			if (text==null || text.length<4)
+				continue;
+
+			for(int nInLine=3; nInLine<textDeduped.size(); nInLine++){
+				if (textDeduped.get(nInLine).length()>180)
+					continue;
+				
+				String cleanedTweet = textDeduped.get(nInLine).replace("/\\bs\\@+/ig","");
+				try {
+					result = neExtractor.extractEntities(cleanedTweet);
+				} catch (Exception e) {
+					e.printStackTrace();
+					continue;
+				}
+				report.add(new String[]{text[0],text[nInLine]});
+				report.add((String[])result.extractedNERWords.toArray(new String[0]));
+				//report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
+				List<String> stringPhrases = new ArrayList<String>(),
+						nodePhrases = new ArrayList<String>();
+				Boolean bMeaningf = false;
+
+				//stringPhrases.add(""); nodePhrases.add(""); // to make report more readable
+				for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){
+					String buf = "", nodeBuf="";
+					for(ParseTreeNode ch: chList){
+						buf+=ch.getWord()+ " ";
+						nodeBuf+=ch.toString()+ " ";
+					}
+					stringPhrases.add(buf.trim());
+					nodePhrases.add(nodeBuf.trim());
+				}
+				// selecting MEANINGFULL
+				if (nodePhrases.size()>1){
+					if ((nodePhrases.get(0).indexOf(">VP'")>-1 || nodePhrases.get(0).indexOf(">NNP'")>-1) &&
+							(nodePhrases.get(1).indexOf(">VP'")>-1 || nodePhrases.get(1).indexOf(">NNP'")>-1)){
+						bMeaningf = true;
+
+					}
+				}
+
+				report.add((String[])stringPhrases.toArray(new String[0]));
+				report.add((String[])nodePhrases.toArray(new String[0]));
+				if (bMeaningf){
+					report.add(new String[]{"===", "MEANINGFUL tweet"});
+					if (!meaningFUL.contains(cleanedTweet))
+						meaningFUL.add(cleanedTweet);
+				} else {
+					if (!meaningLESS.contains(cleanedTweet))
+						meaningLESS.add(cleanedTweet);
+				}
+
+				int count = 0;
+				ful_less.clear();
+				for(String less: meaningLESS ){
+					String fl = "";
+					if (count<meaningFUL.size())
+						fl = meaningFUL.get(count);
+					ful_less.add(new String[]{less, fl});
+					count++;
+				}
+
+				report.add(new String[]{"-----------------------------------------------------"});
+					ProfileReaderWriter.writeReport(report, "phrasesExtractedFromTweets3_"+nRun+".csv");
+					ProfileReaderWriter.writeReport(ful_less, "ful_lessTweets3_"+nRun+".csv");
+				
+			}
+		}
+	}
+
+
+	public static void main(String[] args){
+		TwitterEngineRunner runner = new TwitterEngineRunner();
+		int nRun = Integer.parseInt(args[0]);
+		runner.processTweetFile(nRun);
+
+	}
+}
+
+/*
+	public void processDirectory(String path){
+		List<String[]> report = new ArrayList<String[]>();
+		report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" });
+
+		List<String> allNamedEntities = new ArrayList<String>();
+
+		addFiles(new File(path));
+		for(File f: queue){
+			List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst();
+			List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond();
+			report.add(new String[]{ f.getName(), entities.toString(),  opinions.toString()});	
+			ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv");
+
+			allNamedEntities.addAll(entities);
+
+			allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities ));
+
+
+		}
+		ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv");
+	} 
+} */
+
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
new file mode 100644
index 0000000..0e5053d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class TwitterFilter {
+	protected static Matcher matcher;
+	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
+	protected ArrayList<File> queue = new ArrayList<File>();
+	protected static PT2ThicketPhraseBuilder phraseBuilder;
+	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
+	String resourceDirSentimentList = null;
+	Set<String> sentimentVcb = new HashSet<String> ();
+
+	static {
+		synchronized (TwitterFilter.class) {
+			matcher = new Matcher();
+			phraseBuilder = new PT2ThicketPhraseBuilder();
+		}
+	}
+
+	public TwitterFilter(){
+		try {
+			resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		List<String[]> sentimentList=null;
+		sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
+		for(String[] line: sentimentList){
+			sentimentVcb.add(line[0]);
+		}
+	}
+
+	private boolean isSentimentWord(String word){
+		if (sentimentVcb.contains(word))
+			return true;
+		else
+			return false;		
+	}
+
+	public EntityExtractionResult extractEntities(String para){
+		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
+		List<String> extractedNERsWords = new ArrayList<String>();
+		List<List<ParseTreeNode>> extractedSentimentPhrases = 
+				new ArrayList<List<ParseTreeNode>>();
+		EntityExtractionResult result = new EntityExtractionResult();
+
+		ParseThicket pt = null;
+
+		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
+		pt = matcher.buildParseThicketFromTextWithRST(para);
+		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+		for(List<ParseTreeNode> sentence: nodeList){
+			System.out.println("   Processing sentence: "+ sentence);
+			boolean bInsideNER = false; 
+			String currentPhrase = "";
+			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
+			for(ParseTreeNode word: sentence){
+				if (isNERforPhraseExtraction(word)){
+					System.out.println("++Found word ="+word + " | NER="+ word.getNe());
+					if (bInsideNER){
+						currentPhrase += " "+word.getWord();
+						currentPhraseNode.add(word);
+					} else {
+						bInsideNER=true;
+						currentPhrase = word.getWord();
+						currentPhraseNode.add(word);
+					}
+				} else {
+					if (bInsideNER){
+						if (currentPhrase.indexOf(' ')>-1) // at least two tokens
+							extractedNERsWords.add(currentPhrase);
+							extractedNERs.add(currentPhraseNode);
+						currentPhrase = "";
+						bInsideNER=false;
+					} else {
+						// do nothing, continue scan
+					}
+				}
+			}
+			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
+				extractedNERs.add(currentPhraseNode);
+				extractedNERsWords.add(currentPhrase);
+			}
+
+			Set<String> foundSentimentWords = new HashSet<String>();
+			// now we extract phrases
+			List<List<ParseTreeNode>> phrases = pt.getPhrases();
+			for(List<ParseTreeNode> phrase: phrases){
+				// find a noun phrase under sentiment
+				try {
+					for(int i = phrase.size()-1; i>-1; i--){
+						ParseTreeNode word = phrase.get(i);
+						if ((isSentimentWord(word.getWord()) ||
+								sVocab.isSentimentWord(word.getWord()) && !foundSentimentWords.contains(word.getWord()) )){
+							foundSentimentWords.add(word.getWord());
+							System.out.println("Found opinionated phrase "+phrase.toString());
+							extractedSentimentPhrases.add(phrase);			
+							break;
+						}
+					}
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+
+		} 
+		result.setExtractedNER(extractedNERs);
+		result.setExtractedNERWords(extractedNERsWords);
+		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+		return result;
+	}
+
+
+
+	private boolean isNERforPhraseExtraction(ParseTreeNode word){
+		if ((word.getNe().equals("ORGANIZATION") ||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
+				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")  ))
+			return true;
+
+		return false;
+
+	}
+
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
new file mode 100644
index 0000000..a138de6
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+
+public class YouTubeMiner {
+	private PageFetcher fetcher = new PageFetcher();
+	public YouTubeMinerResult getData(String url){
+		YouTubeMinerResult result = new YouTubeMinerResult();
+		String content = fetcher.fetchOrigHTML(url);
+		try {
+			FileUtils.writeStringToFile(new File(url.replace(':', '_').replace('/', '_')), content);
+		} catch (IOException e1) {
+			// TODO Auto-generated catch block
+			e1.printStackTrace();
+		}
+		if (url.indexOf("channel")>-1){
+			try { //subscriber-count" title="30" 
+				String subscribersStr = StringUtils.substringBetween(content,"subscriber-count", "tabindex");
+				String dirtyNumber = StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
+				String cleanNumber = dirtyNumber.replaceAll("[^\\x00-\\x7F]", "");
+				if (cleanNumber!=null){
+					int subscribers = Integer.parseInt(cleanNumber );
+					result.subscribers = subscribers;
+				} else {
+					System.err.println("Not found data for 'subscriber-count', 'tabindex'");
+				}
+			} catch (NumberFormatException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		} else {
+			try {
+
+				String subscribersStr = StringUtils.substringBetween(content,"subscriber-count", "tabindex");
+				String dirtyNumber = StringUtils.substringBetween(subscribersStr, "title=\"", "\"").replace(" ", "");
+				if (dirtyNumber!=null){
+					int subscribers = Integer.parseInt(dirtyNumber );
+					result.subscribers = subscribers;
+				} else {
+					System.err.println("Not found data for 'subscriber-count', 'tabindex'");
+				}
+
+				String viewsStrDirty = StringUtils.substringBetween(content,
+						//"div class=\"watch-view-count\">"," views</div>");
+						//view-count">12 \u043f\u0440\u043e\u0441\u043c\u043e\u0442\u0440\u043e\u0432</div>
+						"view-count","<div>");
+				String viewsStr = StringUtils.substringBetween(viewsStrDirty,">", " ");
+				if (viewsStr!=null){
+					int views = Integer.parseInt(viewsStr );
+					result.views = views;
+				} else {
+					System.err.println("Not found data for 'view-count','<div>'");
+				}
+			} catch (NumberFormatException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		}
+
+		return result;
+	}
+
+
+
+
+	public static void main(String[] args){
+		YouTubeMiner  miner = new YouTubeMiner();
+		System.out.println(miner.getData("https://www.youtube.com/channel/UC-maQbG5eUS5c1wmaTnLwTA"));
+		System.out.println(miner.getData("https://www.youtube.com/watch?v=U6X4VT9dVr8"));
+		System.out.println(miner.getData("https://www.youtube.com/watch?v=kH-AQnta714"));
+		System.out.println(miner.getData("https://www.youtube.com/watch?v=pWb50Kn1ShQ"));
+	}
+}
+
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
new file mode 100644
index 0000000..86c8e9d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
public class YouTubeMinerResult {
	public int likes;
	public int subscribers;
	public int views;

	/**
	 * A "promising young individual" has a modest but non-trivial audience:
	 * either subscribers or views strictly between 10 and 20000.
	 */
	boolean isPromisingYoungIndividual() {
		boolean modestSubscribers = subscribers > 10 && subscribers < 20000;
		boolean modestViews = views > 10 && views < 20000;
		return modestSubscribers || modestViews;
	}

	@Override
	public String toString() {
		return "views :" + views + "| subscribers = " + subscribers;
	}
}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
new file mode 100755
index 0000000..3c88b41
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.fca.ConceptLattice;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
/**
 * Pattern structure over linguistic objects: concepts whose intents are
 * grouped phrase-chunk lists and whose extents are sets of object indices.
 * Implements the incremental AddIntent lattice-construction step plus
 * export of the lattice to a binary context and log-stability scoring.
 */
public class LinguisticPatternStructure extends PhrasePatternStructure {

	public LinguisticPatternStructure(int objectCounts, int attributeCounts) {
		super(objectCounts, attributeCounts);
		
		// NOTE(review): dead local — assigned null and never used; candidate for removal.
		ConceptLattice cl = null;
	}
	
	/**
	 * Recursively adds {@code extent} to every ancestor of {@code curNode}.
	 * NOTE(review): ancestors reachable via several paths are visited (and
	 * extended) once per path; harmless if addExtents is idempotent — verify.
	 */
	public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int curNode) {
		// Walk all parent links upward from curNode.
		if (conceptList.get(curNode).parents.size()>0){
			for (int parent : conceptList.get(curNode).parents){
				conceptList.get(parent).addExtents(extent);
				AddExtentToAncestors(extent, parent);
			}
		}	
	}
	
	/**
	 * AddIntent step: inserts {@code intent} (with {@code extent}) below the
	 * maximal existing concept subsuming it, creating intersection concepts
	 * recursively and rewiring parent/child links.
	 *
	 * @param intent    grouped-chunk representation to insert
	 * @param extent    object indices carrying this intent
	 * @param generator position of the concept to start the search from
	 * @return position of the (possibly pre-existing) concept for this intent
	 */
	public int AddIntent(List<List<ParseTreeChunk>> intent, LinkedHashSet<Integer>extent,int generator) {
		// NOTE(review): System.out tracing throughout this method — consider a logger.
		System.out.println("debug");
		System.out.println("called for " + intent);
		//printLattice();
		int generator_tmp = GetMaximalConcept(intent, generator);
		generator = generator_tmp;
		if (conceptList.get(generator).intent.equals(intent)) {
			// Intent already present: only propagate the extent upward.
			System.out.println("at generator:" + conceptList.get(generator).intent);
			System.out.println("to add:" + intent);
			System.out.println("already generated");
			AddExtentToAncestors(extent, generator);	
			return generator;
		}
		Set<Integer> generatorParents = conceptList.get(generator).parents;
		Set<Integer> newParents = new HashSet<Integer>();
		for (int candidate : generatorParents) {
			if (!intent.containsAll(conceptList.get(candidate).intent)) {
				// Candidate is not a superconcept: recurse on the intersection intent.
				List<List<ParseTreeChunk>> intersection = md
				.matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent);
				LinkedHashSet<Integer> new_extent = new LinkedHashSet<Integer>();
				new_extent.addAll(conceptList.get(candidate).extent);
				new_extent.addAll(extent);
				if (intent.size()!=intersection.size()){
					System.out.println("recursive call (inclusion)");
					System.out.println(intent + "----" + intersection);
					candidate = AddIntent(intersection,new_extent, candidate);
				}
			}
			
			// Keep only the minimal (most specific) parents among the candidates.
			boolean addParents = true;
			System.out.println("now iterating over parents");
			Iterator<Integer> iterator = newParents.iterator();
			while (iterator.hasNext()) {
				Integer parent = iterator.next();
				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
					// An already-kept parent subsumes this candidate: skip it.
					addParents = false;
					break;
				}
				else {
					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
						// Candidate subsumes a kept parent: drop the less specific one.
						iterator.remove();
					}
				}
			}
			if (addParents) {
				newParents.add(candidate);
			}
		}
		System.out.println("size of lattice: " + conceptList.size());
		// Create the new concept and link it below the generator.
		PhraseConcept newConcept = new PhraseConcept();
		newConcept.setIntent(intent);

		LinkedHashSet<Integer> new_extent = new LinkedHashSet<Integer>();
		new_extent.addAll(conceptList.get(generator).extent);
		new_extent.addAll(extent);
		newConcept.addExtents(new_extent);
		
		newConcept.setPosition(conceptList.size());
		conceptList.add(newConcept);
		conceptList.get(generator).parents.add(newConcept.position);
		conceptList.get(newConcept.position).childs.add(generator);
		// Rewire: new parents adopt the new concept; drop now-transitive edges.
		for (int newParent: newParents) {
			if (conceptList.get(generator).parents.contains(newParent)) {
				conceptList.get(generator).parents.remove(newParent);
				conceptList.get(newParent).childs.remove(generator);
			}
			conceptList.get(newConcept.position).parents.add(newParent);
			conceptList.get(newParent).addExtents(new_extent);
			AddExtentToAncestors(new_extent, newParent);
			conceptList.get(newParent).childs.add(newConcept.position);
		}
		return newConcept.position;
	}
	
	/** Prints every concept in the lattice in extended form. */
	public void printLatticeExtended() {
		for (int i = 0; i < conceptList.size(); ++i) {
			printConceptByPositionExtended(i);
		}
	}
	
	/** Prints one concept (by list position) in extended form. */
	public void printConceptByPositionExtended(int index) {
		System.out.println("Concept at position " + index);
		conceptList.get(index).printConceptExtended();
	}
	
	
	/**
	 * Exports the lattice as a binary object-by-concept context matrix:
	 * cell [i][j] = 1 iff object i belongs to the extent of (kept) concept j.
	 *
	 * @param extentCardinality number of objects (rows of the matrix)
	 */
	public int [][] toContext(int extentCardinality){
		
		int newAttrCount = conceptList.size();
		ArrayList<PhraseConcept> cList = new ArrayList<PhraseConcept>();
		cList.addAll(conceptList);	
		boolean run = true;
		int k=0;
		// Drop the bottom concept (full intent); if it is empty-extent, also
		// drop its parents.
		while (run && k<conceptList.size()){
			if (conceptList.get(k).intent.size() == attributeCount){
				if (conceptList.get(k).extent.size() == 0)
					for (Integer i:conceptList.get(k).parents)
						// NOTE(review): 'i' is an Integer, so this resolves to
						// remove(Object) on a list of PhraseConcept — it removes
						// nothing. remove((int) i) by index was likely intended
						// (which itself would need care about shifting indices).
						cList.remove(i);
				cList.remove(k);
				run=false;
			}
			else
				k+=1;	
		}
		
		run = true;
		k=0;
		// NOTE(review): the loop below is unbraced, so "run = false" executes
		// unconditionally on the first iteration and the loop runs at most
		// once; braces around the if-body were probably intended — confirm.
		while (run && k<=newAttrCount){
			if (cList.get(k).extent.size()==0)
				k++;
				run = false;
		}
		newAttrCount = cList.size();
		Set<Integer> nodeExtend;
		int [][] binaryContext = new int[extentCardinality][newAttrCount];
		for (int j = 0; j<newAttrCount; j++){
			nodeExtend = cList.get(j).extent;
			for (Integer i: nodeExtend){
				binaryContext[i][j]=1;
			}
		}
		return binaryContext;
	}
	
	
	
	/**
	 * Computes log-stability bounds for every concept from extent-size deltas
	 * to its children and stores them on the concept
	 * (intLogStabilityBottom / intLogStabilityUp).
	 * NOTE(review): for a concept with no children, sum stays 0 so
	 * -log2(0) yields +Infinity, and the upper bound stays
	 * Integer.MAX_VALUE — confirm downstream code tolerates these sentinels.
	 */
	public void logStability(){
		int min_delta = -1, delta = -1;
		float sum = 0;
		for (int i = 0; i < conceptList.size(); ++i) {
			min_delta = Integer.MAX_VALUE;
			sum = 0;
			PhraseConcept pc = conceptList.get(i);
			Set<Integer> childs = pc.childs;
			for (Integer j: childs) {
				delta = pc.extent.size() - conceptList.get(j).extent.size();
				if (delta<min_delta)
					min_delta = delta;
				sum += Math.pow(2, -delta);
			}
			pc.intLogStabilityBottom=-(Math.log(sum)/Math.log(2.0));
			pc.intLogStabilityUp = min_delta;
		}
	}

}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
new file mode 100755
index 0000000..c3f5688
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import opennlp.tools.fca.ConceptLattice;
+import opennlp.tools.fca.FormalConcept;
+
+public class PatternStructureWriter {
+	
+	public void WriteStatsToTxt(String filename, PhrasePatternStructure ps){
+			
+		String formatStr = "[%5.2f; %5.2f]  %s   %s%n";
+		Writer writer = null;
+
+		try {
+		    writer = new BufferedWriter(new OutputStreamWriter(
+		          new FileOutputStream(filename), "utf-8"));
+		    writer.write("PatternStructure size: " + ps.conceptList.size()+ " with " + ps.objectCount + "objects\n");
+		    
+		    for (PhraseConcept c : ps.conceptList){
+		    	writer.write(String.format(formatStr,c.intLogStabilityBottom, c.intLogStabilityUp, c.extent, c.intent));
+		    }
+		    writer.close();
+		    
+		} catch (IOException ex) {
+			System.err.println(ex.getMessage());
+		} finally {
+		   try {writer.close();} catch (Exception ex) {}
+		}
+	}
+	
+
+		public static void main(String[] args) {
+			
+		}
+}
+			
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
new file mode 100644
index 0000000..e33e089
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.request_response_recognizer;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.external_rst.MatcherExternalRST;
+import opennlp.tools.parse_thicket.external_rst.ParseThicketWithDiscourseTree;
+import opennlp.tools.parse_thicket.kernel_interface.TreeKernelBasedClassifierMultiplePara;
+
+/*
+ * This class performs TK learning based on parse thicket which includes RST relations only 
+ * based on Surdeanu at al RST parser. It does sentence parsing and NLP pipeline of 
+ * Surdeanu's wrapper of Stanford NLP
+ */
+public class TreeKernelBasedRecognizerOfRequest_Response extends TreeKernelBasedClassifierMultiplePara{
+
+	private MatcherExternalRST matcherRST = new MatcherExternalRST();
+
+	protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
+		//TODO
+		this.setShortRun();	
+		List<String> extendedTreesDumpTotal = new ArrayList<String>();
+		try {
+
+			for(String text: texts){
+				// get the parses from original documents, and form the training dataset
+				try {
+					System.out.print("About to build pt with external rst from "+text + "\n...");
+					ParseThicket pt = matcherRST.buildParseThicketFromTextWithRST(text);
+					if (pt == null)
+						continue;
+					System.out.print("About to build extended forest with external rst...");
+					List<String> extendedTreesDump =  // use direct option (true
+							buildReptresentationForDiscourseTreeAndExtensions((ParseThicketWithDiscourseTree)pt, true);
+					for(String line: extendedTreesDump)
+						extendedTreesDumpTotal.add(flag + " |BT| "+line + " |ET| ");
+					System.out.println("DONE");
+				} catch (Exception e) {
+					e.printStackTrace();
+				}
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return extendedTreesDumpTotal;
+	}
+
+	private List<String> buildReptresentationForDiscourseTreeAndExtensions(ParseThicketWithDiscourseTree pt, boolean bDirectDT){
+		List<String> extendedTreesDump = new ArrayList<String>();
+		if (!bDirectDT)
+			// option 1: use RST relation for extended trees 
+			extendedTreesDump = treeExtender.buildForestForRSTArcs(pt);
+		else {
+			// option 2: use DT directly
+			extendedTreesDump.add(pt.getDtDump());
+		    extendedTreesDump.add(pt.getDtDumpWithPOS());
+		    extendedTreesDump.add(pt.getDtDumpWithEmbeddedTrees());
+		    extendedTreesDump.add(pt.getDtDumpWithVerbNet());
+		}		
+		return extendedTreesDump;
+	}
+	
+	public static void main(String[] args){
+		VerbNetProcessor p = VerbNetProcessor.
+				getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources"); 
+
+		TreeKernelBasedRecognizerOfRequest_Response proc = new TreeKernelBasedRecognizerOfRequest_Response();
+		proc.setKernelPath("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
+		proc.trainClassifier(
+				YahooAnswersTrainingSetCreator.origFilesDir,
+				YahooAnswersTrainingSetCreator.origFilesDir.replace("/text", "/neg_text")
+				);
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
new file mode 100644
index 0000000..c060c95
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
@@ -0,0 +1,118 @@
+package opennlp.tools.parse_thicket.request_response_recognizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.similarity.apps.BingQueryRunner;
+
+import org.apache.commons.io.FileUtils;
+
+public class YahooAnswersTrainingSetCreator {
+	protected List<File> queuePos = new ArrayList<File>(), queueNeg = new ArrayList<File>();
+	public static String origFilesDir = "/Users/bgalitsky/Downloads/NewCategoryIdentification/text";
+	//private BingQueryRunner searcher = new BingQueryRunner();
+	protected void addFilesPos(File file) {
+
+		if (!file.exists()) {
+			System.out.println(file + " does not exist.");
+		}
+		if (file.isDirectory()) {
+			for (File f : file.listFiles()) {
+				addFilesPos(f);
+				System.out.println(f.getName());
+			}
+		} else {
+			queuePos.add(file);
+		}
+	}
+	
+	protected void addFilesNeg(File file) {
+
+		if (!file.exists()) {
+			System.out.println(file + " does not exist.");
+		}
+		if (file.isDirectory()) {
+			for (File f : file.listFiles()) {
+				addFilesNeg(f);
+				System.out.println(f.getName());
+			}
+		} else {
+			queueNeg.add(file);
+		}
+	}
+	
+	public void formNegTrainingSet(String posPath , String negPath){
+		 if (!new File(negPath).exists())
+			 new File(negPath).mkdir();
+		 
+		addFilesPos(new File(posPath));
+		for(int i=0; i< queuePos.size()-1; i+=2){ //take two files at a time
+			File f1 = queuePos.get(i), f2 = queuePos.get(i+1);
+			String content1 = null, content2 = null;
+            try {
+	            content1 = FileUtils.readFileToString(f1);
+	            content2 = FileUtils.readFileToString(f2);
+            } catch (IOException e) {
+	            e.printStackTrace();
+            }
+			String[] portions1 = content1.split("\n\n");
+			String[] portions2 = content2.split("\n\n");
+
+			portions1 = splitIntoRR(portions1, content1);
+			portions2 = splitIntoRR(portions2, content2);
+			if (portions1==null || portions2==null)
+				continue;
+			// do cross-breeding
+			try {
+	            FileUtils.writeStringToFile(new File(negPath+"/" + f1.getName()+".txt"),
+	            		portions1[0] + "\n\n" + portions2[1] );
+	            FileUtils.writeStringToFile(new File(negPath+"/" + f2.getName()+".txt"),
+	            		portions2[0] + "\n\n" + portions1[1] );
+            } catch (IOException e) {
+	            e.printStackTrace();
+            }
+		}
+		
+		
+	}
+	private String[] splitIntoRR(String[] portions, String content) {
+		if (portions.length<2 ){
+			portions = content.replace("?","#_#").split("#_#");
+		}
+		if (portions.length<2 ){
+			portions = content.split("\n");
+		}
+		if (portions.length<2)
+			return null;
+		if (portions.length>2){
+			String q= "", a = "";
+			boolean bQ = true;
+			for(int p=0; p<portions.length; p++){
+				if ( bQ )
+					q+=portions[p]+" \n";
+				else
+					a +=portions[p]+" \n";
+				
+				if (portions[p].endsWith("?")){
+					bQ=false;
+				}
+
+			}
+			if (!bQ) {
+				portions = new String[2];
+				portions[0] = q;
+				portions[1] = a;
+			} else
+				return null;
+		}
+		
+		return portions;
+    }
+	
+	public static void main(String[] args){
+		String dir = YahooAnswersTrainingSetCreator.origFilesDir;
+		new YahooAnswersTrainingSetCreator().formNegTrainingSet(dir, dir.replace("/text", "/neg_text"));
+	}
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
new file mode 100644
index 0000000..025403c
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import net.billylieurance.azuresearch.AzureSearchRelatedSearchQuery;
+import net.billylieurance.azuresearch.AzureSearchRelatedSearchResult;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchSpellingSuggestionQuery;
+import net.billylieurance.azuresearch.AzureSearchSpellingSuggestionResult;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+public class BingRelatedSpellingQueryRunner extends BingQueryRunner{
+	private AzureSearchRelatedSearchQuery aq = new AzureSearchRelatedSearchQuery ();
+	private AzureSearchSpellingSuggestionQuery  ssq = new AzureSearchSpellingSuggestionQuery ();
+	
+	
+	public List<HitBase> runSearch(String query, int nRes) {
+		aq.setAppid(BING_KEY);
+		aq.setQuery(query);		
+		aq.setPerPage(nRes);
+		aq.doQuery();
+		
+		List<HitBase> results = new ArrayList<HitBase> ();
+		AzureSearchResultSet<AzureSearchRelatedSearchResult> ars = aq.getQueryResult();
+		
+		for (AzureSearchRelatedSearchResult anr : ars){
+		    HitBase h = new HitBase();
+		    h.setTitle(anr.getTitle());
+		    h.setUrl(anr.getBingUrl());
+		    results.add(h);
+		}
+		return results;
+	}
+	
+	public List<HitBase> runSSSearch(String query, int nRes) {
+		ssq.setAppid(BING_KEY);
+		ssq.setQuery(query);		
+		ssq.setPerPage(nRes);
+		ssq.doQuery();
+		
+		List<HitBase> results = new ArrayList<HitBase> ();
+		AzureSearchResultSet<AzureSearchSpellingSuggestionResult> ars = ssq.getQueryResult();
+		
+		for ( AzureSearchSpellingSuggestionResult anr : ars){
+		    HitBase h = new HitBase();
+		    h.setTitle(anr.getTitle());
+		    h.setAbstractText(anr.getValue());
+		   
+		    results.add(h);
+		}
+		return results;
+	}
+	
+	public static void main(String[] args) {
+		BingRelatedSpellingQueryRunner self = new BingRelatedSpellingQueryRunner();
+	    try {
+	    	self.setLang("es-MX");
+	    	self.setKey(
+	    			"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=");
+	      List<HitBase> resp = self
+	          .runSearch("clear Sess", 10);
+	      System.out.print(resp.get(0));
+	      
+	      resp = self
+		          .runSSSearch("clear Sess", 10);
+		      System.out.print(resp.get(0));
+	    } catch (Exception e) {
+	      // TODO Auto-generated catch block
+	      e.printStackTrace();
+	    }
+	}
+}