Posted to commits@opennlp.apache.org by bg...@apache.org on 2016/11/22 13:05:21 UTC

[07/11] opennlp-sandbox git commit: removed stanford nlp refs

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
deleted file mode 100644
index 0f53ec5..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import org.apache.commons.lang3.StringUtils;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.similarity.apps.BingQueryRunner;
-import opennlp.tools.similarity.apps.HitBase;
-import opennlp.tools.similarity.apps.utils.ValueSortMap;
-import opennlp.tools.stemmer.PStemmer;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-public class LinguisticPhraseManager {
-	private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>();
-	
-	// this static object is initialized here mainly to establish the path to the resources directory;
-	// getAbsolutePath() of "." ends in "/.", so stripping the dots leaves the working directory with
-	// a trailing slash (this assumes the working-directory path itself contains no dots)
-	private static StopList stop = StopList.getInstance(new File(".").getAbsolutePath().replace(".","")+ "src/test/resources/");
-
-	// this list will be overwritten by the external synonyms.csv
-	private static String[][] synonymPairs = new String[][]{};
-	private PStemmer stemmer = new PStemmer();
-
-	private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>();
-	private List<String> standardizedTopics = new ArrayList<String>();
-	// maps each linguistic phrase to the list of linguistic phrases that share its head noun
-	private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
-
-	// maps each standardized string phrase to the list of linguistic phrases that share its head noun
-	private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>();
-
-	private BingQueryRunner runner = new BingQueryRunner();
-	private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3; // alternative settings tried: 2, 5
-	private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
-	// parseLingPhraseIntoParseTreeChunk below takes the logged form of a chain of parse-tree nodes and builds ParseTreeChunk instances;
-	// the phrases should only be VP or NP, otherwise an exception is expected
-	
-	
-
-	private String resourceDir;
-	public LinguisticPhraseManager(){
-		try {
-			resourceDir  = new File( "." ).getCanonicalPath()+"/src/main/resources/";
-			List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir+"/synonyms.csv");
-			synonymPairs = new String[vocabs.size()][2];
-			int count = 0;
-			for(String[] line: vocabs){
-				synonymPairs[count] = line;
-				count++;
-			}
-			
-		} catch (Exception e) {
-			e.printStackTrace();
-		}		
-	}
-
-	private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr){
-		ParseTreeChunk ch = new ParseTreeChunk();
-		List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
-
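-		// expected serialized phrase format, e.g. [<4>NP'a':DT, <5>NP'foam':NN, <6>NP'panel':NN]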
-		String[] parts = phrStr.replace("]","").split(", <");
-
-		ch.setMainPOS( StringUtils.substringBetween(phrStr, ">", "'"));
-		try {
-			for(String part: parts){
-				// lowercase only after the null check: substringBetween() returns null when the markers are absent
-				String lemma = StringUtils.substringBetween(part, "P'", "':");
-				String pos = part.substring(part.indexOf(":")+1);
-
-				if (pos==null || lemma==null){
-					continue;
-				}
-				POSs.add(pos.trim());
-				lemmas.add(lemma.toLowerCase().trim());
-				ch.setPOSs(POSs); ch.setLemmas(lemmas);
-			}
-		} catch (Exception e) {
-			// we expect exceptions if extracted phrases are NEITHER NP nor VP
-			// empty chunk will be given which will not create a new topic
-			e.printStackTrace();
-		}
-
-		return ch;
-	}
-
-	// constructor taking an array of extraction files, optimized for performance:
-	// only topics occurring at least MIN_NUMBER_OF_PHRASES_TO_CONSIDER times are considered
-	public LinguisticPhraseManager(String[] loadPaths){
-		List<String[]> columns = new ArrayList<String[]>();
-		for(String file: loadPaths){
-			columns.addAll(ProfileReaderWriter.readProfiles( file));
-		}
-
-		for(String[] l: columns){
-			if (l.length<3 || l[1]==null || l[2]==null)
-				continue;
-			String word = l[1].toLowerCase().trim();
-			if (word.indexOf("=>")>-1)
-				continue;
-
-			word = isAcceptableStringPhrase(word);
-			if (word==null)
-				continue;
-
-			if (!freq.containsKey(word)) {
-				freq.put(word, 1);
-
-			} else {
-				freq.put(word, freq.get(word) + 1);
-				// once the topic's count reaches the threshold, create it exactly once
-				if (freq.get(word)==MIN_NUMBER_OF_PHRASES_TO_CONSIDER){
-					ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
-					ch = isAcceptableLingPhrase(ch);
-					if (ch==null)
-						continue;
-					lingPhrases.add(ch);
-				}
-			}		  
-		}
-		// we don't need the frequency data any more
-		freq.clear();
-	}
-
-	// constructor taking a single topic extraction file;
-	// not optimized for performance
-	public LinguisticPhraseManager(String loadPath){
-		List<String[]> columns = ProfileReaderWriter.readProfiles( loadPath);
-		for(String[] l: columns){
-			if (l.length<3 || l[1]==null || l[2]==null)
-				continue;
-			String word = l[1].toLowerCase().trim();
-			if (word.indexOf("=>")>-1)
-				continue;
-
-			word = isAcceptableStringPhrase(word);
-			if (word==null)
-				continue;
-
-			if (!freq.containsKey(word)) {
-
-				ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
-				ch = isAcceptableLingPhrase(ch);
-				if (ch==null)
-					continue;
-				freq.put(word, 1);
-				lingPhrases.add(ch);
-			} else {
-				freq.put(word, freq.get(word) + 1);
-			}		  
-
-
-		}
-		freq = ValueSortMap.sortMapByValue(freq, false);
-
-
-	}
-	// removes prepositions and articles in case this was not done at the phrase-forming stage
-	private String isAcceptableStringPhrase(String word) {
-		if (word.startsWith("to "))
-			return null;
-		if (word.startsWith("a "))
-			return word.substring(2);
-
-		if (word.endsWith(" !") || word.endsWith(" ."))
-			return word.substring(0, word.length()-2).trim();
-
-		return word;
-	}
-	// we only accept NP 
-	private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
-		if (!ch.getMainPOS().equals("NP"))
-			return null;
-
-
-		return ch;
-	}
-
-	// groups are sets of phrases sharing the same head noun;
-	// each phrase is mapped to its group, i.e. to the list of that group's members
-	public void doLingGrouping(){
-		for(int i=0; i< lingPhrases.size(); i++){
-			for(int j=i+1; j< lingPhrases.size(); j++){
-				ParseTreeChunk chI = lingPhrases.get(i);
-				ParseTreeChunk chJ = lingPhrases.get(j);
-				if (chI.getLemmas().get(chI.getLemmas().size()-1).equals(chJ.getLemmas().get(chJ.getLemmas().size()-1))
-						&& chI.getPOSs().get(chI.getLemmas().size()-1).startsWith("NN") ){
-					List<ParseTreeChunk> values = null;
-					if( chI.getLemmas().size()<chJ.getLemmas().size()){
-						// fetch the existing group for chJ so earlier members are not lost
-						values = entry_group.get(chJ);
-						if (values == null)
-							values = new ArrayList<ParseTreeChunk>();
-						values.add(chI);
-						entry_group.put(chJ, values);
-					} else {
-						values = entry_group.get(chI);
-						if (values == null)
-							values = new ArrayList<ParseTreeChunk>();
-						values.add(chJ);
-						entry_group.put(chI, values);
-					}
-				}
-			}
-		}
-
-
-	}
-
-	public List<String> formStandardizedTopic(){
-		Set<ParseTreeChunk> keys = entry_group.keySet();
-		for(ParseTreeChunk k: keys){
-			List<ParseTreeChunk> group = entry_group.get(k);
-			for(int i=0; i< group.size(); i++)
-				for(int j=i+1; j< group.size(); j++){
-					ParseTreeChunk chI = group.get(i);
-					ParseTreeChunk chJ = group.get(j);
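-					// the lemmas shared by both phrases become the candidate standardized topic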
-					List<String> lemmas = new ArrayList<String>(chI.getLemmas());
-					lemmas.retainAll(chJ.getLemmas());
-					if (lemmas.size()<2)
-						continue;
-					String buf = ""; List<String> candTopicLst = new ArrayList<String>();
-					for(String w: lemmas){
-						if (w.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER)
-							continue;
-						if (!StringUtils.isAlpha(w))
-							continue;
-						// find POS of w
-						boolean bAccept = false;
-						for(int iw=0; iw<chI.getLemmas().size(); iw++){
-							if (w.equals(chI.getLemmas().get(iw))){
-								if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ")
-										|| chI.getPOSs().get(iw).startsWith("VB"))
-									bAccept=true;
-							}
-						}
-						if (bAccept){
-							//buf+=w+" ";
-							String ws = substituteSynonym(w);
-							candTopicLst.add(ws);
-						}
-					}
-					// remove duplicates like 'new new house'
-					//candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst));
-					for(String w: candTopicLst){
-						buf+=w+" ";
-					}
-
-					buf = buf.trim();
-					if (buf.indexOf(' ')<0)
-						continue;
-
-					if (!standardizedTopics.contains(buf)){
-						standardizedTopics.add(buf);		
-						std_group.put(buf, lingPhrases);
-					}
-				}
-		}
-		cleanUpStandardizedTopics();
-
-		return standardizedTopics;
-	}
-
-	public void cleanUpStandardizedTopics(){
-		List<String> toDelete = new ArrayList<String>();
-		for(int i=0; i< standardizedTopics.size(); i++)
-			for(int j=i+1; j< standardizedTopics.size(); j++){
-				List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false);
-				List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false);
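-				// stem both topic strings so that morphological variants compare as equal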
-				for(int k=0; k< t1.size(); k++){
-					t1.set(k, stemmer.stem(t1.get(k)));
-				}
-				for(int k=0; k< t2.size(); k++){
-					t2.set(k, stemmer.stem(t2.get(k)));
-				} 
-				// check if the stemmed token lists are equal
-				if (t1.size()!=t2.size())
-					continue;
-				// if, after stemming, the tokens of one phrase fully cancel those of the other, the topics are duplicates
-				t1.removeAll(t2);
-				if (t1.isEmpty()){ 
-					if (standardizedTopics.get(i).length()> standardizedTopics.get(j).length()){
-						toDelete.add(standardizedTopics.get(i));
-						// TODO update std_group entry
-					System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '" + standardizedTopics.get(j) + "'");
-						List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
-						stJ.addAll(std_group.get(standardizedTopics.get(i)));
-						stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
-						std_group.put(standardizedTopics.get(j), stJ);
-					}
-					else {
-						toDelete.add(standardizedTopics.get(j));
-					System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '" + standardizedTopics.get(i) + "'");
-						List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
-						stI.addAll(std_group.get(standardizedTopics.get(j)));
-						stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
-						std_group.put(standardizedTopics.get(i), stI);
-					}
-
-				}
-			}
-		for(String d: toDelete){
-			//System.out.println("Removed '" + d + "'");
-			standardizedTopics.remove(d);
-		}
-	}
-
-	// substitute synonyms according to the internal vocabulary loaded from synonyms.csv
-	private String substituteSynonym(String w) {
-		try {
-			for(String[] pair: synonymPairs){
-				if (w.equals(pair[0]))
-					return pair[1];
-			}
-		} catch (Exception e) {
-			e.printStackTrace();
-		}
-		return w;
-	}
-
-	public void generateGroupingReport(String reportName){
-		List<String[]>  report = new ArrayList<String[]>();
-		Set<ParseTreeChunk> chs = entry_group.keySet();
-		report.add(new String[]{"string phrase" , "class", "linguistic phrase",  "list of ling phrases class representatives"});
-
-		for(ParseTreeChunk ch: chs){
-			String head = ch.getLemmas().get(ch.getLemmas().size()-1);
-			List<ParseTreeChunk> values = entry_group.get(ch);
-			if (values.size()<6)
-				head = "";
-			report.add(new String[]{ch.toWordOnlyString(), head,  ch.toString(),  values.toString()});
-		}
-		ProfileReaderWriter.writeReport(report, reportName);
-	}
-
-	// final merge of head-noun variants (e.g. floor/floors/flooring) with a phrase update
-	public void applyLastRoundOfAggregation(){
-		//merge <floor - floors - flooring>
-		/*
-			List<ParseTreeChunk> entries =  new ArrayList<ParseTreeChunk>(entry_group.keySet());
-			for(int i=0; i< entries.size(); i++){
-				for(int j=i+1; j< entries.size(); j++){
-					ParseTreeChunk chI = entries.get(i);
-					ParseTreeChunk chJ = entries.get(j);
-					String headI = getLastElement(chI.getLemmas());
-					String headJ = getLastElement(chJ.getLemmas());
-					if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER  || 
-							headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
-						continue;
-
-					if (headI.indexOf(headJ)>-1){
-						//leave headJ
-						List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ);
-						List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI);
-						if (valuesToAddTo==null || valuesBeingAdded == null)
-							continue;
-						valuesToAddTo.addAll(valuesBeingAdded);
-						entry_group.put(chJ, valuesToAddTo);
-						entry_group.remove(chI);
-						System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
-					} else if (headJ.indexOf(headI)>-1){
-						//leave headJ
-						List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI);
-						List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ);
-						if (valuesToAddTo==null || valuesBeingAdded == null)
-							continue;
-						valuesToAddTo.addAll(valuesBeingAdded);
-						entry_group.put(chI, valuesToAddTo);
-						entry_group.remove(chJ);
-						System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
-					}
-
-				}
-			}
-		 */
-		for(int i = 0; i<standardizedTopics.size(); i++ )
-			for(int j = i+1; j<standardizedTopics.size(); j++ ){
-				String headI = extractHeadNounFromPhrase(standardizedTopics.get(i));
-				String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j));
-				// if the same word do nothing
-				if (headI.equals(headJ))
-					continue;
-
-				//only if one is sub-word of another
-				if (headI.indexOf(headJ)>-1){
-
-					if (!properSubWordForm(headI, headJ))
-						continue;
-					//entry 'I' will be updated
-					String newKey = standardizedTopics.get(i).replace(headI, headJ);
-
-					List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
-					List<ParseTreeChunk> stInew = std_group.get(newKey);
-					//if (stInew!=null && !stInew.isEmpty())
-					//	stI.addAll(stInew);
-					if(stI==null)
-						continue;
-					std_group.put(newKey, stI);
-					std_group.remove(standardizedTopics.get(i));
-					System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created  '"+ newKey +"'");
-					standardizedTopics.set(i, newKey);
-
-				} else if (headJ.indexOf(headI)>-1){
-					if (!properSubWordForm(headJ, headI))
-						continue;
-					//entry 'J' will be updated
-					String newKey = standardizedTopics.get(j).replace(headJ, headI);
-
-					List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
-					List<ParseTreeChunk> stJnew = std_group.get(newKey);
-					//if (stJnew!=null && !stJnew.isEmpty())
-					//	stJ.addAll(stJnew);
-					if(stJ==null)
-						continue;
-					std_group.put(newKey, stJ);
-					std_group.remove(standardizedTopics.get(j));
-					System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created  '"+ newKey +"'");
-					standardizedTopics.set(j, newKey);
-				}
-			}
-
-
-
-	}
-
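-	// accepts a head-noun reduction when the leftover suffix is a common plural/derivational
-	// ending or one of a few domain-specific words (room, rooms, counter, back)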
-	private boolean properSubWordForm(String headI, String headJ) {
-		String suffix = headI.replace(headJ, "");
-		if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er") 
-				|| suffix.equals("rooms") ||
-				suffix.equals("") || suffix.equals("counter") ||
-				suffix.equals("room") || suffix.equals("back"))
-			return true;
-
-		//System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'");
-		return false;
-	}
-
-	//generates report 
-	public void generateStdTopicReport(String reportName){
-		List<String[]>  report = new ArrayList<String[]>();
-		report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" });
-
-		for(String t: standardizedTopics){
-
-			String bufCover = "";
-			int count = 0;
-			List<ParseTreeChunk> ptcList = std_group.get(t);
-			if (ptcList == null)
-				continue;
-			for(ParseTreeChunk ch: ptcList){
-				List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false);
-				List<String> tList = TextProcessor.fastTokenize(t, false);
-				List<String> tListChk = new ArrayList<String>(tList);
-
-				tListChk.removeAll(candidate);
-				// skip instances that do not fully token-cover the topic, as well as exact duplicates of the topic
-				if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){
-					continue;
-				}
-
-				boolean bCovered = true;
-				
-				for(String ts: tList){
-					boolean bCandWordsIsCovered = false;
-					for(String s: candidate){
-						if ((s.indexOf(ts)>-1) )//  && properSubWordForm(s, ts))
-							bCandWordsIsCovered = true;
-					}
-					if (!bCandWordsIsCovered){
-						bCovered = false;
-						break;
-					}
-				}
-				if (!bCovered)
-					continue;
-				bufCover+=ch.toWordOnlyString()+ " # ";
-				count++;
-				if (count > 40)
-					break;
-
-			}
-			if (bufCover.endsWith(" # "))
-				bufCover = bufCover.substring(0, bufCover.length()-3).trim();
-
-			String buf = "";
-			count = 0;
-			// only up to 40 instances of phrases per 1-st level topic
-			for(ParseTreeChunk ch: ptcList){
-				buf+=ch.toWordOnlyString()+ "|";
-				count++;
-				if (count > 40)
-					break;
-			}
-			
-			//TODO uncomment
-			//t = spell.getSpellCheckResult(t);
-			report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString()
-			});
-		}
-		
-		
-		ProfileReaderWriter.writeReport(report, reportName);
-	}
-	// get the last word of a phrase (assumed to be the head noun)
-	private String extractHeadNounFromPhrase(String topic){
-		String[] tops = topic.split(" ");
-		int len = tops.length;
-		if (len>1){
-			return tops[len-1];
-		}
-		else return topic;
-	}
-
-	// get the last element of a list
-	private String getLastElement(List<String> arrayList ){
-		if (arrayList != null && !arrayList.isEmpty()) {
-			return arrayList.get(arrayList.size()-1);
-		}
-		return null;
-	}
-	/*
-	 * Uses the Bing API to check whether an extracted phrase can be found on the web and is therefore a meaningful phrase
-	 */
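-	// note: the multi-file constructor clears freq when it finishes, so this check
-	// only yields results when the single-file constructor was used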
-	public List<String> verifyTopic(){
-		Set<String> phrases = freq.keySet();
-		List<String> approvedPhrases = new ArrayList<String>();
-		for(String p: phrases){
-			List<HitBase> hits = runner.runSearch("\""+p+"\"");
-			for(HitBase h: hits){
-				String lookup = h.getTitle() + " " + h.getAbstractText();
-				if (lookup.indexOf(p)>-1){
-					approvedPhrases.add(p);
-					break;
-				}
-			}
-		}
-		return approvedPhrases;
-	}
-
-	public Set<String> getPhraseLookup(){
-		return freq.keySet();
-	}
-
-	// using phrase frequency to filter phrases
-	public boolean isAcceptablePhrase(String phrase){
-		Integer count = freq.get(phrase.toLowerCase().trim());
-		if (count==null)
-			return false;
-
-		return count > 0 && count < 10000;
-	}
-
-	public static void main(String[] args){
-		LinguisticPhraseManager man = new  LinguisticPhraseManager(
-				"/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv");
-		man.doLingGrouping();
-		man.generateGroupingReport("topics_groups7_mergedHeads.csv");
-		List<String> stdTopics = man.formStandardizedTopic();
-		man.applyLastRoundOfAggregation();
-		man.generateStdTopicReport("std_topics7_mergedHeads.csv");
-		System.out.println(stdTopics);
-
-	}
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
deleted file mode 100644
index b766c7c..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class NamedEntityExtractor {
-	protected static Matcher matcher;
-	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
-	protected ArrayList<File> queue = new ArrayList<File>();
-	protected static PT2ThicketPhraseBuilder phraseBuilder;
-	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
-	String resourceDirSentimentList = null;
-	Set<String> sentimentVcb = new HashSet<String> ();
-
-	static {
-		synchronized (NamedEntityExtractor.class) {
-			matcher = new Matcher();
-			phraseBuilder = new PT2ThicketPhraseBuilder();
-		}
-	}
-
-	public NamedEntityExtractor(){
-		try {
-			resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
-		} catch (IOException e) {
-			e.printStackTrace();
-		}
-		List<String[]> sentimentList=null;
-		sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
-		for(String[] line: sentimentList){
-			sentimentVcb.add(line[0]);
-		}
-	}
-
-	protected boolean isSentimentWord(String word){
-		return sentimentVcb.contains(word);
-	}
-
-	public EntityExtractionResult extractEntities(String para){
-		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
-		List<String> extractedNERsWords = new ArrayList<String>();
-		List<List<ParseTreeNode>> extractedSentimentPhrases = 
-				new ArrayList<List<ParseTreeNode>>();
-		EntityExtractionResult result = new EntityExtractionResult();
-
-		ParseThicket pt = null;
-
-		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
-		pt = matcher.buildParseThicketFromTextWithRST(para);
-		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-		for(List<ParseTreeNode> sentence: nodeList){
-			//System.out.println("   Processing sentence: "+ sentence);
-			boolean bInsideNER = false; 
-			String currentPhrase = "";
-			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
-			for(ParseTreeNode word: sentence){
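-				// scan left to right, accumulating consecutive NER-tagged words into the current phrase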
-				if (isNERforPhraseExtraction(word)){
-					//System.out.println("++Found word ="+word + " | NER="+ word.getNe());
-					if (bInsideNER){
-						currentPhrase += " "+word.getWord();
-						currentPhraseNode.add(word);
-					} else {
-						bInsideNER=true;
-						currentPhrase = word.getWord();
-						currentPhraseNode.add(word);
-					}
-				} else {
-					if (bInsideNER){
-						if (currentPhrase.indexOf(' ')>-1){ // at least two tokens
-							extractedNERsWords.add(currentPhrase);
-							extractedNERs.add(currentPhraseNode);
-						}
-						currentPhrase = "";
-						currentPhraseNode = new ArrayList<ParseTreeNode>(); // start a fresh node list for the next phrase
-						bInsideNER=false;
-					} else {
-						// do nothing, continue scan
-					}
-				}
-			}
-			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
-				extractedNERs.add(currentPhraseNode);
-				extractedNERsWords.add(currentPhrase);
-			}
-
-			Set<String> foundSentimentWords = new HashSet<String>();
-			// now we extract phrases
-			List<List<ParseTreeNode>> phrases = pt.getPhrases();
-			for(List<ParseTreeNode> phrase: phrases){
-				// find a noun phrase under sentiment
-				try {
-					for(int i = phrase.size()-1; i>-1; i--){
-						ParseTreeNode word = phrase.get(i);
-						// parenthesized so that the 'not yet seen' test applies to both sentiment vocabularies
-						if ((isSentimentWord(word.getWord()) ||
-								sVocab.isSentimentWord(word.getWord())) && !foundSentimentWords.contains(word.getWord())){
-							foundSentimentWords.add(word.getWord());
-							System.out.println("Sentim = " + word.getWord() + " | Found opinionated phrase "+phrase.toString());
-							if (phrase.size()>1 && phrase.size()<7)
-								extractedSentimentPhrases.add(phrase);			
-							break;
-						}
-					}
-				} catch (Exception e) {
-					e.printStackTrace();
-				}
-			}
-
-		} 
-		
-		extractedSentimentPhrases = reduceExtractedPhrases(extractedSentimentPhrases);
-		
-		result.setExtractedNER(extractedNERs);
-		result.setExtractedNERWords(extractedNERsWords);
-		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-		return result;
-	}
-
-	private List<List<ParseTreeNode>> reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
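-		// drop a later phrase when its word string occurs inside an earlier phrase's word string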
-	    List<Integer> idsToDelete = new ArrayList<Integer>();
-		for(int i = 0; i<extractedSentimentPhrases.size(); i++){
-			for(int j = i+1; j<extractedSentimentPhrases.size(); j++){
-				String phrStr1 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(i));
-				String phrStr2 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(j));
-				if (phrStr1.indexOf(phrStr2)>-1)
-					idsToDelete.add(j);
-			}
-		}
-		List<List<ParseTreeNode>> resultPhrases = new ArrayList<List<ParseTreeNode>>();
-		for(int i = 0; i<extractedSentimentPhrases.size(); i++){
-			if (!idsToDelete.contains(i))
-				resultPhrases .add(extractedSentimentPhrases.get(i));
-				resultPhrases.add(extractedSentimentPhrases.get(i));
-		}
-		return resultPhrases;
-	}
-	// a token qualifies when it is tagged as a named entity of interest and its POS suggests phrase membership
-	private boolean isNERforPhraseExtraction(ParseTreeNode word){
-		return (word.getNe().equals("ORGANIZATION") || word.getNe().equals("LOCATION") || word.getNe().equals("PERSON")) &&
-				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN") ||
-						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT"));
-	}
-
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
deleted file mode 100644
index cb04154..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-
-public class PersonExtractor extends NamedEntityExtractor {
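-// Specializes extraction to PERSON entities. The scanning loop is largely duplicated from
-// NamedEntityExtractor because the parent's isNERforPhraseExtraction() is private and cannot be overridden.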
-	private boolean isNERforPhraseExtraction(ParseTreeNode word){
-		if ((word.getNe().equals("PERSON") ) &&
-				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")  ))
-			return true;
-
-		return false;
-
-	}
-	
-	public EntityExtractionResult extractEntities(String para){
-		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
-		List<String> extractedNERsWords = new ArrayList<String>();
-		List<List<ParseTreeNode>> extractedSentimentPhrases = 
-				new ArrayList<List<ParseTreeNode>>();
-		EntityExtractionResult result = new EntityExtractionResult();
-
-		ParseThicket pt = null;
-
-		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
-		pt = matcher.buildParseThicketFromTextWithRST(para);
-		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-		for(List<ParseTreeNode> sentence: nodeList){
-			System.out.println("   Processing sentence: "+ sentence);
-			boolean bInsideNER = false; 
-			String currentPhrase = "";
-			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
-			for(ParseTreeNode word: sentence){
-				if (isNERforPhraseExtraction(word)){
-					System.out.println("++Found word ="+word + " | NER="+ word.getNe());
-					if (bInsideNER){
-						currentPhrase += " "+word.getWord();
-						currentPhraseNode.add(word);
-					} else {
-						bInsideNER=true;
-						currentPhrase = word.getWord();
-						currentPhraseNode.add(word);
-					}
-				} else {
-					if (bInsideNER){
-						if (currentPhrase.indexOf(' ')>-1){ // at least two tokens
-							extractedNERsWords.add(currentPhrase);
-							extractedNERs.add(currentPhraseNode);
-						}
-						currentPhrase = "";
-						currentPhraseNode = new ArrayList<ParseTreeNode>(); // start a fresh node list for the next phrase
-						bInsideNER=false;
-					} else {
-						// do nothing, continue scan
-					}
-				}
-			}
-			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
-				extractedNERs.add(currentPhraseNode);
-				extractedNERsWords.add(currentPhrase);
-			}
-
-			Set<String> foundSentimentWords = new HashSet<String>();
-			// now we extract phrases
-			List<List<ParseTreeNode>> phrases = phraseBuilder.buildPT2ptPhrases(pt);
-			for(List<ParseTreeNode> phrase: phrases){
-				// find a noun phrase under sentiment
-				try {
-					for(int i = phrase.size()-1; i>-1; i--){
-						ParseTreeNode word = phrase.get(i);
-						// parenthesized so that the 'not yet seen' test applies to both sentiment vocabularies
-						if ((isSentimentWord(word.getWord()) ||
-								sVocab.isSentimentWord(word.getWord())) && !foundSentimentWords.contains(word.getWord())){
-							foundSentimentWords.add(word.getWord());
-							System.out.println("Found opinionated phrase "+phrase.toString());
-							extractedSentimentPhrases.add(phrase);			
-							break;
-						}
-					}
-				} catch (Exception e) {
-					e.printStackTrace();
-				}
-			}
-
-		} 
-		result.setExtractedNER(extractedNERs);
-		result.setExtractedNERWords(extractedNERsWords);
-		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-		return result;
-	}
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
deleted file mode 100644
index 86cd2dc..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class SentencePhraseGivenAWordGetter {
-	protected static Matcher matcher;
-	protected ArrayList<File> queue = new ArrayList<File>();
-	protected static PT2ThicketPhraseBuilder phraseBuilder;
-
-
-	static {
-		synchronized (SentencePhraseGivenAWordGetter.class) {
-			matcher = new Matcher();
-			phraseBuilder = new PT2ThicketPhraseBuilder();
-		}
-	}
-
-	public SentencePhraseGivenAWordGetter(){
-	}
-
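-	// returns, in the sentiment-phrases slot of the result, every parse-thicket phrase
-	// that contains the given keyword (case-insensitive)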
-	public EntityExtractionResult extractEntities(String para, String keyword){
-		List<List<ParseTreeNode>> extractedPhrases = new ArrayList<List<ParseTreeNode>>();
-
-		EntityExtractionResult result = new EntityExtractionResult();
-
-		ParseThicket pt =  matcher.buildParseThicketFromTextWithRST(para);
-
-		List<List<ParseTreeNode>> phrases = pt.getPhrases();
-		for(List<ParseTreeNode> phrase: phrases){
-			// find a noun phrase under sentiment
-			try {
-				for(int i = 0; i<phrase.size(); i++){
-					ParseTreeNode word = phrase.get(i);
-					if (word.getWord().equalsIgnoreCase(keyword)){
-						extractedPhrases.add(phrase);		
-						break;
-					}
-				}
-			} catch (Exception e) {
-				e.printStackTrace();
-			}
-		}
-
-		result.setExtractedSentimentPhrases(extractedPhrases);
-		return result;
-	}
-
-
-	public static void main(String[] args){
-		SentencePhraseGivenAWordGetter self = new SentencePhraseGivenAWordGetter();
-		EntityExtractionResult result = self.extractEntities("However i put a foam panel inside the main case if i do not have my headphones or an iPad to brace the mac book", 
-				"panel");
-		System.out.println(result.getExtractedSentimentPhrases());
-	}
-}
-
-
-/*
- * Three phrases are returned for the example in main() above:
- *
-[[<2>SBAR'i':FW, <3>SBAR'put':VBD, <4>SBAR'a':DT, <5>SBAR'foam':NN, <6>SBAR'panel':NN, <7>SBAR'inside':IN, <8>SBAR'the':DT, <9>SBAR'main':JJ, <10>SBAR'case':NN, <11>SBAR'if':IN, <12>SBAR'i':FW, 
-<13>SBAR'do':VBP, <14>SBAR'not':RB, <15>SBAR'have':VB, <16>SBAR'my':PRP$, <17>SBAR'headphones':NNS, <18>SBAR'or':CC, <19>SBAR'an':DT, <20>SBAR'iPad':NN, <21>SBAR'to':TO, 
-<22>SBAR'brace':VB, <23>SBAR'the':DT, <24>SBAR'mac':NN, <25>SBAR'book':NN], 
-
-[<3>VP'put':VBD, <4>VP'a':DT, <5>VP'foam':NN, <6>VP'panel':NN, <7>VP'inside':IN, <8>VP'the':DT, <9>VP'main':JJ, <10>VP'case':NN, <11>VP'if':IN, <12>VP'i':FW, <13>VP'do':VBP, 
-<14>VP'not':RB, <15>VP'have':VB, <16>VP'my':PRP$, <17>VP'headphones':NNS, <18>VP'or':CC, <19>VP'an':DT, <20>VP'iPad':NN, <21>VP'to':TO, <22>VP'brace':VB, <23>VP'the':DT, 
-<24>VP'mac':NN, <25>VP'book':NN], 
-
-[<4>NP'a':DT, <5>NP'foam':NN, <6>NP'panel':NN]]
-
-*/

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
deleted file mode 100644
index 1efe428..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import edu.stanford.nlp.ling.CoreAnnotation;
-
-import edu.stanford.nlp.trees.Tree;
-
-/**
- * Annotations specific to the Sentiment project.  In case there are
- * other projects that use the same RNN machinery, including the RNN
- * core annotations, this lets a sentence have a tree attached where
- * that tree specifically has the sentiment annotations.
- *
- * @author John Bauer
- */
-public class SentimentCoreAnnotations {
-
-  /**
-   * A tree which contains the annotations used for the Sentiment
-   * task.  After forwardPropagate has been called, the Tree will have
-   * prediction, etc. attached to it.
-   */
-  public static class SentimentAnnotatedTree implements CoreAnnotation<Tree> {
-    @Override
-    public Class<Tree> getType() {
-      return Tree.class;
-    }
-  }
-
-
-  /**
-   * The final label given for a sentence.  Set by the
-   * SentimentAnnotator and used by various forms of text output.
-   */
-  public static class SentimentClass implements CoreAnnotation<String> {
-    @Override
-    public Class<String> getType() {
-      return String.class;
-    }
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
deleted file mode 100755
index ad0f791..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.List;
-
-import opennlp.tools.stemmer.PStemmer;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-
-public class StopList {
-    private static StopList m_StopList = null;
-    private static Hashtable<String, HashSet<String>> m_stopHash = new Hashtable<String, HashSet<String>>();
-    public static final Log logger = LogFactory.getLog(StopList.class);
-    private static final String DEFAULT_STOPLIST = "STANDARD";
-    public static String resourceDir =null;
-    private static PStemmer stemmer = new PStemmer();
-
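-    // note: this static initializer runs at class-load time, before resourceDir can be set
-    // via getInstance(dir); that first LoadStopList() call therefore looks in "null/maps"
-    // and reports "Problem reading Stop Lists!"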
-    static {
-        synchronized (StopList.class) {
-            try {
-                LoadStopList();
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-    }
-
-    /**
-     * Get the StopList singleton instance.
-     * 
-     * @return The StopList
-     */
-    static public synchronized StopList getInstance() {
-
-        if (m_StopList == null) {
-            m_StopList = new StopList();
-
-            try {
-                m_StopList.LoadStopList();
-            } catch (Exception e) {
-
-            }
-        }
-        return m_StopList;
-    }
-
-    static public synchronized StopList getInstance(String dir) {
-        resourceDir = dir;
-        if (m_StopList == null) {
-            m_StopList = new StopList();
-
-            try {
-                m_StopList.LoadStopList();
-            } catch (Exception e) {
-
-            }
-        }
-        return m_StopList;
-    }
-
-    private static void LoadStopList() throws IOException {
-
-        File dir = new File(resourceDir + "/maps");
-        String[] children = dir.list();
-        if (children == null) {
-            System.err.println("Problem reading Stop Lists!");
-        } else {
-            for (int i = 0; i < children.length; i++) {
-                String fn = children[i];
-                if (fn.endsWith(".vcb")) {
-                    String fileName = resourceDir + "/maps/" + fn;
-                    File f = new File(fileName);
-                    loadStopListFile(f);
-                }
-            }
-        }
-    }
-
-    private static void loadStopListFile(File f) throws FileNotFoundException {
-
-        FileReader fileReader = new FileReader(f);
-        BufferedReader in = new BufferedReader(fileReader);
-
-        String str;
-        boolean fLine = true;
-        HashSet<String> t = new HashSet<String>();
-        String listName = "";
-
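-        // the first non-empty line of a .vcb file names the stop list; the remaining lines are its entries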
-        try {
-            while ((str = in.readLine()) != null) {
-                if (fLine && str.length() > 0) {
-                    fLine = false;
-                    listName = str;
-                } else {
-                    t.add(str);
-                }
-            }
-        } catch (IOException ioe) {
-
-        } finally {
-            try {
-                if (in != null) {
-                    in.close();
-                }
-                if (fileReader != null) {
-                    fileReader.close();
-                }
-            } catch (IOException ioe) {
-                ioe.printStackTrace();
-            }
-        }
-
-        if (listName.length() > 0) {
-            HashSet<String> l = m_stopHash.get(listName);
-            if (l != null) {
-                synchronized (l) {
-                    m_stopHash.put(listName, t);
-                }
-            } else {
-                m_stopHash.put(listName, t);
-            }
-        }
-    }
-
-    /**
-     * Is the given word in the stop words list? Uses the default "STANDARD"
-     * stoplist
-     * 
-     * @param str
-     *            The word to check
-     * @return is a stop word
-     */
-    public static boolean isStopWord(String str) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey(DEFAULT_STOPLIST))
-            retVal = m_stopHash.get(DEFAULT_STOPLIST).contains(str);
-        return retVal;
-    }
-
-    public static boolean isFirstName(String str) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey("FIRST_NAMES"))
-            retVal = m_stopHash.get("FIRST_NAMES").contains(str.toUpperCase());
-        return retVal;
-    }
-
-    public String getRandomFirstName() {
-        HashSet<String> firstNames = m_stopHash.get("FIRST_NAMES");
-        int indexRand = (int) (Math.random() * firstNames.size());
-        Iterator<String> iter = firstNames.iterator();
-        for (int i = 0; i < indexRand; i++) {
-            iter.next();
-        }
-        return iter.next().toLowerCase();
-    }
-
-    public static boolean isCommonWord(String str) {
-        if (str == null)
-            return true;
-        String stemmed="";
-		try {
-			stemmed = stemmer.stem(str).toLowerCase();
-		} catch (Exception e) {
-			// stemming exceptions are not informative, just ignore this word
-			//e.printStackTrace();
-		}
-
-        boolean retVal = false;
-        if (m_stopHash.containsKey("ENG_DICT"))
-            retVal = m_stopHash.get("ENG_DICT").contains(stemmed);
-        return retVal;
-    }
-
-    public boolean isCommonEventWord(String str) {
-        if (str == null)
-            return true;
-        boolean retVal = false;
-
-        try {
-            String stemmed = str.toLowerCase();
-
-            if (m_stopHash.containsKey("fREQUENTEVENTNAMEWORDS"))
-                retVal = m_stopHash.get("fREQUENTEVENTNAMEWORDS").contains(
-                        stemmed);
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return retVal;
-    }
-
-    /**
-     * Is the given word in the stop words list provided?
-     * 
-     * @param str
-     *            The word to check
-     * @param stop_list
-     *            the name of the stoplist to check against
-     * @return is a stop word
-     */
-    public static boolean isStopWord(String str, String stop_list) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey(stop_list))
-            retVal = m_stopHash.get(stop_list).contains(str);
-        return retVal;
-    }
-
-    public boolean isStopWordAll(String str) {
-        return isStopWord(str);
-    }
-
-    public HashSet<String> getStopListMap(String name) {
-        return m_stopHash.get(name);
-    }
-
-    public static List<List<String>> preFilterCommonEnglishExpressions(
-            List<String> userLikes) {
-        List<List<String>> results = new ArrayList<List<String>>();
-
-        List<String> resultUserLikes = new ArrayList<String>(), potentialCategs = new ArrayList<String>();
-        if (userLikes.size() < 6) {// too short, do not filter
-            results.add(userLikes);
-            results.add(potentialCategs);
-            return results;
-
-        }
-
-        for (String like : userLikes) {
-            like = like.toLowerCase();
-            if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
-                logger.info("removed isAlphanumeric " + like);
-                continue;
-            }
-
-            if (StringUtils.isNumeric(like)) {
-                logger.info("removed isNumericSpace " + like);
-                continue;
-            }
-
-            if (like.length() < 4) {
-                logger.info("removed too short likes " + like);
-                continue;
-            }
-            boolean existFirstName = false, allWordsCommonEnglish = true, bStop = false;
-            String[] comps = like.split(" ");
-            StringBuffer buf = new StringBuffer();
-            for (String word : comps) {
-                boolean isCommon = isCommonWord(word);
-                boolean isName = isFirstName(word);
-                if (!isCommon)
-                    allWordsCommonEnglish = false;
-                if (isName)
-                    existFirstName = true;
-                if (isStopWord(word) || word.length() < 3)
-                    bStop = true;
-                else
-                    buf.append(word + " ");
-            } // stop words are excluded from the retained phrase
-            if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
-                logger.info("moved to category:  NoFirstName+AllCommonEng+ShorterThan3 "
-                        + like);
-
-                continue;
-            }
-            if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
-                logger.info("moved to category: NoFirstName+AllCommonEng+Short1word "
-                        + like);
-                potentialCategs.add(like);
-                continue;
-            }
-
-            if (existFirstName && comps.length == 1) {
-                logger.info("removed : only first name, no last name " + like);
-
-                continue;
-            }
-
-            resultUserLikes.add(buf.toString().trim());
-
-        }
-
-        resultUserLikes = new ArrayList<String>(new HashSet<String>(
-                resultUserLikes));
-        if (resultUserLikes.size() > 1) {
-            results.add(resultUserLikes);
-            results.add(potentialCategs);
-            return results;
-        }
-
-        else {// do not do reduction
-            results.add(userLikes);
-            results.add(potentialCategs);
-            return results;
-        }
-    }
-
-    public static boolean isAcceptableIndividualLikes(String like) {
-        StopList finder = StopList.getInstance();
-        like = like.toLowerCase();
-        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
-            logger.info("removed isAlphanumeric " + like);
-            return false;
-        }
-
-        if (StringUtils.isNumeric(like)) {
-            logger.info("removed isNumericSpace " + like);
-            return false;
-        }
-
-        if (like.length() < 4) {
-            logger.info("removed too short likes " + like);
-            return false;
-        }
-        boolean existFirstName = false, allWordsCommonEnglish = true, bStop = false;
-        String[] comps = like.split(" ");
-        StringBuffer buf = new StringBuffer();
-        for (String word : comps) {
-            boolean isCommon = finder.isCommonWord(word);
-            boolean isName = finder.isFirstName(word);
-            if (!isCommon)
-                allWordsCommonEnglish = false;
-            if (isName)
-                existFirstName = true;
-            if (finder.isStopWord(word) || word.length() < 3)
-                bStop = true;
-            else
-                buf.append(word + " ");
-        } // stop words are excluded from the retained phrase
-        if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
-            logger.info("  NoFirstName+AllCommonEng+ShorterThan3 " + like);
-
-            return false;
-        }
-        if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
-            logger.info(" NoFirstName+AllCommonEng+Short1word " + like);
-
-            return false;
-        }
-
-        if (existFirstName && comps.length == 1) {
-            logger.info("removed : only first name, no last name " + like);
-
-            return false;
-        }
-
-        return true;
-    }
-
-    @SuppressWarnings("all")
-    public static void main(String[] args) {
-
-        StopList list = StopList
-                .getInstance("/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/");
-        Boolean b = list.isCommonWord("demonstration");
-
-        String fname = list.getRandomFirstName();
-
-        b = list.isCommonEventWord("tour");
-        b = list.isCommonEventWord("dance");
-        b = list.isCommonEventWord("salsa");
-        b = list.isCommonEventWord("center");
-        b = list.isCommonEventWord("family");
-
-      
-
-        b = isAcceptableIndividualLikes("forest glen");
-        b = isAcceptableIndividualLikes("drive");
-        b = isAcceptableIndividualLikes("house");
-        b = isAcceptableIndividualLikes("Timothy Kloug");
-        b = isAcceptableIndividualLikes("Mamma Mia");
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
deleted file mode 100644
index f4d56aa..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-
-public class TopicAsOpinionMinerRunner {
-	private List<File> queue = new ArrayList<File>(); // initialized so addFiles() cannot throw an NPE
-	private final static String reviewSource = "/Users/bgalitsky/Documents/solr/example/exampledocs/publication_page0.json";
-	NamedEntityExtractor neExtractor = new NamedEntityExtractor();
-	Set<String> allPhrases = new HashSet<String>();
-	
-	public void processJSONfileWithReviews(){
-		List<String[]> report = new ArrayList<String[]>();
-		report.add(new String[] { "text", "phrases of potential interest list" , });
-
-		
-		String content=null;
-		try {
-			content = FileUtils.readFileToString(new File(reviewSource));
-		} catch (IOException e) {
-			e.printStackTrace();
-		}
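-		// the JSON is scanned textually with substringsBetween() rather than parsed,
-		// so only the "summary" fields are picked up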
-		String[] texts = StringUtils.substringsBetween(content, "summary\":\"", "\"");
-		for(String text: texts){
-			report.clear(); // the report is rebuilt from scratch for each text; note this also discards the header row
-			EntityExtractionResult result = neExtractor.extractEntities(text);
-			//report.add(new String[]{text});
-			allPhrases.addAll(result.extractedNERWords);
-			allPhrases = new HashSet<String>(allPhrases);
-			for(String p: allPhrases){
-				report.add(new String[]{p});
-			}
-			/*
-			String[] phrases = (String[])result.extractedNERWords.toArray(new String[0]);
-			if (phrases!=null && phrases.length>0)
-				report.add(phrases);
-			*/
-			/*report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
-			List<String> stringPhrases = new ArrayList<String>(),
-					nodePhrases = new ArrayList<String>();
-			for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){
-				String buf = "", nodeBuf="";
-				for(ParseTreeNode ch: chList){
-					buf+=ch.getWord()+ " ";
-					nodeBuf+=ch.toString()+ " ";
-				}
-				stringPhrases.add(buf.trim());
-				nodePhrases.add(nodeBuf.trim());
-			}
-			report.add((String[])stringPhrases.toArray(new String[0]));
-			report.add((String[])nodePhrases.toArray(new String[0]));
-			*/
-			
-			ProfileReaderWriter.writeReport(report, "phrasesExtracted3.csv");
-		}
-	}
-
-	private void addFiles(File file) {
-
-		if (!file.exists()) {
-			System.out.println(file + " does not exist.");
-			return;
-		}
-		if (file.isDirectory()) {
-			for (File f : file.listFiles()) {
-				if (f.getName().startsWith("."))
-					continue;
-				addFiles(f);
-				System.out.println(f.getName());
-			}
-		} else {
-			queue.add(file);
-		}
-	}
-	
-	public static void main(String[] args){
-		TopicAsOpinionMinerRunner runner = new TopicAsOpinionMinerRunner();
-		runner.processJSONfileWithReviews();
-
-	}
-}
-
-/*
-	public void processDirectory(String path){
-		List<String[]> report = new ArrayList<String[]>();
-		report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" });
-
-		List<String> allNamedEntities = new ArrayList<String>();
-
-		addFiles(new File(path));
-		for(File f: queue){
-			List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst();
-			List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond();
-			report.add(new String[]{ f.getName(), entities.toString(),  opinions.toString()});	
-			ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv");
-
-			allNamedEntities.addAll(entities);
-
-			allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities ));
-
-
-		}
-		ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv");
-	} 
-} */

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
deleted file mode 100644
index a704f22..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.matching.Matcher;
-
-public class TopicPhraseExtractor {
-	Matcher matcher = new Matcher();
-
-	// sentiment vocabulary for phrase under the focus of sentiment
-	SentimentVocab sVocab = SentimentVocab.getInstance();
-	// used to create an XML with phrases; the same class handles both acronyms & phrases
-
-	public EntityExtractionResult extractEntities(String para){
-		EntityExtractionResult result = new EntityExtractionResult();
-		List<String> extractedNerPhrasesStr = new ArrayList<String>(),
-				extractedNerExactStr = new ArrayList<String>(),
-				extractedSentimentPhrasesStr = new ArrayList<String>(),
-				extractedNONSentimentPhrasesStr = new ArrayList<String>(),
-				extractedNerPhraseTags = new ArrayList<String>();
-		// no need to change to extract more/less phrases
-		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
-
-		List<List<ParseTreeNode>> extractedSentimentPhrases = new ArrayList<List<ParseTreeNode>>(),
-				extractedNONSentimentPhrases = new ArrayList<List<ParseTreeNode>>(),
-				extractedNerPhrases = new ArrayList<List<ParseTreeNode>>(),
-				extractedNerExactPhrases = new ArrayList<List<ParseTreeNode>>();
-		//TODO document examples / cases for each rule
-		// now we extract phrases
-		List<List<ParseTreeNode>> phrases = pt.getPhrases();
-		List<Float> sentimentProfile = pt.getSentimentProfile();
-		for(List<ParseTreeNode> phrase: phrases){
-
-			// find a noun phrase under sentiment
-			boolean bAccept = true, bNER = false;
-
-			String phraseStr = asString(phrase);
-
-
-			if (!phrase.get(0).getPhraseType().equals("NP") && !phrase.get(0).getPhraseType().equals("VP") )	
-				bAccept = false;
-
-			boolean bSentiment = false;
-			for(ParseTreeNode word: phrase){
-				if (sVocab.isSentimentWord(word.getWord())){
-					bSentiment=true;
-					break;
-				}
-			}
-
-			String nerTagConfirmed = null;
-			for(ParseTreeNode word: phrase){
-				// check whether this word carries a named-entity tag
-				String nerTag = isNERforPhraseExtraction(word);
-				if (nerTag!=null){
-					bNER = true;
-					nerTagConfirmed = nerTag;
-				}
-
-				// skip phrases containing numbers (CD) or pronouns (PRP)
-				if (word.getPos().startsWith("CD") || word.getPos().indexOf("PRP")>-1 )
-					bAccept = false;
-			}
-			if (!bAccept)
-				continue;
-			// length bounds: accept phrases of two to seven tokens only
-			if (phrase.size()>7 || phrase.size()<2)
-				bAccept = false;
-
-			if (phrase.get(0).getPos().equals("DT") && phrase.size()<3)
-				bAccept = false;
-			if (!bAccept)
-				continue;
-
-			String cleanedPhraseStr = cleanPhraseString(phraseStr);
-			if (cleanedPhraseStr==null)
-				bAccept = false;
-
-			if (bAccept){
-				if (bNER){
-					extractedNerPhrases.add(phrase);
-					extractedNerPhrasesStr.add(phraseStr);
-					extractedNerPhraseTags.add(nerTagConfirmed );
-					// forming exact NER
-					List<ParseTreeNode> phraseNER_exact = new ArrayList<ParseTreeNode>();
-					String nerExactStr = "";
-					for(ParseTreeNode word: phrase){
-						String ner = isNERforPhraseExtraction(word);
-						if (ner!=null && ner.equals(nerTagConfirmed)){
-							phraseNER_exact.add(word);
-							nerExactStr+=" "+word.getWord();
-						}
-					}
-					nerExactStr = nerExactStr.trim(); // trim() returns a new string; the result must be kept
-					extractedNerExactPhrases.add(phraseNER_exact);
-					extractedNerExactStr.add(nerExactStr);
-				}
-				else if (bSentiment) {
-					extractedSentimentPhrasesStr.add(cleanedPhraseStr);					
-					extractedSentimentPhrases.add(phrase);
-				} else {
-					extractedNONSentimentPhrasesStr.add(cleanedPhraseStr);					
-					extractedNONSentimentPhrases.add(phrase);
-				}
-			}
-		} 
-
-		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-		result.setExtractedSentimentPhrasesStr(extractedSentimentPhrasesStr);
-
-		result.setExtractedNONSentimentPhrases(extractedNONSentimentPhrases);
-		result.setExtractedNONSentimentPhrasesStr(extractedNONSentimentPhrasesStr);
-		
-		result.setExtractedNerPhrases(extractedNerPhrases);
-		result.setExtractedNerPhrasesStr(extractedNerPhrasesStr);
-		result.setExtractedNerPhraseTags(extractedNerPhraseTags);
-		
-		result.setExtractedNerExactPhrases(extractedNerExactPhrases);
-		result.setExtractedNerExactStr(extractedNerExactStr);
-
-		result.setSentimentProfile(sentimentProfile );
-
-		return result;
-	}
-
-	private String cleanPhraseString(String phraseStr) {
-		String p = phraseStr.toLowerCase();
-
-		if (p.startsWith("*") || p.startsWith("&") || p.startsWith("$"))
-			return null;
-
-		if (p.startsWith("this ") || p.startsWith("other "))
-			return null;
-
-		if (p.startsWith("a "))
-			p = p.substring(2, p.length());
-		if (p.startsWith("the "))
-			p = p.substring(4, p.length());
-		if (p.startsWith(", "))
-			p = p.substring(2, p.length());
-
-		return p;
-	}
-
-	private String asString(List<ParseTreeNode> phrase) {
-		String buf = "";
-		for(ParseTreeNode p: phrase)
-			buf+=p.getWord()+" ";
-		return buf.trim();
-	}
-
-	private String isNERforPhraseExtraction(ParseTreeNode word){
-		if (word.getNe() == null)
-			return null;
-		
-
-		if (!(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-				word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")))
-			return null;
-				
-
-		if (word.getNe().equals("ORGANIZATION"))
-				return "ORGANIZATION";
-		if(word.getNe().equals("LOCATION"))
-			return "LOCATION";
-					
-		if(word.getNe().equals("PERSON") ) 
-			return "PERSON";
-		
-		if(word.getNe().equals("MONEY") ) 
-			return "MONEY";
-		if(word.getNe().equals("DATE") ) 
-			return "DATE";
-		if(word.getNe().equals("TIME") ) 
-			return "TIME";
-
-		return null;
-
-	}
-}
-
-/*
- * Naïve sentiment prediction systems work just by looking at words in isolation, giving positive points for positive words and negative points for negative words, and then summing up these points. That way, the order of words is ignored and important information is lost. The deep learning model of (Socher et al 2013) builds a representation of whole sentences based on the sentence structure. It computes the sentiment based on how words compose the meaning of longer phrases. However, in most applications, taking just individual sentences into account does not give accurate results; rhetoric information needs to be taken into account to determine the overall sentiment of a paragraph, and then propagated back to the individual sentence level.
- */
-
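For reference, the "naïve" lexicon-summing baseline that the comment above contrasts with compositional models can be sketched as below. This is an illustrative fragment, not part of the removed sources; the class name and the lexicon contents are assumed for the example.

    import java.util.Map;

    public class NaiveSentimentBaseline {
        // hypothetical polarity lexicon: +1 for positive words, -1 for negative ones
        private static final Map<String, Integer> LEXICON =
                Map.of("good", 1, "great", 1, "bad", -1, "awful", -1);

        // sums per-word scores; word order and sentence structure are ignored,
        // which is exactly the weakness the comment above describes
        public static int score(String text) {
            int total = 0;
            for (String token : text.toLowerCase().split("\\W+")) {
                total += LEXICON.getOrDefault(token, 0);
            }
            return total; // > 0 positive, < 0 negative, 0 neutral/unknown
        }
    }

Under this baseline "not bad" scores the same as "bad", since "not" is not in the lexicon; a compositional model can invert the polarity instead.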

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
deleted file mode 100644
index 6de3180..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import au.com.bytecode.opencsv.CSVWriter;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-
-public class TwitterEngineRunner {
-	private List<File> queue;
-	private final static String twSource = "/Users/bgalitsky/Documents/workspace/TwitterMiner/data/TwitterArtistsDynamicsTot12_07.csv";
-	TwitterFilter neExtractor = new TwitterFilter();
-	private static int iWind = 80;
-
-	public void processTweetFile(int nRun){
-		List<String[]> report = new ArrayList<String[]>(), ful_less =  new ArrayList<String[]>();
-		List<String> meaningLESS = new ArrayList<String>(), meaningFUL = new ArrayList<String>();
-		report.add(new String[] { "text", "phrases of potential interest list" , });
-
-		List<String[]> texts = ProfileReaderWriter.readProfiles(twSource);
-		int offset = iWind*nRun;
-		
-		//for(int i=offset; i< offset+iWind; i++){
-			
-		//	String[] text = texts.get(i);
-		for(String[] text: texts){
-			if (text==null || text.length<4)
-				continue;
-			// dedupe the cells of this row before processing
-			List<String> textDeduped = new ArrayList<String>(new HashSet<String>(Arrays.asList(text)));
-			EntityExtractionResult result = null;
-
-			for(int nInLine=3; nInLine<textDeduped.size(); nInLine++){
-				if (textDeduped.get(nInLine).length()>180)
-					continue;
-				
-				// String.replace() is literal, so the old "/\\bs\\@+/ig" argument never
-				// matched anything; strip @handles with an actual regex instead
-				String cleanedTweet = textDeduped.get(nInLine).replaceAll("@\\w+", "");
-				try {
-					result = neExtractor.extractEntities(cleanedTweet);
-				} catch (Exception e) {
-					e.printStackTrace();
-					continue;
-				}
-				report.add(new String[]{text[0],text[nInLine]});
-				report.add((String[])result.extractedNERWords.toArray(new String[0]));
-				//report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
-				List<String> stringPhrases = new ArrayList<String>(),
-						nodePhrases = new ArrayList<String>();
-				boolean bMeaningf = false;
-
-				//stringPhrases.add(""); nodePhrases.add(""); // to make report more readable
-				for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){
-					String buf = "", nodeBuf="";
-					for(ParseTreeNode ch: chList){
-						buf+=ch.getWord()+ " ";
-						nodeBuf+=ch.toString()+ " ";
-					}
-					stringPhrases.add(buf.trim());
-					nodePhrases.add(nodeBuf.trim());
-				}
-				// a tweet counts as meaningful if each of its first two phrases
-				// contains a verb phrase (VP) or proper noun (NNP) node
-				if (nodePhrases.size()>1){
-					if ((nodePhrases.get(0).indexOf(">VP'")>-1 || nodePhrases.get(0).indexOf(">NNP'")>-1) &&
-							(nodePhrases.get(1).indexOf(">VP'")>-1 || nodePhrases.get(1).indexOf(">NNP'")>-1)){
-						bMeaningf = true;
-
-					}
-				}
-
-				report.add((String[])stringPhrases.toArray(new String[0]));
-				report.add((String[])nodePhrases.toArray(new String[0]));
-				if (bMeaningf){
-					report.add(new String[]{"===", "MEANINGFUL tweet"});
-					if (!meaningFUL.contains(cleanedTweet))
-						meaningFUL.add(cleanedTweet);
-				} else {
-					if (!meaningLESS.contains(cleanedTweet))
-						meaningLESS.add(cleanedTweet);
-				}
-
-				int count = 0;
-				ful_less.clear();
-				for(String less: meaningLESS ){
-					String fl = "";
-					if (count<meaningFUL.size())
-						fl = meaningFUL.get(count);
-					ful_less.add(new String[]{less, fl});
-					count++;
-				}
-
-				report.add(new String[]{"-----------------------------------------------------"});
-				ProfileReaderWriter.writeReport(report, "phrasesExtractedFromTweets3_"+nRun+".csv");
-				ProfileReaderWriter.writeReport(ful_less, "ful_lessTweets3_"+nRun+".csv");
-				
-			}
-		}
-	}
-
-
-	public static void main(String[] args){
-		TwitterEngineRunner runner = new TwitterEngineRunner();
-		int nRun = Integer.parseInt(args[0]);
-		runner.processTweetFile(nRun);
-
-	}
-}
-
-/*
-	public void processDirectory(String path){
-		List<String[]> report = new ArrayList<String[]>();
-		report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" });
-
-		List<String> allNamedEntities = new ArrayList<String>();
-
-		addFiles(new File(path));
-		for(File f: queue){
-			List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst();
-			List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond();
-			report.add(new String[]{ f.getName(), entities.toString(),  opinions.toString()});	
-			ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv");
-
-			allNamedEntities.addAll(entities);
-
-			allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities ));
-
-
-		}
-		ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv");
-	} 
-} */
-
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
deleted file mode 100644
index 0e5053d..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class TwitterFilter {
-	protected static Matcher matcher;
-	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
-	protected ArrayList<File> queue = new ArrayList<File>();
-	protected static PT2ThicketPhraseBuilder phraseBuilder;
-	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
-	String resourceDirSentimentList = null;
-	Set<String> sentimentVcb = new HashSet<String> ();
-
-	static {
-		synchronized (TwitterFilter.class) {
-			matcher = new Matcher();
-			phraseBuilder = new PT2ThicketPhraseBuilder();
-		}
-	}
-
-	public TwitterFilter(){
-		try {
-			resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
-		} catch (IOException e) {
-			e.printStackTrace();
-		}
-		List<String[]> sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
-		for(String[] line: sentimentList){
-			sentimentVcb.add(line[0]);
-		}
-	}
-
-	private boolean isSentimentWord(String word){
-		return sentimentVcb.contains(word);
-	}
-
-	public EntityExtractionResult extractEntities(String para){
-		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
-		List<String> extractedNERsWords = new ArrayList<String>();
-		List<List<ParseTreeNode>> extractedSentimentPhrases = 
-				new ArrayList<List<ParseTreeNode>>();
-		EntityExtractionResult result = new EntityExtractionResult();
-
-		ParseThicket pt = null;
-
-		System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
-		pt = matcher.buildParseThicketFromTextWithRST(para);
-		List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-		for(List<ParseTreeNode> sentence: nodeList){
-			System.out.println("   Processing sentence: "+ sentence);
-			boolean bInsideNER = false; 
-			String currentPhrase = "";
-			List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>(); 
-			for(ParseTreeNode word: sentence){
-				if (isNERforPhraseExtraction(word)){
-					System.out.println("++Found word ="+word + " | NER="+ word.getNe());
-					if (bInsideNER){
-						currentPhrase += " "+word.getWord();
-						currentPhraseNode.add(word);
-					} else {
-						bInsideNER=true;
-						currentPhrase = word.getWord();
-						currentPhraseNode.add(word);
-					}
-				} else {
-					if (bInsideNER){
-						if (currentPhrase.indexOf(' ')>-1){ // at least two tokens
-							extractedNERsWords.add(currentPhrase);
-							extractedNERs.add(currentPhraseNode);
-						}
-						currentPhrase = "";
-						currentPhraseNode = new ArrayList<ParseTreeNode>(); // start a fresh node list
-						bInsideNER=false;
-					} else {
-						// do nothing, continue scan
-					}
-				}
-			}
-			if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
-				extractedNERs.add(currentPhraseNode);
-				extractedNERsWords.add(currentPhrase);
-			}
-
-		}
-
-		// extract opinionated phrases once per paragraph: the phrase list comes from
-		// the whole parse thicket, so running this inside the sentence loop duplicated work
-		Set<String> foundSentimentWords = new HashSet<String>();
-		List<List<ParseTreeNode>> phrases = pt.getPhrases();
-		for(List<ParseTreeNode> phrase: phrases){
-			// find a noun phrase under sentiment
-			try {
-				for(int i = phrase.size()-1; i>-1; i--){
-					ParseTreeNode word = phrase.get(i);
-					if ((isSentimentWord(word.getWord()) || sVocab.isSentimentWord(word.getWord()))
-							&& !foundSentimentWords.contains(word.getWord())){
-						foundSentimentWords.add(word.getWord());
-						System.out.println("Found opinionated phrase "+phrase.toString());
-						extractedSentimentPhrases.add(phrase);
-						break;
-					}
-				}
-			} catch (Exception e) {
-				e.printStackTrace();
-			}
-		}
-		result.setExtractedNER(extractedNERs);
-		result.setExtractedNERWords(extractedNERsWords);
-		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-		return result;
-	}
-
-
-
-	private boolean isNERforPhraseExtraction(ParseTreeNode word){
-		if (word.getNe() == null) // not every token carries an NE tag
-			return false;
-
-		if ((word.getNe().equals("ORGANIZATION") || word.getNe().equals("LOCATION") || word.getNe().equals("PERSON")) &&
-				(word.getPos().startsWith("NN") || word.getPos().startsWith("PR") || word.getPos().startsWith("IN") ||
-						word.getPos().startsWith("JJ") || word.getPos().startsWith("DT")))
-			return true;
-
-		return false;
-	}
-
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
deleted file mode 100644
index a138de6..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-import opennlp.tools.similarity.apps.utils.PageFetcher;
-
-public class YouTubeMiner {
-	private PageFetcher fetcher = new PageFetcher();
-	public YouTubeMinerResult getData(String url){
-		YouTubeMinerResult result = new YouTubeMinerResult();
-		String content = fetcher.fetchOrigHTML(url);
-		try {
-			FileUtils.writeStringToFile(new File(url.replace(':', '_').replace('/', '_')), content);
-		} catch (IOException e1) {
-			e1.printStackTrace();
-		}
-		if (url.indexOf("channel")>-1){
-			try { //subscriber-count" title="30" 
-				String subscribersStr = StringUtils.substringBetween(content,"subscriber-count", "tabindex");
-				String dirtyNumber = StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
-				if (dirtyNumber!=null){ // substringBetween() returns null when the markers are absent
-					String cleanNumber = dirtyNumber.replaceAll("[^\\x00-\\x7F]", "");
-					result.subscribers = Integer.parseInt(cleanNumber);
-				} else {
-					System.err.println("Not found data for 'subscriber-count', 'tabindex'");
-				}
-			} catch (NumberFormatException e) {
-				e.printStackTrace();
-			}
-		} else {
-			try {
-
-				String subscribersStr = StringUtils.substringBetween(content,"subscriber-count", "tabindex");
-				String dirtyNumber = StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
-				if (dirtyNumber!=null){ // guard before dereferencing: substringBetween() may return null
-					result.subscribers = Integer.parseInt(dirtyNumber.replace(" ", ""));
-				} else {
-					System.err.println("Not found data for 'subscriber-count', 'tabindex'");
-				}
-
-				String viewsStrDirty = StringUtils.substringBetween(content,
-						//"div class=\"watch-view-count\">"," views</div>");
-						//view-count">12 \u043f\u0440\u043e\u0441\u043c\u043e\u0442\u0440\u043e\u0432</div>
-						"view-count","<div>");
-				String viewsStr = StringUtils.substringBetween(viewsStrDirty,">", " ");
-				if (viewsStr!=null){
-					int views = Integer.parseInt(viewsStr );
-					result.views = views;
-				} else {
-					System.err.println("Not found data for 'view-count','<div>'");
-				}
-			} catch (NumberFormatException e) {
-				e.printStackTrace();
-			}
-		}
-
-		return result;
-	}
-
-	public static void main(String[] args){
-		YouTubeMiner  miner = new YouTubeMiner();
-		System.out.println(miner.getData("https://www.youtube.com/channel/UC-maQbG5eUS5c1wmaTnLwTA"));
-		System.out.println(miner.getData("https://www.youtube.com/watch?v=U6X4VT9dVr8"));
-		System.out.println(miner.getData("https://www.youtube.com/watch?v=kH-AQnta714"));
-		System.out.println(miner.getData("https://www.youtube.com/watch?v=pWb50Kn1ShQ"));
-	}
-}
-
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
deleted file mode 100644
index 86c8e9d..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-public class YouTubeMinerResult {
-	public int likes;
-	public int subscribers;
-	public int views;
-	
-	boolean isPromisingYoungIndividual(){
-		// "promising": a modest but non-trivial audience on either metric
-		if (subscribers>10 && subscribers<20000)
-			return true;
-		if (views>10 && views<20000)
-			return true;
-		return false;
-	}
-	
-	@Override
-	public String toString(){
-		return "views: " + views + " | subscribers: " + subscribers;
-	}
-}