You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/03/29 00:16:44 UTC

svn commit: r1306623 - in /opennlp/sandbox/opennlp-similarity: ./ resources/ src/main/java/opennlp/tools/similarity/apps/taxo_builder/ src/main/java/opennlp/tools/similarity/apps/utils/ src/main/java/opennlp/tools/textsimilarity/chunker2matcher/

Author: bgalitsky
Date: Wed Mar 28 22:16:43 2012
New Revision: 1306623

URL: http://svn.apache.org/viewvc?rev=1306623&view=rev
Log:
 OPENNLP-436
Auto Taxonomy Learner for Search Relevance Improvement based on Similarity

Added:
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java
Removed:
    opennlp/sandbox/opennlp-similarity/resources/
Modified:
    opennlp/sandbox/opennlp-similarity/pom.xml
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java

Modified: opennlp/sandbox/opennlp-similarity/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/pom.xml?rev=1306623&r1=1306622&r2=1306623&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/pom.xml (original)
+++ opennlp/sandbox/opennlp-similarity/pom.xml Wed Mar 28 22:16:43 2012
@@ -70,6 +70,11 @@
 			<artifactId>tika-core</artifactId>
 			<version>0.7</version>
 		</dependency>
+		<dependency>
+			<groupId>com.thoughtworks.xstream</groupId>
+			<artifactId>xstream</artifactId>
+			<version>1.4.2</version>
+		</dependency>
 	</dependencies>
 	
 	<build>

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
/**
 * Adapter that makes it possible to use legacy Prolog (.ari) files as the basis
 * for the taxonomy learner. It strips the Prolog syntax from each clause and
 * yields plain word chains which can be used by the taxonomy extension process.
 */
public class AriAdapter {
	// Example of an input clause: income_taks(state,company(cafeteria,_)):-do(71100).
	// Maps a head lemma (e.g. "income_tax") to the word chains associated with it.
	Map<String, List<List<String>>> lemma_AssocWords = new HashMap<String, List<List<String>>>();

	/**
	 * Reads a Prolog .ari file and populates {@link #lemma_AssocWords} with the
	 * cleaned word chains extracted from each clause. I/O errors are logged to
	 * stderr and leave the map partially populated (legacy behavior preserved).
	 *
	 * @param fileName path of the .ari file to parse
	 */
	public void getChainsFromARIfile(String fileName) {
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
			String line;
			while ((line = br.readLine()) != null) {
				// Skip short lines, comments ('%') and directives (':').
				if (line.length() < 10 || line.startsWith("%") || line.startsWith(":"))
					continue;
				// Collapse Prolog punctuation into '&' separators, then into spaces;
				// also fix the recurring "taks" -> "tax" misspelling in the data.
				String chain0 = line.replace("_,", "&").replace("_)", "&").replace(":-do(", "&").replace(":-var", "&")
						.replace("taks", "tax")
						.replace(":- do(", "&").replace("X=", "&").replace(":-", "&").replace("[X|_]", "&")
						.replace("nonvar", "&").replace("var", "&")
						.replace('(', '&').replace(')', '&').replace(',', '&').replace('.', '&')
						.replace("&&&", "&").replace("&&", "&").replace("&", " ");
				String[] chains = chain0.split(" ");
				List<String> chainList = new ArrayList<String>();
				for (String word : chains) {
					// Keep only sufficiently long words with no digits 0-5
					// (numeric tokens are Prolog clause ids, not taxonomy words).
					if (word != null && word.length() > 2 && word.indexOf("0") < 0 && word.indexOf("1") < 0
							&& word.indexOf("2") < 0 && word.indexOf("3") < 0 && word.indexOf("4") < 0
							&& word.indexOf("5") < 0)
						chainList.add(word);
				}
				if (chains.length < 1 || chainList.size() < 1 || chainList.get(0).length() < 3)
					continue;
				// The first surviving token is the entry (head lemma); the rest form its chain.
				String entry = chainList.get(0);
				if (entry.length() < 3)
					continue;
				chainList.remove(entry);
				List<List<String>> res = lemma_AssocWords.get(entry);
				if (res == null) {
					List<List<String>> resList = new ArrayList<List<String>>();
					resList.add(chainList);
					lemma_AssocWords.put(entry, resList);
				} else {
					// The list is already in the map; mutating it in place is enough
					// (the original re-put of the same reference was redundant).
					res.add(chainList);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Fix: the reader was previously leaked; always release the file handle.
			if (br != null) {
				try {
					br.close();
				} catch (Exception closeIgnored) {
					// best-effort close; nothing sensible to do here
				}
			}
		}
	}

	public static void main(String[] args) {
		AriAdapter ad = new AriAdapter();
		ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
		System.out.println(ad.lemma_AssocWords);
	}
}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,5 @@
+package opennlp.tools.similarity.apps.taxo_builder;
+
/**
 * The natural languages supported by the taxonomy builder.
 */
public enum Languages {
	ENGLISH,
	SPANISH,
	GERMAN,
	FRENCH,
	ITALIAN
}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import opennlp.tools.similarity.apps.utils.FileHandler;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+import com.thoughtworks.xstream.XStream;
+
+
+/**
+ * This class can be used to generate scores based on the overlapping between a text and a given taxonomy.
+ *
+ */
+public class TaxoQuerySnapshotMatcher {
+	
+	ParserChunker2MatcherProcessor sm ;
+    //XStream xStream= new XStream();
+    Map<String, List<List<String>>> lemma_ExtendedAssocWords;
+    TaxonomySerializer taxo;
+    private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
+    
+    
+    public TaxoQuerySnapshotMatcher() {
+    	sm = ParserChunker2MatcherProcessor.getInstance();
+        taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");    	
+	}
+	/**
+	 * Can be used to generate scores based on the overlapping between a text and a given taxonomy.
+	 * @param query The query string the user used for ask a question.
+	 * @param snapshot The abstract of a hit the system gave back
+	 * @return
+	 */
+	public int getTaxoScore(String query, String snapshot){
+   
+		lemma_ExtendedAssocWords=(HashMap<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();
+	   
+		query=query.toLowerCase();
+		snapshot=snapshot.toLowerCase();
+		String[] queryWords = sm.getTokenizer().tokenize(query);
+		String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
+		
+		List<String> queryList = Arrays.asList(queryWords);
+		List<String> snapshotList = Arrays.asList(snapshotWords);
+		
+		List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
+		commonBetweenQuerySnapshot.retainAll(snapshotList);//Still could be duplicated words (even more if I would retain all the opposite ways)
+	
+		int score = 0;
+		List<String> accumCommonParams = new ArrayList<String>(); 
+		for(String qWord: commonBetweenQuerySnapshot){
+			if (!lemma_ExtendedAssocWords.containsKey(qWord))
+				continue;
+			List<List<String>> foundParams = new ArrayList<List<String>>(); 
+			foundParams=lemma_ExtendedAssocWords.get(qWord);
+		
+			for(List<String> paramsForGivenMeaning: foundParams){
+				paramsForGivenMeaning.retainAll(queryList);
+				paramsForGivenMeaning.retainAll(snapshotList);
+				int size = paramsForGivenMeaning.size();
+				
+				if (size>0 && !accumCommonParams.containsAll(paramsForGivenMeaning)){
+					score+=size;
+					accumCommonParams.addAll(paramsForGivenMeaning);
+				}
+			}
+		}	
+		return score;
+	}
+	
+	/**
+	 * It loads a serialized taxonomy in .dat format and serializes it into a much more readable XML format. 
+	 * @param taxonomyPath
+	 * @param taxonomyXML_Path
+	 * */
+	 
+	public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo){
+		XStream xStream = new XStream();
+		FileHandler fileHandler = new FileHandler();
+		try {
+			fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
+		} catch (Exception e) {
+				e.printStackTrace();
+				LOG.info(e.toString());
+		}
+			
+	} 
+	
+	public void xmlWork (){
+		TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
+		XStream xStream = new XStream();
+		FileHandler fileHandler = new FileHandler();
+		matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
+	}
+	/**
+	 * demonstrates the usage of the taxonomy matcher
+	 * @param args
+	 */
+	static public void main(String[] args){
+
+		TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
+
+		System.out.println("The score is: "+matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
+				"Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
+		
+		
+	}
+}
+

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import opennlp.tools.similarity.apps.BingResponse;
+import opennlp.tools.similarity.apps.BingWebQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+import opennlp.tools.similarity.apps.utils.StringCleaner;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+
+/**
+ * The results of taxonomy learning are two maps:
+ * 0) for an entity such as "tax", all lists of associated parameters obtained from the
+ *    taxonomy kernel (built manually);
+ * given 0), we obtain the derived lists of parameters as commonalities of search-result snapshots:
+ * output map 1) maps the entity to its derived list;
+ * output map 2) maps each manual list of words to its derived list of words.
+ */
+
+
public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner{
	// NOTE(review): the logger name mentions "...FromYahoo" while the class mines Bing —
	// looks like a stale copy-paste name; confirm before renaming.
	private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
	// Syntactic matcher; may remain null if loading fails in the constructor.
	ParserChunker2MatcherProcessor sm ;

	// entity lemma -> extended association chains mined from the web.
	private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
	// manual taxonomy path -> extended association chains mined from the web.
	private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
	private PorterStemmer ps;

	public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
		return assocWords_ExtendedAssocWords;
	}
	
	public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
		return lemma_ExtendedAssocWords;
	}

	public void setLemma_ExtendedAssocWords(
			Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
		this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
	}
	
	public TaxonomyExtenderViaMebMining(){
		try {	
			sm = ParserChunker2MatcherProcessor.getInstance();
		} catch (Exception e){ // now try 'local' openNLP
			// NOTE(review): failure is only reported; sm stays null and later calls
			// to sm.assessRelevance(...) will throw (caught broadly downstream).
			System.err.println("Problem loading synt matcher");
		
		}
		ps  = new PorterStemmer();
      
	}
	
	/**
	 * Extracts, from the matched parse-tree chunks, the nouns/verbs (by POS prefix
	 * NN/VB) longer than 2 chars whose Porter stem does not start with "invalid",
	 * de-duplicates them, removes the query words, and appends toAddAtEnd to each
	 * surviving chain.
	 *
	 * @param matchList          match results between pairs of snapshots
	 * @param queryWordsToRemove words from the original query to exclude
	 * @param toAddAtEnd         the manual taxonomy path appended to every chain
	 * @return de-duplicated list of extended word chains
	 */
 	private List<List<String>>
		getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, 
				List<String> toAddAtEnd){
 		List<List<String>> res = new ArrayList<List<String>>();
		for(List<ParseTreeChunk> chunks: matchList){
			List<String> wordRes = new ArrayList<String>();
			for (ParseTreeChunk ch: chunks){
				List<String> lemmas =  ch.getLemmas();
				for(int w=0; w< lemmas.size(); w++)
					// Keep non-wildcard noun/verb lemmas longer than 2 characters.
					if ( (!lemmas.get(w).equals("*")) && 
							((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("VB"))) &&
							lemmas.get(w).length()>2){
						String formedWord = lemmas.get(w);
						String stemmedFormedWord = ps.stem(formedWord);
						// "invalid"-stemmed words are rejected by the stemmer contract
						// used here — TODO confirm against PorterStemmer behavior.
						if (!stemmedFormedWord.startsWith("invalid"))
							wordRes.add(formedWord);
					}
			}
			// De-duplicate (order is not preserved), then drop the query words.
			wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
			wordRes.removeAll(queryWordsToRemove);
			if (wordRes.size()>0){	
				wordRes.addAll(toAddAtEnd);
				res.add(wordRes);
			}
		}
		res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
		return res;
	}
	
	/**
	 * Builds the extended taxonomy: reads the manual chains from the .ari file,
	 * runs a web search for every (path, entity, domain) combination, mines the
	 * common words from the search results, and serializes the resulting maps to
	 * "<fileName with .ari -> Taxo.dat>".
	 *
	 * @param fileName path of the source .ari Prolog file
	 * @param domain   domain keyword appended to every search query (e.g. "tax")
	 * @param lang     search language code (e.g. "en")
	 */
	public void extendTaxonomy(String fileName, String domain, String lang){
		AriAdapter ad = new AriAdapter();
	      ad.getChainsFromARIfile(fileName);
	      List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
	      try {
	      for(String entity: entries ){ //.
	    	  List<List<String>> paths = ad.lemma_AssocWords.get(entity);
	    	  for(List<String> taxoPath: paths){
	    		  String query = taxoPath.toString()+ " " + entity + " "+ domain; // todo: query forming function here 
	    		  query = query.replace('[', ' ').replace(']',' ').replace(',', ' ').replace('_', ' ');
	    		  List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
	    		  List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
	    		  toRemoveFromExtension.add(entity); toRemoveFromExtension.add(domain);
	    		  List<List<String>> resList =	getCommonWordsFromList_List_ParseTreeChunk(matchList, toRemoveFromExtension, taxoPath);
	    		  assocWords_ExtendedAssocWords.put(taxoPath, resList);
	    		  // NOTE(review): resList is the same object stored in the map above,
	    		  // so adding taxoPath here also mutates that stored value — confirm
	    		  // this aliasing is intended.
	    		  resList.add(taxoPath);
	    		  lemma_ExtendedAssocWords.put(entity, resList);
	    	  }
	      }
	      } catch (Exception e){
	    	 System.err.println("Problem taxonomy matching");
	      }
	     
	      TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, assocWords_ExtendedAssocWords);
	      ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
	}
	
	/**
	 * Runs a web search for the query and returns the parse-tree-chunk match
	 * results of every pair of distinct hits (title + abstract, cleaned).
	 * Errors are reported to stderr and yield the (possibly partial) result.
	 *
	 * @param query      search query
	 * @param domain     search domain restriction ("" for none)
	 * @param lang       search language code
	 * @param numbOfHits maximum number of hits requested
	 * @return accumulated match results for all hit pairs
	 */
	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
		List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
		try {
			List<String> resultList = search(query,domain,lang,numbOfHits);
			
			BingResponse resp = populateBingHit(resultList.get(0));
			//printSearchResult(resultList.get(0));
			// Compare every unordered pair of hits (i, j) with j > i.
			for(int i=0; i<resp.getHits().size(); i++){
				{
					for( int j=i+1; j<resp.getHits().size(); j++){
						HitBase h1 = resp.getHits().get(i);
						HitBase h2 = resp.getHits().get(j);
						String snapshot1 = StringCleaner.processSnapshotForMatching(h1.getTitle()+ " . "+h1.getAbstractText());
						String snapshot2 = StringCleaner.processSnapshotForMatching(h2.getTitle()+ " . "+h2.getAbstractText());
						SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, snapshot2);
						List<List<ParseTreeChunk>> matchResult  = matchRes.getMatchResult();
						genResult.addAll(matchResult);						
					}
				}
			}
			
		} catch (Exception e) {
			System.err.print("Problem extracting taxonomy node");
		}
		
		return genResult;
	} 

	/**
	 * Demonstrates taxonomy extension on the IRS sample data.
	 */
	public static void main(String[] args){
			TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
			self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", "en");
		
	}

}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
/**
 * Stores the learned taxonomy on the file system via Java object serialization.
 * @author Boris
 */
public class TaxonomySerializer implements Serializable {

	private static final long serialVersionUID = 7431412616514648388L;
	// entity lemma -> extended association word chains.
	private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
	// manual word chain -> extended association word chains.
	private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();

	/**
	 * @param lemma_ExtendedAssocWords      entity lemma -> extended chains
	 * @param assocWords_ExtendedAssocWords manual chain -> extended chains
	 */
	public TaxonomySerializer(
			Map<String, List<List<String>>> lemma_ExtendedAssocWords,
			Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords) {

		this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
		this.assocWords_ExtendedAssocWords = assocWords_ExtendedAssocWords;
	}

	/** No-arg constructor required for deserialization frameworks. */
	public TaxonomySerializer() {
	}

	public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
		return assocWords_ExtendedAssocWords;
	}

	public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
		return lemma_ExtendedAssocWords;
	}

	public void setLemma_ExtendedAssocWords(
			Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
		this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
	}

	public void setAssocWords_ExtendedAssocWords(
			Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords) {
		this.assocWords_ExtendedAssocWords = assocWords_ExtendedAssocWords;
	}

	/**
	 * Serializes this taxonomy to the given file. I/O errors are printed and
	 * swallowed (legacy behavior preserved).
	 *
	 * @param filename destination file path
	 */
	public void writeTaxonomy(String filename) {
		ObjectOutputStream out = null;
		try {
			out = new ObjectOutputStream(new FileOutputStream(filename));
			out.writeObject(this);
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			// Fix: the stream leaked when writeObject threw; always close it
			// (closing the ObjectOutputStream also closes the FileOutputStream).
			if (out != null) {
				try {
					out.close();
				} catch (IOException ignored) {
					// best-effort close
				}
			}
		}
	}

	/**
	 * Reads a taxonomy previously written by {@link #writeTaxonomy(String)}.
	 *
	 * @param filename source file path
	 * @return the deserialized taxonomy, or null if reading failed
	 */
	public static TaxonomySerializer readTaxonomy(String filename) {
		TaxonomySerializer data = null;
		ObjectInputStream in = null;
		try {
			in = new ObjectInputStream(new FileInputStream(filename));
			data = (TaxonomySerializer) in.readObject();
		} catch (IOException ex) {
			ex.printStackTrace();
		} catch (ClassNotFoundException ex) {
			ex.printStackTrace();
		} finally {
			// Fix: the stream leaked when readObject threw; always close it.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// best-effort close
				}
			}
		}
		return data;
	}
}

Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt Wed Mar 28 22:16:43 2012
@@ -0,0 +1 @@

[... 3 lines stripped ...]
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.log4j.Logger;
+
+
+/**
+ * This class is responsible for saving data to files as well as reading it back.
+ * It is capable of handling both text and binary files.
+ */
+public class FileHandler {
+	
+	private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.utils.FileHandler");
+       
+	
+	public  void writeToTextFile(String data,String filepath,boolean append) throws IOException {
+		try{
+			BufferedWriter out = new BufferedWriter(new FileWriter(filepath, append));
+			out.write(data + "\n");
+            out.close();
+            } catch (IOException e) {
+                	LOG.error(e);
+                	e.printStackTrace();
+            }
+	}
+	/**
+	 * Writes data from an arrayList<String> to a text-file where each line of the text represented by an element in the list.
+	 * @param list
+	 * @param filePath
+	 * @param append
+	 * @throws Exception
+	 */
+	public  void writeToTextFile(ArrayList<String> list, String filePath, boolean append)	throws Exception {
+		FileWriter outFile = null;
+		Iterator<String> it = list.iterator();
+		if (!append) {
+			outFile = new FileWriter(filePath);
+			PrintWriter out = new PrintWriter(outFile);
+			while (it.hasNext()) {
+				out.println((String) it.next());
+			}
+			outFile.close();
+		} else {
+			int tmp = 0;
+			while (it.hasNext()) {
+				if (tmp == 0) {
+					appendtofile("\n" + (String) it.next(), filePath);
+				} else {
+					appendtofile((String) it.next(), filePath);
+				}
+				tmp++;
+			}
+		}
+	}
+
+     public  void writeObjectToFile(Object obj, String filepath, boolean append) {
+    	 	if(!isFileOrDirectoryExists(getDirPathfromFullPath(filepath))){
+    	 		createFolder(getDirPathfromFullPath(filepath));
+    	 	}
+    	 	ObjectOutputStream outputStream = null;
+         try {
+        	 outputStream = new ObjectOutputStream(new FileOutputStream(filepath));
+             outputStream.writeObject(obj);
+             } catch (IOException e) {
+            	 LOG.error(e);
+             }
+    }
    /**
     * Deserializes and returns the first object stored in the given file.
     *
     * @param filePath path of the file to read
     * @return the first deserialized object, or null if the file could not be
     *         read, contained no object, or an error occurred
     */
    public  Object readObjectfromFile(String filePath){
    	ObjectInputStream inputStream = null;
    	try {
    		//Construct the ObjectInputStream object
            inputStream = new ObjectInputStream(new FileInputStream(filePath));
            Object obj = null;
            while ((obj = inputStream.readObject()) != null) {
            	// Return the first non-null object read from the stream;
            	// any further objects in the file are ignored.
            	return  obj;
            }
        } catch (EOFException ex) { //This exception will be caught when EOF is reached
        	LOG.error("End of file reached.",ex);
        } catch (ClassNotFoundException ex) {
        	LOG.error(ex);
        } catch (FileNotFoundException ex) {
        	LOG.error(ex);
        } catch (IOException ex) {
        	LOG.error(ex);
        } finally {
            //Close the ObjectInputStream
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException ex) {
            	LOG.error(ex);
            }
        }
             return null;
         }
+    /**
+     * Creates a byte array from any object.
+     * 
+     * I wanted to use it when I write out object to files! (This is not in use right now, I may move it into other class)
+     * 
+     * @param obj
+     * @return
+     * @throws java.io.IOException
+     */
+    public  byte[] getBytes(Object obj) throws java.io.IOException{
+    	ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        ObjectOutputStream oos = new ObjectOutputStream(bos);
+        oos.writeObject(obj);
+        oos.flush();
+        oos.close();
+        bos.close();
+        byte [] data = bos.toByteArray();
+        return data;
+    }
+
+	/**
+	 * Fetches all content from a text file and returns it as a single String.
+	 * Each line is terminated with the platform line separator.
+	 * @param filePath path of the text file to read
+	 * @return the file content, or an empty String if reading fails
+	 */
+	 public String readFromTextFile(String filePath) {
+		StringBuilder result = new StringBuilder();
+		String newline = System.getProperty("line.separator");
+		try {
+			// buffered, line-by-line reading
+			// NOTE: FileReader always uses the platform default encoding --
+			// TODO confirm the default encoding is acceptable here
+			BufferedReader reader = new BufferedReader(new FileReader(new File(filePath)));
+			try {
+				// readLine() strips the newline and returns null only at EOF
+				for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+					result.append(line).append(newline);
+				}
+			} finally {
+				reader.close();
+			}
+		} catch (IOException ex) {
+			LOG.error("fileName: "+filePath,ex);
+		}
+		return result.toString();
+	}
+	/**
+	 * Reads a text file line-wise; each line becomes one element of the
+	 * resulting list (line terminators are not included).
+	 * @param filePath path of the text file to read
+	 * @return the lines of the file, or an empty list if reading fails
+	 */
+	public  List<String> readLinesFromTextFile(String filePath){
+		List<String> result = new ArrayList<String>();
+		try {
+			// buffered, line-by-line reading
+			// NOTE: FileReader always uses the platform default encoding --
+			// TODO confirm the default encoding is acceptable here
+			BufferedReader reader = new BufferedReader(new FileReader(new File(filePath)));
+			try {
+				// readLine() strips the newline and returns null only at EOF
+				for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+					result.add(line);
+				}
+			} finally {
+				reader.close();
+			}
+		} catch (IOException ex) {
+			LOG.error(ex);
+		}
+		return result;
+	}
+
+	
+
+	/**
+	 * Appends a line of text to the given file (the file is created if it
+	 * does not exist). A trailing newline is added after the data.
+	 * @param data     the text to append
+	 * @param filePath path of the file to append to
+	 */
+	private  void appendtofile(String data, String filePath) {
+		BufferedWriter out = null;
+		try {
+			out = new BufferedWriter(new FileWriter(filePath,true));
+			out.write(data + "\n");
+		} catch (IOException e) {
+			// previously swallowed silently; log so write failures are visible
+			LOG.error("Could not append to file: "+filePath,e);
+		} finally {
+			// close in finally so the writer is not leaked on exception
+			if (out != null) {
+				try {
+					out.close();
+				} catch (IOException e) {
+					LOG.error(e);
+				}
+			}
+		}
+	}
+	public  void  createFolder(String path){
+		if(!isFileOrDirectoryExists(path)){
+			File file = new File(path);
+	    	 try{
+	    	 file.mkdirs();
+	    	 }catch (Exception e) {
+				LOG.error("Directory already exists or the file-system is read only",e);
+			}	
+		} 
+	}
+	/**
+	 * Tells whether a file or directory exists at the given path.
+	 * @param path the path to check
+	 * @return true if a file or directory exists at the path
+	 */
+	public  boolean isFileOrDirectoryExists(String path){
+		return new File(path).exists();
+	}
+	/**
+	 * Separates the directory-path from a full file-path.
+	 * Only Windows-style separators ("\\") are recognized; for any other
+	 * path an empty String is returned.
+	 * @param filePath the full path of a file
+	 * @return the directory part of the path, or "" if it cannot be derived
+	 */
+	private  String getDirPathfromFullPath(String filePath){
+		String dirPath="";
+		// BUG FIX: the old check compared with reference equality (filePath != "");
+		// contains("\\") on a non-null string already implies it is non-empty
+		if(filePath!=null && filePath.contains("\\")){
+			dirPath = filePath.substring(0,filePath.lastIndexOf("\\"));
+		}
+		return dirPath;
+	}
+	/**
+	 * Returns the file-names of the files in a folder (names only, not
+	 * paths; not recursive).
+	 * @param dirPath the directory to list
+	 * @return the names of the plain files in the directory; empty if the
+	 *         directory does not exist or cannot be listed
+	 */
+	public  ArrayList<String> getFileNamesInFolder(String dirPath){
+		ArrayList<String> fileNames= new ArrayList<String>();
+		File folder = new File(dirPath);
+		File[] listOfFiles = folder.listFiles();
+		// listFiles() returns null when the path is not an existing, readable
+		// directory; previously this caused a NullPointerException
+		if (listOfFiles == null)
+			return fileNames;
+		for (int i = 0; i < listOfFiles.length; i++) {
+			if (listOfFiles[i].isFile()) {
+				fileNames.add(listOfFiles[i].getName());
+			} else if (listOfFiles[i].isDirectory()) {
+				//TODO if I want to use it recursive I should handle this case
+			}
+		}
+		return fileNames;
+	}
+	
+	public void deleteAllfilesinDir(String dirName){
+		ArrayList<String> fileNameList=getFileNamesInFolder(dirName);
+		if(fileNameList!=null){
+		for(int i=0; i<fileNameList.size();i++){
+		try{
+			deleteFile(dirName+fileNameList.get(i));
+			}catch(IllegalArgumentException e){
+				LOG.error("No way to delete file: "+dirName+fileNameList.get(i),e);
+			}
+		}
+		}
+	}
+	/**
+	 * Deletes a single file or an empty directory.
+	 * @param filePath the path to delete
+	 * @throws IllegalArgumentException if the path does not exist, is write
+	 *         protected, is a non-empty directory, or the deletion fails
+	 */
+	public  void deleteFile(String filePath) throws IllegalArgumentException{
+		File target = new File(filePath);
+		// the target must exist and must not be write protected
+		if (!target.exists()) {
+			throw new IllegalArgumentException(
+					"Delete: no such file or directory: " + filePath);
+		}
+		if (!target.canWrite()) {
+			throw new IllegalArgumentException("Delete: write protected: "
+					+ filePath);
+		}
+		// a directory may only be removed when it is empty
+		if (target.isDirectory() && target.list().length > 0) {
+			throw new IllegalArgumentException(
+					"Delete: directory not empty: " + filePath);
+		}
+		// attempt the actual deletion
+		if (!target.delete()) {
+			throw new IllegalArgumentException("Delete: deletion failed");
+		}
+	}
+	
+	public boolean deleteDirectory(File path) {
+	    if( path.exists() ) {
+	      File[] files = path.listFiles();
+	      for(int i=0; i<files.length; i++) {
+	         if(files[i].isDirectory()) {
+	           deleteDirectory(files[i]);
+	         }
+	         else {
+	           files[i].delete();
+	         }
+	      }
+	    }
+	    return( path.delete() );
+	  }
+	
+	/**
+	 * Returns the absolute file-paths of the plain files in a directory
+	 * (not recursive).
+	 * @param dirPath the directory to list
+	 * @return the absolute paths of the files, or null if the directory
+	 *         cannot be listed
+	 */
+	public  ArrayList<String> getFilePathsInFolder(String dirPath){
+		File[] entries = new File(dirPath).listFiles();
+		if (entries == null) {
+			return null;
+		}
+		ArrayList<String> filePaths = new ArrayList<String>();
+		for (File entry : entries) {
+			if (entry.isFile()) {
+				filePaths.add(entry.getAbsolutePath());
+			} else if (entry.isDirectory()) {
+				//TODO if I want to use it recursive I should handle this case
+			}
+		}
+		return filePaths;
+	}
+	/**
+	 * Returns the number of individual files in a directory (not recursive).
+	 * @param dirPath the directory to inspect
+	 * @return the number of files, or 0 if the directory cannot be read
+	 */
+	public  int getFileNumInFolder(String dirPath){
+		try{
+			return getFileNamesInFolder(dirPath).size();
+		}catch (Exception e) {
+			// best effort: treat any failure as an empty directory
+			return 0;
+		}
+	}
+
+}

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1306623&r1=1306622&r2=1306623&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Mar 28 22:16:43 2012
@@ -74,12 +74,36 @@ public class ParserChunker2MatcherProces
 	private static final String MODEL_DIR_KEY = "nlp.models.dir";
 	// TODO config
 	// this is where resources should live
-	private static String MODEL_DIR, MODEL_DIR_REL = "resources/models111";
+	private static String MODEL_DIR, MODEL_DIR_REL = "resources/models";
 	protected static ParserChunker2MatcherProcessor instance;
 
 	private SentenceDetector sentenceDetector;
 	private Tokenizer tokenizer;
 	private POSTagger posTagger;
+	public SentenceDetector getSentenceDetector() {
+		return sentenceDetector;
+	}
+
+	public void setSentenceDetector(SentenceDetector sentenceDetector) {
+		this.sentenceDetector = sentenceDetector;
+	}
+
+	public Tokenizer getTokenizer() {
+		return tokenizer;
+	}
+
+	public void setTokenizer(Tokenizer tokenizer) {
+		this.tokenizer = tokenizer;
+	}
+
+	public ChunkerME getChunker() {
+		return chunker;
+	}
+
+	public void setChunker(ChunkerME chunker) {
+		this.chunker = chunker;
+	}
+
 	private Parser parser;
 	private ChunkerME chunker;
 	private final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
@@ -261,17 +285,17 @@ public class ParserChunker2MatcherProces
 		
 		tags = POSlist.toArray(new String[0]);
 		if (toks.length != tags.length){
-			LOG.info("disagreement between toks and tags; sent =  '"+sentence + "'\n tags = "+tags + 
+			LOG.finest("disagreement between toks and tags; sent =  '"+sentence + "'\n tags = "+tags + 
 					"\n will now try this sentence in lower case" );
 			node  = parseSentenceNode(sentence.toLowerCase());
 			if (node==null){
-				LOG.info("Problem parsing sentence '"+sentence);
+				LOG.finest("Problem parsing sentence '"+sentence);
 				return null;
 			}
 			POSlist = node.getOrderedPOSList();
 			tags = POSlist.toArray(new String[0]);
 			if (toks.length != tags.length){
-				LOG.info("AGAIN: disagreement between toks and tags for lower case! ");
+				LOG.finest("AGAIN: disagreement between toks and tags for lower case! ");
 				if (toks.length>tags.length){
 					String[] newToks = new String[tags.length];
 					for(int i = 0; i<tags.length; i++ ){