You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/03/29 02:29:12 UTC

svn commit: r1306658 - in /opennlp/sandbox/opennlp-similarity/src: main/java/opennlp/tools/similarity/apps/taxo_builder/ test/java/opennlp/tools/similarity/apps/ test/java/opennlp/tools/similarity/apps/taxo_builder/

Author: bgalitsky
Date: Thu Mar 29 00:29:11 2012
New Revision: 1306658

URL: http://svn.apache.org/viewvc?rev=1306658&view=rev
Log:
test for OPENNLP-436
Auto Taxonomy Learner for Search Relevance Improvement based on Similarity

Added:
    opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/
    opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java
Modified:
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
    opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
    opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java Thu Mar 29 00:29:11 2012
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.taxo_builder;
 
 public enum Languages {

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java Thu Mar 29 00:29:11 2012
@@ -30,103 +30,123 @@ import opennlp.tools.textsimilarity.chun
 
 import com.thoughtworks.xstream.XStream;
 
-
 /**
- * This class can be used to generate scores based on the overlapping between a text and a given taxonomy.
- *
+ * This class can be used to generate scores based on the overlap between a
+ * text and a given taxonomy.
+ * 
  */
 public class TaxoQuerySnapshotMatcher {
-	
-	ParserChunker2MatcherProcessor sm ;
-    //XStream xStream= new XStream();
-    Map<String, List<List<String>>> lemma_ExtendedAssocWords;
-    TaxonomySerializer taxo;
-    private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
-    
-    
-    public TaxoQuerySnapshotMatcher() {
-    	sm = ParserChunker2MatcherProcessor.getInstance();
-        taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");    	
-	}
-	/**
-	 * Can be used to generate scores based on the overlapping between a text and a given taxonomy.
-	 * @param query The query string the user used for ask a question.
-	 * @param snapshot The abstract of a hit the system gave back
-	 * @return
-	 */
-	public int getTaxoScore(String query, String snapshot){
-   
-		lemma_ExtendedAssocWords=(HashMap<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();
-	   
-		query=query.toLowerCase();
-		snapshot=snapshot.toLowerCase();
-		String[] queryWords = sm.getTokenizer().tokenize(query);
-		String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
-		
-		List<String> queryList = Arrays.asList(queryWords);
-		List<String> snapshotList = Arrays.asList(snapshotWords);
-		
-		List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
-		commonBetweenQuerySnapshot.retainAll(snapshotList);//Still could be duplicated words (even more if I would retain all the opposite ways)
-	
-		int score = 0;
-		List<String> accumCommonParams = new ArrayList<String>(); 
-		for(String qWord: commonBetweenQuerySnapshot){
-			if (!lemma_ExtendedAssocWords.containsKey(qWord))
-				continue;
-			List<List<String>> foundParams = new ArrayList<List<String>>(); 
-			foundParams=lemma_ExtendedAssocWords.get(qWord);
-		
-			for(List<String> paramsForGivenMeaning: foundParams){
-				paramsForGivenMeaning.retainAll(queryList);
-				paramsForGivenMeaning.retainAll(snapshotList);
-				int size = paramsForGivenMeaning.size();
-				
-				if (size>0 && !accumCommonParams.containsAll(paramsForGivenMeaning)){
-					score+=size;
-					accumCommonParams.addAll(paramsForGivenMeaning);
-				}
-			}
-		}	
-		return score;
-	}
-	
-	/**
-	 * It loads a serialized taxonomy in .dat format and serializes it into a much more readable XML format. 
-	 * @param taxonomyPath
-	 * @param taxonomyXML_Path
-	 * */
-	 
-	public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo){
-		XStream xStream = new XStream();
-		FileHandler fileHandler = new FileHandler();
-		try {
-			fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
-		} catch (Exception e) {
-				e.printStackTrace();
-				LOG.info(e.toString());
-		}
-			
-	} 
-	
-	public void xmlWork (){
-		TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
-		XStream xStream = new XStream();
-		FileHandler fileHandler = new FileHandler();
-		matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
-	}
-	/**
-	 * demonstrates the usage of the taxonomy matcher
-	 * @param args
-	 */
-	static public void main(String[] args){
-
-		TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
-
-		System.out.println("The score is: "+matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
-				"Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
-		
-		
-	}
-}
 
+  ParserChunker2MatcherProcessor sm;
+  // XStream xStream= new XStream();
+  Map<String, List<List<String>>> lemma_ExtendedAssocWords;
+  TaxonomySerializer taxo;
+  private static Logger LOG = Logger
+      .getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
+
+  public TaxoQuerySnapshotMatcher(String taxoFileName) {
+    sm = ParserChunker2MatcherProcessor.getInstance();
+    taxo = TaxonomySerializer.readTaxonomy(taxoFileName); // "src/test/resources/taxonomies/irs_domTaxo.dat");
+  }
+
+  /**
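+   * Generates a score based on the overlap between a query, a search result
+   * snapshot and the loaded taxonomy.
+   * 
+   * @param query
+   *          The query string the user used to ask a question.
+   * @param snapshot
+   *          The abstract of a hit the system gave back.
+   * @return the score, accumulated from the sizes of the taxonomy word lists restricted to words present in both query and snapshot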
+   */
+  public int getTaxoScore(String query, String snapshot) {
+
+    lemma_ExtendedAssocWords = (HashMap<String, List<List<String>>>) taxo
+        .getLemma_ExtendedAssocWords();
+
+    query = query.toLowerCase();
+    snapshot = snapshot.toLowerCase();
+    String[] queryWords = sm.getTokenizer().tokenize(query);
+    String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
+
+    List<String> queryList = Arrays.asList(queryWords);
+    List<String> snapshotList = Arrays.asList(snapshotWords);
+
+    List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
+    commonBetweenQuerySnapshot.retainAll(snapshotList);// Still could be
+                                                       // duplicated words (even
+                                                       // more if I would retain
+                                                       // all the opposite ways)
+
+    int score = 0;
+    List<String> accumCommonParams = new ArrayList<String>();
+    for (String qWord : commonBetweenQuerySnapshot) {
+      if (!lemma_ExtendedAssocWords.containsKey(qWord))
+        continue;
+      List<List<String>> foundParams = new ArrayList<List<String>>();
+      foundParams = lemma_ExtendedAssocWords.get(qWord);
+
+      for (List<String> paramsForGivenMeaning : foundParams) {
+        paramsForGivenMeaning.retainAll(queryList);
+        paramsForGivenMeaning.retainAll(snapshotList);
+        int size = paramsForGivenMeaning.size();
+
+        if (size > 0 && !accumCommonParams.containsAll(paramsForGivenMeaning)) {
+          score += size;
+          accumCommonParams.addAll(paramsForGivenMeaning);
+        }
+      }
+    }
+    return score;
+  }
+
+  /**
+   * Serializes the given taxonomy (e.g. one read from a .dat file) into a much
+   * more readable XML format.
+   * 
+   * @param taxonomyXML_Path path of the XML file to write
+   * @param taxo the taxonomy to serialize
+   * */
+
+  public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) {
+    XStream xStream = new XStream();
+    FileHandler fileHandler = new FileHandler();
+    try {
+      fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
+    } catch (Exception e) {
+      e.printStackTrace();
+      LOG.info(e.toString());
+    }
+
+  }
+
+  public void xmlWork (){
+    TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher("src/test/resources/taxonomies/irs_domTaxo.dat");
+    XStream xStream = new XStream();
+    FileHandler fileHandler = new FileHandler();
+    matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
+  }
+
+  public void close() {
+    sm.close();
+  }
+
+  /**
+   * demonstrates the usage of the taxonomy matcher
+   * 
+   * @param args
+   */
+  static public void main(String[] args) {
+
+    TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher(
+        "src/test/resources/taxonomies/irs_domTaxo.dat");
+
+    System.out
+        .println("The score is: "
+            + matcher
+                .getTaxoScore(
+                    "Can Form 1040 EZ be used to claim the earned income credit.",
+                    "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
+  }
+
+}

Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java Thu Mar 29 00:29:11 2012
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 package opennlp.tools.similarity.apps.taxo_builder;
+
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -32,140 +33,158 @@ import opennlp.tools.textsimilarity.Pars
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
 
-
 /**
- * Results of taxonomy learning are two maps
- * 0) For an entity like tax it gives all lists of associated parameters obtained from the 
- * taxonomy kernel (done manually) 
- * Now, given 0, we obtain the derived list of parameters as commonalities of search results snapshots
- * output map 1) for the entity, derived list
- * output map 2) for such manual list of words -> derived list of words 
- *
- *
+ * Results of taxonomy learning are two maps. 0) For an entity like "tax", the
+ * taxonomy kernel (done manually) gives all lists of associated parameters.
+ * Now, given 0), we obtain the derived list of parameters as commonalities of
+ * search results snapshots. Output map 1): for the entity, the derived list.
+ * Output map 2): for each such manual list of words, the derived list of
+ * words.
+ * 
+ * 
  */
 
+public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner {
+  private static Logger LOG = Logger
+      .getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+  ParserChunker2MatcherProcessor sm;
+
+  private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
+  private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
+  private PorterStemmer ps;
+
+  public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
+    return assocWords_ExtendedAssocWords;
+  }
+
+  public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
+    return lemma_ExtendedAssocWords;
+  }
+
+  public void setLemma_ExtendedAssocWords(
+      Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
+    this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
+  }
+
+  public TaxonomyExtenderViaMebMining() {
+    try {
+      sm = ParserChunker2MatcherProcessor.getInstance();
+    } catch (Exception e) { // now try 'local' openNLP
+      System.err.println("Problem loading synt matcher");
+
+    }
+    ps = new PorterStemmer();
+
+  }
+
+  private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
+      List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
+      List<String> toAddAtEnd) {
+    List<List<String>> res = new ArrayList<List<String>>();
+    for (List<ParseTreeChunk> chunks : matchList) {
+      List<String> wordRes = new ArrayList<String>();
+      for (ParseTreeChunk ch : chunks) {
+        List<String> lemmas = ch.getLemmas();
+        for (int w = 0; w < lemmas.size(); w++)
+          if ((!lemmas.get(w).equals("*"))
+              && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
+                  .startsWith("VB"))) && lemmas.get(w).length() > 2) {
+            String formedWord = lemmas.get(w);
+            String stemmedFormedWord = ps.stem(formedWord);
+            if (!stemmedFormedWord.startsWith("invalid"))
+              wordRes.add(formedWord);
+          }
+      }
+      wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
+      wordRes.removeAll(queryWordsToRemove);
+      if (wordRes.size() > 0) {
+        wordRes.addAll(toAddAtEnd);
+        res.add(wordRes);
+      }
+    }
+    res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
+    return res;
+  }
+
+  public void extendTaxonomy(String fileName, String domain, String lang) {
+    AriAdapter ad = new AriAdapter();
+    ad.getChainsFromARIfile(fileName);
+    List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
+    try {
+      for (String entity : entries) { // .
+        List<List<String>> paths = ad.lemma_AssocWords.get(entity);
+        for (List<String> taxoPath : paths) {
+          String query = taxoPath.toString() + " " + entity + " " + domain; // todo:
+                                                                            // query
+                                                                            // forming
+                                                                            // function
+                                                                            // here
+          query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
+              .replace('_', ' ');
+          List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
+              query, "", lang, 30);
+          List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
+          toRemoveFromExtension.add(entity);
+          toRemoveFromExtension.add(domain);
+          List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(
+              matchList, toRemoveFromExtension, taxoPath);
+          assocWords_ExtendedAssocWords.put(taxoPath, resList);
+          resList.add(taxoPath);
+          lemma_ExtendedAssocWords.put(entity, resList);
+        }
+      }
+    } catch (Exception e) {
+      System.err.println("Problem taxonomy matching");
+    }
+
+    TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords,
+        assocWords_ExtendedAssocWords);
+    ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
+  }
+
+  public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
+      String domain, String lang, int numbOfHits) {
+    List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
+    try {
+      List<String> resultList = search(query, domain, lang, numbOfHits);
+
+      BingResponse resp = populateBingHit(resultList.get(0));
+      // printSearchResult(resultList.get(0));
+      for (int i = 0; i < resp.getHits().size(); i++) {
+        {
+          for (int j = i + 1; j < resp.getHits().size(); j++) {
+            HitBase h1 = resp.getHits().get(i);
+            HitBase h2 = resp.getHits().get(j);
+            String snapshot1 = StringCleaner.processSnapshotForMatching(h1
+                .getTitle() + " . " + h1.getAbstractText());
+            String snapshot2 = StringCleaner.processSnapshotForMatching(h2
+                .getTitle() + " . " + h2.getAbstractText());
+            SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
+                snapshot2);
+            List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
+            genResult.addAll(matchResult);
+          }
+        }
+      }
+
+    } catch (Exception e) {
+      System.err.print("Problem extracting taxonomy node");
+    }
+
+    return genResult;
+  }
+
+  public void close() {
+    sm.close();
+
+  }
+
+  public static void main(String[] args) {
+    TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
+    self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
+        "en");
 
-public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner{
-	private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
-	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
-	ParserChunker2MatcherProcessor sm ;
-
-	private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
-	private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
-	private PorterStemmer ps;
-
-	public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
-		return assocWords_ExtendedAssocWords;
-	}
-	
-	public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
-		return lemma_ExtendedAssocWords;
-	}
-
-	public void setLemma_ExtendedAssocWords(
-			Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
-		this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
-	}
-	
-	public TaxonomyExtenderViaMebMining(){
-		try {	
-			sm = ParserChunker2MatcherProcessor.getInstance();
-		} catch (Exception e){ // now try 'local' openNLP
-			System.err.println("Problem loading synt matcher");
-		
-		}
-		ps  = new PorterStemmer();
-      
-	}
-	
- 	private List<List<String>>
-		getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, 
-				List<String> toAddAtEnd){
- 		List<List<String>> res = new ArrayList<List<String>>();
-		for(List<ParseTreeChunk> chunks: matchList){
-			List<String> wordRes = new ArrayList<String>();
-			for (ParseTreeChunk ch: chunks){
-				List<String> lemmas =  ch.getLemmas();
-				for(int w=0; w< lemmas.size(); w++)
-					if ( (!lemmas.get(w).equals("*")) && 
-							((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("VB"))) &&
-							lemmas.get(w).length()>2){
-						String formedWord = lemmas.get(w);
-						String stemmedFormedWord = ps.stem(formedWord);
-						if (!stemmedFormedWord.startsWith("invalid"))
-							wordRes.add(formedWord);
-					}
-			}
-			wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
-			wordRes.removeAll(queryWordsToRemove);
-			if (wordRes.size()>0){	
-				wordRes.addAll(toAddAtEnd);
-				res.add(wordRes);
-			}
-		}
-		res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
-		return res;
-	}
-	
-	public void extendTaxonomy(String fileName, String domain, String lang){
-		AriAdapter ad = new AriAdapter();
-	      ad.getChainsFromARIfile(fileName);
-	      List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
-	      try {
-	      for(String entity: entries ){ //.
-	    	  List<List<String>> paths = ad.lemma_AssocWords.get(entity);
-	    	  for(List<String> taxoPath: paths){
-	    		  String query = taxoPath.toString()+ " " + entity + " "+ domain; // todo: query forming function here 
-	    		  query = query.replace('[', ' ').replace(']',' ').replace(',', ' ').replace('_', ' ');
-	    		  List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
-	    		  List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
-	    		  toRemoveFromExtension.add(entity); toRemoveFromExtension.add(domain);
-	    		  List<List<String>> resList =	getCommonWordsFromList_List_ParseTreeChunk(matchList, toRemoveFromExtension, taxoPath);
-	    		  assocWords_ExtendedAssocWords.put(taxoPath, resList);
-	    		  resList.add(taxoPath);
-	    		  lemma_ExtendedAssocWords.put(entity, resList);
-	    	  }
-	      }
-	      } catch (Exception e){
-	    	 System.err.println("Problem taxonomy matching");
-	      }
-	     
-	      TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, assocWords_ExtendedAssocWords);
-	      ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
-	}
-	
-	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
-		List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
-		try {
-			List<String> resultList = search(query,domain,lang,numbOfHits);
-			
-			BingResponse resp = populateBingHit(resultList.get(0));
-			//printSearchResult(resultList.get(0));
-			for(int i=0; i<resp.getHits().size(); i++){
-				{
-					for( int j=i+1; j<resp.getHits().size(); j++){
-						HitBase h1 = resp.getHits().get(i);
-						HitBase h2 = resp.getHits().get(j);
-						String snapshot1 = StringCleaner.processSnapshotForMatching(h1.getTitle()+ " . "+h1.getAbstractText());
-						String snapshot2 = StringCleaner.processSnapshotForMatching(h2.getTitle()+ " . "+h2.getAbstractText());
-						SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, snapshot2);
-						List<List<ParseTreeChunk>> matchResult  = matchRes.getMatchResult();
-						genResult.addAll(matchResult);						
-					}
-				}
-			}
-			
-		} catch (Exception e) {
-			System.err.print("Problem extracting taxonomy node");
-		}
-		
-		return genResult;
-	} 
-
-	public static void main(String[] args){
-			TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
-			self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", "en");
-		
-	}
+  }
 
 }

Modified: opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java Thu Mar 29 00:29:11 2012
@@ -1,39 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps;
 
 import java.util.List;
 
 import junit.framework.TestCase;
 
-public class SearchResultsProcessorTest extends TestCase{
-	SearchResultsProcessor proc = new SearchResultsProcessor();
-	
-	
-	public void testSearchOrder(){
-		List<HitBase> res = proc.runSearch("How can I pay tax on my income abroad"); 
-		
-		// we verify that top answers have high similarity score
-		System.out.println(res);
-		HitBase first = res.get(0);
-		assertTrue( first.getGenerWithQueryScore()>3.0);
-		//assertTrue(first.getTitle().indexOf("Foreign")>-1 && first.getTitle().indexOf("earned")>-1);
-		
-		HitBase second = res.get(1);
-		assertTrue( second.getGenerWithQueryScore()>1.9);
-		//assertTrue(second.getTitle().indexOf("living abroad")>-1);
-		proc.close();
-				
-	}
-	
-	public void testSearchOrder2(){
-		List<HitBase> res = proc.runSearch(
-	   "Can I estimate what my income tax would be by using my last pay"); 
-		
-		System.out.println(res);
-		HitBase first = res.get(0);
-		assertTrue( first.getGenerWithQueryScore()>1.9);
-		
-		HitBase second = res.get(1);
-		assertTrue( second.getGenerWithQueryScore()>1.9);
-		proc.close();	
-	}
+public class SearchResultsProcessorTest extends TestCase {
+  SearchResultsProcessor proc = new SearchResultsProcessor();
+
+  public void testSearchOrder() {
+    List<HitBase> res = proc.runSearch("How can I pay tax on my income abroad");
+
+    // we verify that top answers have high similarity score
+    System.out.println(res);
+    HitBase first = res.get(0);
+    assertTrue(first.getGenerWithQueryScore() > 3.0);
+    // assertTrue(first.getTitle().indexOf("Foreign")>-1 &&
+    // first.getTitle().indexOf("earned")>-1);
+
+    HitBase second = res.get(1);
+    assertTrue(second.getGenerWithQueryScore() > 1.9);
+    // assertTrue(second.getTitle().indexOf("living abroad")>-1);
+    proc.close();
+
+  }
+
+  public void testSearchOrder2() {
+    List<HitBase> res = proc
+        .runSearch("Can I estimate what my income tax would be by using my last pay");
+
+    System.out.println(res);
+    HitBase first = res.get(0);
+    assertTrue(first.getGenerWithQueryScore() > 1.9);
+
+    HitBase second = res.get(1);
+    assertTrue(second.getGenerWithQueryScore() > 1.9);
+    proc.close();
+  }
 }

Added: opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java?rev=1306658&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java Thu Mar 29 00:29:11 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class TaxonomyBuildMatchTest extends TestCase {
+
+  public void testTaxonomySeedImport() {
+    AriAdapter ad = new AriAdapter();
+    ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
+    System.out.println(ad.lemma_AssocWords);
+    assertTrue(ad.lemma_AssocWords.size() > 0);
+  }
+
+  public void testTaxonomyBuild() {
+    TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
+    self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
+        "en");
+    self.close();
+    assertTrue(self.getAssocWords_ExtendedAssocWords().size() > 0);
+  }
+
+  public void testTaxonomyMatch() {
+    TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher("src/test/resources/taxonomies/irs_domTaxo.dat");
+    int score = matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
+    "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being ");
+
+    System.out.println("The score is: "+ score);
+    assertTrue(score>3);
+    matcher.close();
+  }
+}