You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/03/29 02:29:12 UTC
svn commit: r1306658 - in /opennlp/sandbox/opennlp-similarity/src:
main/java/opennlp/tools/similarity/apps/taxo_builder/
test/java/opennlp/tools/similarity/apps/
test/java/opennlp/tools/similarity/apps/taxo_builder/
Author: bgalitsky
Date: Thu Mar 29 00:29:11 2012
New Revision: 1306658
URL: http://svn.apache.org/viewvc?rev=1306658&view=rev
Log:
test for OPENNLP-436
Auto Taxonomy Learner for Search Relevance Improvement based on Similarity
Added:
opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/
opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java
Modified:
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java Thu Mar 29 00:29:11 2012
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package opennlp.tools.similarity.apps.taxo_builder;
public enum Languages {
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java Thu Mar 29 00:29:11 2012
@@ -30,103 +30,123 @@ import opennlp.tools.textsimilarity.chun
import com.thoughtworks.xstream.XStream;
-
/**
- * This class can be used to generate scores based on the overlapping between a text and a given taxonomy.
- *
+ * This class can be used to generate scores based on the overlapping between a
+ * text and a given taxonomy.
+ *
*/
public class TaxoQuerySnapshotMatcher {
-
- ParserChunker2MatcherProcessor sm ;
- //XStream xStream= new XStream();
- Map<String, List<List<String>>> lemma_ExtendedAssocWords;
- TaxonomySerializer taxo;
- private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
-
-
- public TaxoQuerySnapshotMatcher() {
- sm = ParserChunker2MatcherProcessor.getInstance();
- taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");
- }
- /**
- * Can be used to generate scores based on the overlapping between a text and a given taxonomy.
- * @param query The query string the user used for ask a question.
- * @param snapshot The abstract of a hit the system gave back
- * @return
- */
- public int getTaxoScore(String query, String snapshot){
-
- lemma_ExtendedAssocWords=(HashMap<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();
-
- query=query.toLowerCase();
- snapshot=snapshot.toLowerCase();
- String[] queryWords = sm.getTokenizer().tokenize(query);
- String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
-
- List<String> queryList = Arrays.asList(queryWords);
- List<String> snapshotList = Arrays.asList(snapshotWords);
-
- List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
- commonBetweenQuerySnapshot.retainAll(snapshotList);//Still could be duplicated words (even more if I would retain all the opposite ways)
-
- int score = 0;
- List<String> accumCommonParams = new ArrayList<String>();
- for(String qWord: commonBetweenQuerySnapshot){
- if (!lemma_ExtendedAssocWords.containsKey(qWord))
- continue;
- List<List<String>> foundParams = new ArrayList<List<String>>();
- foundParams=lemma_ExtendedAssocWords.get(qWord);
-
- for(List<String> paramsForGivenMeaning: foundParams){
- paramsForGivenMeaning.retainAll(queryList);
- paramsForGivenMeaning.retainAll(snapshotList);
- int size = paramsForGivenMeaning.size();
-
- if (size>0 && !accumCommonParams.containsAll(paramsForGivenMeaning)){
- score+=size;
- accumCommonParams.addAll(paramsForGivenMeaning);
- }
- }
- }
- return score;
- }
-
- /**
- * It loads a serialized taxonomy in .dat format and serializes it into a much more readable XML format.
- * @param taxonomyPath
- * @param taxonomyXML_Path
- * */
-
- public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo){
- XStream xStream = new XStream();
- FileHandler fileHandler = new FileHandler();
- try {
- fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
- } catch (Exception e) {
- e.printStackTrace();
- LOG.info(e.toString());
- }
-
- }
-
- public void xmlWork (){
- TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
- XStream xStream = new XStream();
- FileHandler fileHandler = new FileHandler();
- matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
- }
- /**
- * demonstrates the usage of the taxonomy matcher
- * @param args
- */
- static public void main(String[] args){
-
- TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
-
- System.out.println("The score is: "+matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
- "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
-
-
- }
-}
+ ParserChunker2MatcherProcessor sm;
+ // XStream xStream= new XStream();
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords;
+ TaxonomySerializer taxo;
+ private static Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
+
+ public TaxoQuerySnapshotMatcher(String taxoFileName) {
+ sm = ParserChunker2MatcherProcessor.getInstance();
+ taxo = TaxonomySerializer.readTaxonomy(taxoFileName); // "src/test/resources/taxonomies/irs_domTaxo.dat");
+ }
+
+ /**
+ * Can be used to generate scores based on the overlapping between a text and
+ * a given taxonomy.
+ *
+ * @param query
+ * The query string the user used to ask a question.
+ * @param snapshot
+ * The abstract of a hit the system gave back
+ * @return
+ */
+ public int getTaxoScore(String query, String snapshot) {
+
+ lemma_ExtendedAssocWords = (HashMap<String, List<List<String>>>) taxo
+ .getLemma_ExtendedAssocWords();
+
+ query = query.toLowerCase();
+ snapshot = snapshot.toLowerCase();
+ String[] queryWords = sm.getTokenizer().tokenize(query);
+ String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
+
+ List<String> queryList = Arrays.asList(queryWords);
+ List<String> snapshotList = Arrays.asList(snapshotWords);
+
+ List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
+ commonBetweenQuerySnapshot.retainAll(snapshotList);// Still could be
+ // duplicated words (even
+ // more if I would retain
+ // all the opposite ways)
+
+ int score = 0;
+ List<String> accumCommonParams = new ArrayList<String>();
+ for (String qWord : commonBetweenQuerySnapshot) {
+ if (!lemma_ExtendedAssocWords.containsKey(qWord))
+ continue;
+ List<List<String>> foundParams = new ArrayList<List<String>>();
+ foundParams = lemma_ExtendedAssocWords.get(qWord);
+
+ for (List<String> paramsForGivenMeaning : foundParams) {
+ paramsForGivenMeaning.retainAll(queryList);
+ paramsForGivenMeaning.retainAll(snapshotList);
+ int size = paramsForGivenMeaning.size();
+
+ if (size > 0 && !accumCommonParams.containsAll(paramsForGivenMeaning)) {
+ score += size;
+ accumCommonParams.addAll(paramsForGivenMeaning);
+ }
+ }
+ }
+ return score;
+ }
+
+ /**
+ * Serializes the given taxonomy (typically loaded from a .dat file) into a
+ * much more readable XML format and writes it to the given path.
+ *
+ * @param taxonomyXML_Path path of the XML file to write
+ * @param taxo the taxonomy to serialize
+ * */
+
+ public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) {
+ XStream xStream = new XStream();
+ FileHandler fileHandler = new FileHandler();
+ try {
+ fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.info(e.toString());
+ }
+
+ }
+
+ public void xmlWork (){
+ TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher("src/test/resources/taxonomies/irs_domTaxo.dat");
+ XStream xStream = new XStream();
+ FileHandler fileHandler = new FileHandler();
+ matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
+ }
+
+ public void close() {
+ sm.close();
+ }
+
+ /**
+ * demonstrates the usage of the taxonomy matcher
+ *
+ * @param args
+ */
+ static public void main(String[] args) {
+
+ TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher(
+ "src/test/resources/taxonomies/irs_domTaxo.dat");
+
+ System.out
+ .println("The score is: "
+ + matcher
+ .getTaxoScore(
+ "Can Form 1040 EZ be used to claim the earned income credit.",
+ "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
+ }
+
+}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java Thu Mar 29 00:29:11 2012
@@ -15,6 +15,7 @@
* limitations under the License.
*/
package opennlp.tools.similarity.apps.taxo_builder;
+
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -32,140 +33,158 @@ import opennlp.tools.textsimilarity.Pars
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
-
/**
- * Results of taxonomy learning are two maps
- * 0) For an entity like tax it gives all lists of associated parameters obtained from the
- * taxonomy kernel (done manually)
- * Now, given 0, we obtain the derived list of parameters as commonalities of search results snapshots
- * output map 1) for the entity, derived list
- * output map 2) for such manual list of words -> derived list of words
- *
- *
+ * Results of taxonomy learning are two maps. 0) For an entity like tax, the
+ * taxonomy kernel (built manually) gives all lists of associated parameters.
+ * Given 0), we obtain the derived list of parameters as commonalities of
+ * search result snapshots. Output map 1): for the entity, the derived list.
+ * Output map 2): for each such manual list of words -> the derived list of
+ * words.
+ *
+ *
*/
+public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner {
+ private static Logger LOG = Logger
+ .getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
+ private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+ ParserChunker2MatcherProcessor sm;
+
+ private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
+ private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
+ private PorterStemmer ps;
+
+ public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
+ return assocWords_ExtendedAssocWords;
+ }
+
+ public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
+ return lemma_ExtendedAssocWords;
+ }
+
+ public void setLemma_ExtendedAssocWords(
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
+ this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
+ }
+
+ public TaxonomyExtenderViaMebMining() {
+ try {
+ sm = ParserChunker2MatcherProcessor.getInstance();
+ } catch (Exception e) { // now try 'local' openNLP
+ System.err.println("Problem loading synt matcher");
+
+ }
+ ps = new PorterStemmer();
+
+ }
+
+ private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
+ List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
+ List<String> toAddAtEnd) {
+ List<List<String>> res = new ArrayList<List<String>>();
+ for (List<ParseTreeChunk> chunks : matchList) {
+ List<String> wordRes = new ArrayList<String>();
+ for (ParseTreeChunk ch : chunks) {
+ List<String> lemmas = ch.getLemmas();
+ for (int w = 0; w < lemmas.size(); w++)
+ if ((!lemmas.get(w).equals("*"))
+ && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
+ .startsWith("VB"))) && lemmas.get(w).length() > 2) {
+ String formedWord = lemmas.get(w);
+ String stemmedFormedWord = ps.stem(formedWord);
+ if (!stemmedFormedWord.startsWith("invalid"))
+ wordRes.add(formedWord);
+ }
+ }
+ wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
+ wordRes.removeAll(queryWordsToRemove);
+ if (wordRes.size() > 0) {
+ wordRes.addAll(toAddAtEnd);
+ res.add(wordRes);
+ }
+ }
+ res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
+ return res;
+ }
+
+ public void extendTaxonomy(String fileName, String domain, String lang) {
+ AriAdapter ad = new AriAdapter();
+ ad.getChainsFromARIfile(fileName);
+ List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
+ try {
+ for (String entity : entries) { // .
+ List<List<String>> paths = ad.lemma_AssocWords.get(entity);
+ for (List<String> taxoPath : paths) {
+ String query = taxoPath.toString() + " " + entity + " " + domain; // todo:
+ // query
+ // forming
+ // function
+ // here
+ query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
+ .replace('_', ' ');
+ List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
+ query, "", lang, 30);
+ List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
+ toRemoveFromExtension.add(entity);
+ toRemoveFromExtension.add(domain);
+ List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(
+ matchList, toRemoveFromExtension, taxoPath);
+ assocWords_ExtendedAssocWords.put(taxoPath, resList);
+ resList.add(taxoPath);
+ lemma_ExtendedAssocWords.put(entity, resList);
+ }
+ }
+ } catch (Exception e) {
+ System.err.println("Problem taxonomy matching");
+ }
+
+ TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords,
+ assocWords_ExtendedAssocWords);
+ ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
+ }
+
+ public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
+ String domain, String lang, int numbOfHits) {
+ List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
+ try {
+ List<String> resultList = search(query, domain, lang, numbOfHits);
+
+ BingResponse resp = populateBingHit(resultList.get(0));
+ // printSearchResult(resultList.get(0));
+ for (int i = 0; i < resp.getHits().size(); i++) {
+ {
+ for (int j = i + 1; j < resp.getHits().size(); j++) {
+ HitBase h1 = resp.getHits().get(i);
+ HitBase h2 = resp.getHits().get(j);
+ String snapshot1 = StringCleaner.processSnapshotForMatching(h1
+ .getTitle() + " . " + h1.getAbstractText());
+ String snapshot2 = StringCleaner.processSnapshotForMatching(h2
+ .getTitle() + " . " + h2.getAbstractText());
+ SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
+ snapshot2);
+ List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
+ genResult.addAll(matchResult);
+ }
+ }
+ }
+
+ } catch (Exception e) {
+ System.err.print("Problem extracting taxonomy node");
+ }
+
+ return genResult;
+ }
+
+ public void close() {
+ sm.close();
+
+ }
+
+ public static void main(String[] args) {
+ TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
+ self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
+ "en");
-public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner{
- private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
- private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
- ParserChunker2MatcherProcessor sm ;
-
- private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
- private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
- private PorterStemmer ps;
-
- public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
- return assocWords_ExtendedAssocWords;
- }
-
- public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
- return lemma_ExtendedAssocWords;
- }
-
- public void setLemma_ExtendedAssocWords(
- Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
- this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
- }
-
- public TaxonomyExtenderViaMebMining(){
- try {
- sm = ParserChunker2MatcherProcessor.getInstance();
- } catch (Exception e){ // now try 'local' openNLP
- System.err.println("Problem loading synt matcher");
-
- }
- ps = new PorterStemmer();
-
- }
-
- private List<List<String>>
- getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
- List<String> toAddAtEnd){
- List<List<String>> res = new ArrayList<List<String>>();
- for(List<ParseTreeChunk> chunks: matchList){
- List<String> wordRes = new ArrayList<String>();
- for (ParseTreeChunk ch: chunks){
- List<String> lemmas = ch.getLemmas();
- for(int w=0; w< lemmas.size(); w++)
- if ( (!lemmas.get(w).equals("*")) &&
- ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("VB"))) &&
- lemmas.get(w).length()>2){
- String formedWord = lemmas.get(w);
- String stemmedFormedWord = ps.stem(formedWord);
- if (!stemmedFormedWord.startsWith("invalid"))
- wordRes.add(formedWord);
- }
- }
- wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
- wordRes.removeAll(queryWordsToRemove);
- if (wordRes.size()>0){
- wordRes.addAll(toAddAtEnd);
- res.add(wordRes);
- }
- }
- res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
- return res;
- }
-
- public void extendTaxonomy(String fileName, String domain, String lang){
- AriAdapter ad = new AriAdapter();
- ad.getChainsFromARIfile(fileName);
- List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
- try {
- for(String entity: entries ){ //.
- List<List<String>> paths = ad.lemma_AssocWords.get(entity);
- for(List<String> taxoPath: paths){
- String query = taxoPath.toString()+ " " + entity + " "+ domain; // todo: query forming function here
- query = query.replace('[', ' ').replace(']',' ').replace(',', ' ').replace('_', ' ');
- List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
- List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
- toRemoveFromExtension.add(entity); toRemoveFromExtension.add(domain);
- List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList, toRemoveFromExtension, taxoPath);
- assocWords_ExtendedAssocWords.put(taxoPath, resList);
- resList.add(taxoPath);
- lemma_ExtendedAssocWords.put(entity, resList);
- }
- }
- } catch (Exception e){
- System.err.println("Problem taxonomy matching");
- }
-
- TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, assocWords_ExtendedAssocWords);
- ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
- }
-
- public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
- List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
- try {
- List<String> resultList = search(query,domain,lang,numbOfHits);
-
- BingResponse resp = populateBingHit(resultList.get(0));
- //printSearchResult(resultList.get(0));
- for(int i=0; i<resp.getHits().size(); i++){
- {
- for( int j=i+1; j<resp.getHits().size(); j++){
- HitBase h1 = resp.getHits().get(i);
- HitBase h2 = resp.getHits().get(j);
- String snapshot1 = StringCleaner.processSnapshotForMatching(h1.getTitle()+ " . "+h1.getAbstractText());
- String snapshot2 = StringCleaner.processSnapshotForMatching(h2.getTitle()+ " . "+h2.getAbstractText());
- SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, snapshot2);
- List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
- genResult.addAll(matchResult);
- }
- }
- }
-
- } catch (Exception e) {
- System.err.print("Problem extracting taxonomy node");
- }
-
- return genResult;
- }
-
- public static void main(String[] args){
- TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
- self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", "en");
-
- }
+ }
}
Modified: opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java?rev=1306658&r1=1306657&r2=1306658&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/SearchResultsProcessorTest.java Thu Mar 29 00:29:11 2012
@@ -1,39 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package opennlp.tools.similarity.apps;
import java.util.List;
import junit.framework.TestCase;
-public class SearchResultsProcessorTest extends TestCase{
- SearchResultsProcessor proc = new SearchResultsProcessor();
-
-
- public void testSearchOrder(){
- List<HitBase> res = proc.runSearch("How can I pay tax on my income abroad");
-
- // we verify that top answers have high similarity score
- System.out.println(res);
- HitBase first = res.get(0);
- assertTrue( first.getGenerWithQueryScore()>3.0);
- //assertTrue(first.getTitle().indexOf("Foreign")>-1 && first.getTitle().indexOf("earned")>-1);
-
- HitBase second = res.get(1);
- assertTrue( second.getGenerWithQueryScore()>1.9);
- //assertTrue(second.getTitle().indexOf("living abroad")>-1);
- proc.close();
-
- }
-
- public void testSearchOrder2(){
- List<HitBase> res = proc.runSearch(
- "Can I estimate what my income tax would be by using my last pay");
-
- System.out.println(res);
- HitBase first = res.get(0);
- assertTrue( first.getGenerWithQueryScore()>1.9);
-
- HitBase second = res.get(1);
- assertTrue( second.getGenerWithQueryScore()>1.9);
- proc.close();
- }
+public class SearchResultsProcessorTest extends TestCase {
+ SearchResultsProcessor proc = new SearchResultsProcessor();
+
+ public void testSearchOrder() {
+ List<HitBase> res = proc.runSearch("How can I pay tax on my income abroad");
+
+ // we verify that top answers have high similarity score
+ System.out.println(res);
+ HitBase first = res.get(0);
+ assertTrue(first.getGenerWithQueryScore() > 3.0);
+ // assertTrue(first.getTitle().indexOf("Foreign")>-1 &&
+ // first.getTitle().indexOf("earned")>-1);
+
+ HitBase second = res.get(1);
+ assertTrue(second.getGenerWithQueryScore() > 1.9);
+ // assertTrue(second.getTitle().indexOf("living abroad")>-1);
+ proc.close();
+
+ }
+
+ public void testSearchOrder2() {
+ List<HitBase> res = proc
+ .runSearch("Can I estimate what my income tax would be by using my last pay");
+
+ System.out.println(res);
+ HitBase first = res.get(0);
+ assertTrue(first.getGenerWithQueryScore() > 1.9);
+
+ HitBase second = res.get(1);
+ assertTrue(second.getGenerWithQueryScore() > 1.9);
+ proc.close();
+ }
}
Added: opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java?rev=1306658&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/test/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyBuildMatchTest.java Thu Mar 29 00:29:11 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class TaxonomyBuildMatchTest extends TestCase {
+
+ public void testTaxonomySeedImport() {
+ AriAdapter ad = new AriAdapter();
+ ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
+ System.out.println(ad.lemma_AssocWords);
+ assertTrue(ad.lemma_AssocWords.size() > 0);
+ }
+
+ public void testTaxonomyBuild() {
+ TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
+ self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
+ "en");
+ self.close();
+ assertTrue(self.getAssocWords_ExtendedAssocWords().size() > 0);
+ }
+
+ public void testTaxonomyMatch() {
+ TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher("src/test/resources/taxonomies/irs_domTaxo.dat");
+ int score = matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
+ "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being ");
+
+ System.out.println("The score is: "+ score);
+ assertTrue(score>3);
+ matcher.close();
+ }
+}