You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by bg...@apache.org on 2012/03/29 00:16:44 UTC
svn commit: r1306623 - in /opennlp/sandbox/opennlp-similarity: ./ resources/
src/main/java/opennlp/tools/similarity/apps/taxo_builder/
src/main/java/opennlp/tools/similarity/apps/utils/
src/main/java/opennlp/tools/textsimilarity/chunker2matcher/
Author: bgalitsky
Date: Wed Mar 28 22:16:43 2012
New Revision: 1306623
URL: http://svn.apache.org/viewvc?rev=1306623&view=rev
Log:
OPENNLP-436
Auto Taxonomy Learner for Search Relevance Improvement based on Similarity
Added:
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java
Removed:
opennlp/sandbox/opennlp-similarity/resources/
Modified:
opennlp/sandbox/opennlp-similarity/pom.xml
opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
Modified: opennlp/sandbox/opennlp-similarity/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/pom.xml?rev=1306623&r1=1306622&r2=1306623&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/pom.xml (original)
+++ opennlp/sandbox/opennlp-similarity/pom.xml Wed Mar 28 22:16:43 2012
@@ -70,6 +70,11 @@
<artifactId>tika-core</artifactId>
<version>0.7</version>
</dependency>
+    <dependency>
+      <!-- XStream 1.4.x is published on Maven Central under groupId com.thoughtworks.xstream,
+           matching the com.thoughtworks.xstream.XStream imports in the taxo_builder classes. -->
+      <groupId>com.thoughtworks.xstream</groupId>
+      <artifactId>xstream</artifactId>
+      <version>1.4.2</version>
+    </dependency>
</dependencies>
<build>
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/AriAdapter.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Makes it possible to use legacy Prolog (".ari") files as the basis for the taxonomy learner.
+ * It strips the Prolog syntax from each clause and collects the remaining plain words as
+ * chains that the taxonomy-extender process can consume.
+ *
+ */
+public class AriAdapter {
+ // Example of an input clause this parser handles:
+ //income_taks(state,company(cafeteria,_)):-do(71100).
+ // Maps a clause's head word (entry) to all word chains found for it across clauses.
+ Map<String, List<List<String>>> lemma_AssocWords = new HashMap<String, List<List<String>>>();
+ /**
+ * Parses the given .ari file and fills {@code lemma_AssocWords}.
+ * NOTE(review): the reader is never closed and any exception is only printed, so a read
+ * failure silently yields a partial or empty map — confirm this best-effort behavior is intended.
+ * @param fileName path of the Prolog .ari file to parse
+ */
+ public void getChainsFromARIfile(String fileName) {
+
+ try {
+ // NOTE(review): uses the platform default charset — TODO confirm the .ari files match it.
+ BufferedReader br = new BufferedReader( new InputStreamReader(new FileInputStream(fileName)));
+ String line;
+ while((line = br.readLine()) != null) {
+ // Skip short lines, Prolog comments ("%") and directives (":...").
+ if (line.length()<10 || line.startsWith("%") || line.startsWith(":"))
+ continue;
+ // Replace all Prolog punctuation/operators with '&', then collapse '&' runs into spaces,
+ // leaving only the clause's words. Also fixes the "taks" -> "tax" spelling in the data.
+ String chain0 = line.replace("_,", "&").replace("_)", "&").replace(":-do(", "&").replace(":-var","&").
+ replace("taks","tax").
+ replace(":- do(", "&").replace("X=","&").replace(":-","&").replace("[X|_]","&").replace("nonvar","&").replace("var","&").
+ replace('(', '&').replace(')', '&').replace(',', '&').replace('.', '&').
+ replace("&&&","&").replace("&&","&").replace("&"," ");
+ String[] chains = chain0.split(" ");
+ List<String> chainList = new ArrayList<String>(); //Arrays.asList(chains);
+ // Keep only words longer than 2 chars that contain no digits 0-5 (filters rule numbers).
+ for(String word: chains){
+ if (word!=null && word.length()>2 && word.indexOf("0")<0 && word.indexOf("1")<0 && word.indexOf("2")<0
+ && word.indexOf("3")<0 && word.indexOf("4")<0 && word.indexOf("5")<0 )
+ chainList.add(word);
+ }
+ if (chains.length<1 || chainList.size()<1 || chainList.get(0).length()<3)
+ continue;
+ // The first surviving word is the entry (clause head); the rest are its associated words.
+ String entry = chainList.get(0);
+ if (entry.length()<3)
+ continue;
+ chainList.remove(entry);
+ List<List<String>> res = lemma_AssocWords.get(entry);
+ if (res==null){
+ List<List<String>> resList = new ArrayList<List<String>>();
+ resList.add(chainList);
+ lemma_AssocWords.put(entry, resList);
+ } else {
+ res.add(chainList);
+ lemma_AssocWords.put(entry, res);
+ }
+ }
+ }catch (Exception e){
+ e.printStackTrace();
+
+ }
+ }
+
+ // Demonstrates the adapter on the IRS domain sample; path assumes the project root as CWD.
+ public static void main(String[] args){
+
+ AriAdapter ad = new AriAdapter();
+ ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
+ System.out.println(ad.lemma_AssocWords);
+
+ }
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/Languages.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,5 @@
+package opennlp.tools.similarity.apps.taxo_builder;
+
+/** Languages available to the taxonomy builder. */
+// NOTE(review): unlike the sibling files in this commit, this file lacks the Apache license header.
+public enum Languages {
+ ENGLISH,SPANISH,GERMAN,FRENCH,ITALIAN
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import opennlp.tools.similarity.apps.utils.FileHandler;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+import com.thoughtworks.xstream.XStream;
+
+
+/**
+ * Generates scores based on the overlap between a query/snapshot text pair and a taxonomy
+ * previously serialized by {@link TaxonomySerializer}.
+ *
+ */
+public class TaxoQuerySnapshotMatcher {
+
+ // Parser/chunker used only for its tokenizer here.
+ ParserChunker2MatcherProcessor sm ;
+ //XStream xStream= new XStream();
+ // Taxonomy lookup: entity lemma -> lists of associated (extended) words.
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords;
+ TaxonomySerializer taxo;
+ private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");
+
+
+ // NOTE(review): taxonomy path is hard-coded; loading works only when run from the project root.
+ public TaxoQuerySnapshotMatcher() {
+ sm = ParserChunker2MatcherProcessor.getInstance();
+ taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");
+ }
+ /**
+ * Generates a score based on the overlap between a text and the loaded taxonomy.
+ * NOTE(review): the retainAll calls below mutate the lists stored inside the taxonomy map,
+ * so repeated calls see progressively shrunken parameter lists — confirm this is intended.
+ * @param query The query string the user used to ask a question.
+ * @param snapshot The abstract of a hit the system gave back
+ * @return the number of distinct taxonomy parameters shared by query and snapshot
+ */
+ public int getTaxoScore(String query, String snapshot){
+
+ lemma_ExtendedAssocWords=(HashMap<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();
+
+ query=query.toLowerCase();
+ snapshot=snapshot.toLowerCase();
+ String[] queryWords = sm.getTokenizer().tokenize(query);
+ String[] snapshotWords = sm.getTokenizer().tokenize(snapshot);
+
+ List<String> queryList = Arrays.asList(queryWords);
+ List<String> snapshotList = Arrays.asList(snapshotWords);
+
+ // Tokens occurring in both query and snapshot; these are the candidate taxonomy entries.
+ List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList));
+ commonBetweenQuerySnapshot.retainAll(snapshotList);//Still could be duplicated words (even more if I would retain all the opposite ways)
+
+ int score = 0;
+ // Parameters already counted, so the same parameter set is not scored twice.
+ List<String> accumCommonParams = new ArrayList<String>();
+ for(String qWord: commonBetweenQuerySnapshot){
+ if (!lemma_ExtendedAssocWords.containsKey(qWord))
+ continue;
+ List<List<String>> foundParams = new ArrayList<List<String>>();
+ foundParams=lemma_ExtendedAssocWords.get(qWord);
+
+ for(List<String> paramsForGivenMeaning: foundParams){
+ // Keep only the taxonomy parameters present in BOTH query and snapshot.
+ paramsForGivenMeaning.retainAll(queryList);
+ paramsForGivenMeaning.retainAll(snapshotList);
+ int size = paramsForGivenMeaning.size();
+
+ if (size>0 && !accumCommonParams.containsAll(paramsForGivenMeaning)){
+ score+=size;
+ accumCommonParams.addAll(paramsForGivenMeaning);
+ }
+ }
+ }
+ return score;
+ }
+
+ /**
+ * Serializes a taxonomy (loaded from its .dat form) into a much more readable XML format.
+ * @param taxonomyXML_Path destination path for the XML output
+ * @param taxo the taxonomy to serialize
+ * */
+
+ public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo){
+ XStream xStream = new XStream();
+ FileHandler fileHandler = new FileHandler();
+ try {
+ fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.info(e.toString());
+ }
+
+ }
+
+ // Loads a taxonomy back from its XML form (counterpart of convertDatToXML).
+ // NOTE(review): assigns to a fresh local matcher that is then discarded, not to this instance — verify.
+ public void xmlWork (){
+ TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
+ XStream xStream = new XStream();
+ FileHandler fileHandler = new FileHandler();
+ matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
+ }
+ /**
+ * Demonstrates the usage of the taxonomy matcher.
+ * @param args unused
+ */
+ static public void main(String[] args){
+
+ TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
+
+ System.out.println("The score is: "+matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
+ "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
+
+
+ }
+}
+
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import opennlp.tools.similarity.apps.BingResponse;
+import opennlp.tools.similarity.apps.BingWebQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+import opennlp.tools.similarity.apps.utils.StringCleaner;
+import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.SentencePairMatchResult;
+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+
+
+/**
+ * Extends a manually built taxonomy kernel by web mining. The results of taxonomy
+ * learning are two maps:
+ * 0) input: for an entity like "tax", all lists of associated parameters obtained from the
+ * taxonomy kernel (built manually, loaded via AriAdapter).
+ * Given 0), the derived parameter lists are obtained as commonalities of search-result snapshots:
+ * output map 1) for the entity, the derived lists;
+ * output map 2) for each manual list of words, the derived lists of words.
+ *
+ * NOTE(review): "MebMining" in the class name looks like a typo for "WebMining".
+ */
+
+
+public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner{
+ // NOTE(review): logger name references "TaxonomyExtenderSearchResultFromYahoo", not this class.
+ private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
+ private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+ ParserChunker2MatcherProcessor sm ;
+
+ // Output map 1): entity lemma -> derived (extended) association word lists.
+ private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
+ // Output map 2): manual taxonomy path -> derived (extended) association word lists.
+ private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
+ private PorterStemmer ps;
+
+ public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
+ return assocWords_ExtendedAssocWords;
+ }
+
+ public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
+ return lemma_ExtendedAssocWords;
+ }
+
+ public void setLemma_ExtendedAssocWords(
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
+ this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
+ }
+
+ // NOTE(review): a failed parser load is only logged to stderr; sm stays null and later
+ // calls to runSearchForTaxonomyPath would NPE inside their own catch-all.
+ public TaxonomyExtenderViaMebMining(){
+ try {
+ sm = ParserChunker2MatcherProcessor.getInstance();
+ } catch (Exception e){ // now try 'local' openNLP
+ System.err.println("Problem loading synt matcher");
+
+ }
+ ps = new PorterStemmer();
+
+ }
+
+ // Collects nouns/verbs (length > 2, not the "*" placeholder, stem not starting with "invalid")
+ // from the matched parse-tree chunks, de-duplicates them, removes the query words, and appends
+ // toAddAtEnd to each surviving list.
+ private List<List<String>>
+ getCommonWordsFromList_List_ParseTreeChunk(List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
+ List<String> toAddAtEnd){
+ List<List<String>> res = new ArrayList<List<String>>();
+ for(List<ParseTreeChunk> chunks: matchList){
+ List<String> wordRes = new ArrayList<String>();
+ for (ParseTreeChunk ch: chunks){
+ List<String> lemmas = ch.getLemmas();
+ for(int w=0; w< lemmas.size(); w++)
+ if ( (!lemmas.get(w).equals("*")) &&
+ ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("VB"))) &&
+ lemmas.get(w).length()>2){
+ String formedWord = lemmas.get(w);
+ String stemmedFormedWord = ps.stem(formedWord);
+ if (!stemmedFormedWord.startsWith("invalid"))
+ wordRes.add(formedWord);
+ }
+ }
+ wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
+ wordRes.removeAll(queryWordsToRemove);
+ if (wordRes.size()>0){
+ wordRes.addAll(toAddAtEnd);
+ res.add(wordRes);
+ }
+ }
+ res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
+ return res;
+ }
+
+ /**
+ * Extends each taxonomy path from the .ari file with words mined from web search results,
+ * then serializes the result next to the input (".ari" replaced by "Taxo.dat").
+ * @param fileName path of the Prolog .ari seed file
+ * @param domain domain keyword appended to every search query (e.g. "tax")
+ * @param lang search language code
+ */
+ public void extendTaxonomy(String fileName, String domain, String lang){
+ AriAdapter ad = new AriAdapter();
+ ad.getChainsFromARIfile(fileName);
+ List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
+ try {
+ for(String entity: entries ){ //.
+ List<List<String>> paths = ad.lemma_AssocWords.get(entity);
+ for(List<String> taxoPath: paths){
+ String query = taxoPath.toString()+ " " + entity + " "+ domain; // todo: query forming function here
+ query = query.replace('[', ' ').replace(']',' ').replace(',', ' ').replace('_', ' ');
+ List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(query, "", lang, 30);
+ // Words to exclude from the extension: the path itself plus the entity and domain.
+ List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
+ toRemoveFromExtension.add(entity); toRemoveFromExtension.add(domain);
+ List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList, toRemoveFromExtension, taxoPath);
+ assocWords_ExtendedAssocWords.put(taxoPath, resList);
+ resList.add(taxoPath);
+ lemma_ExtendedAssocWords.put(entity, resList);
+ }
+ }
+ } catch (Exception e){
+ // NOTE(review): failures are swallowed; a partial taxonomy is still serialized below.
+ System.err.println("Problem taxonomy matching");
+ }
+
+ TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, assocWords_ExtendedAssocWords);
+ ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
+ }
+
+ // Runs a web search for the query and, for every PAIR of hits, accumulates the parse-tree
+ // chunks their title+abstract snapshots have in common (O(n^2) assessRelevance calls).
+ public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) {
+ List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
+ try {
+ List<String> resultList = search(query,domain,lang,numbOfHits);
+
+ BingResponse resp = populateBingHit(resultList.get(0));
+ //printSearchResult(resultList.get(0));
+ for(int i=0; i<resp.getHits().size(); i++){
+ {
+ for( int j=i+1; j<resp.getHits().size(); j++){
+ HitBase h1 = resp.getHits().get(i);
+ HitBase h2 = resp.getHits().get(j);
+ String snapshot1 = StringCleaner.processSnapshotForMatching(h1.getTitle()+ " . "+h1.getAbstractText());
+ String snapshot2 = StringCleaner.processSnapshotForMatching(h2.getTitle()+ " . "+h2.getAbstractText());
+ SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, snapshot2);
+ List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
+ genResult.addAll(matchResult);
+ }
+ }
+ }
+
+ } catch (Exception e) {
+ // NOTE(review): swallowed; caller receives whatever was accumulated before the failure.
+ System.err.print("Problem extracting taxonomy node");
+ }
+
+ return genResult;
+ }
+
+ // Demonstrates taxonomy extension on the IRS domain sample; path assumes the project root as CWD.
+ public static void main(String[] args){
+ TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
+ self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", "en");
+
+ }
+
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps.taxo_builder;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Stores the learned taxonomy on the file system via Java object serialization and reads it
+ * back. The instance itself is what gets serialized, carrying both result maps of the learner.
+ * @author Boris
+ *
+ */
+public class TaxonomySerializer implements Serializable {
+
+ private static final long serialVersionUID = 7431412616514648388L;
+ // entity lemma -> extended association word lists.
+ private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
+ // manual word list -> extended association word lists.
+ private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
+
+
+ public TaxonomySerializer(
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords,
+ Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords) {
+
+ this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
+ this.assocWords_ExtendedAssocWords = assocWords_ExtendedAssocWords;
+ }
+ public TaxonomySerializer() {
+ // No-arg constructor: fields keep their empty-map defaults.
+ }
+ public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
+ return assocWords_ExtendedAssocWords;
+ }
+ public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
+ return lemma_ExtendedAssocWords;
+ }
+ public void setLemma_ExtendedAssocWords(
+ Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
+ this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
+ }
+ public void setAssocWords_ExtendedAssocWords(
+ Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords) {
+ this.assocWords_ExtendedAssocWords = assocWords_ExtendedAssocWords;
+ }
+
+ // Serializes this taxonomy to the given file.
+ // NOTE(review): close() sits inside the try, so if writeObject throws, the streams leak;
+ // the IOException itself is only printed.
+ public void writeTaxonomy(String filename){
+ FileOutputStream fos = null;
+ ObjectOutputStream out = null;
+ try {
+ fos = new FileOutputStream(filename);
+ out = new ObjectOutputStream(fos);
+ out.writeObject(this);
+ out.close();
+ }
+ catch(IOException ex) { ex.printStackTrace(); }
+
+ }
+
+ // Deserializes a taxonomy from the given file.
+ // NOTE(review): returns null when reading fails (errors are only printed) — callers must
+ // handle a null result.
+ public static TaxonomySerializer readTaxonomy(String filename){
+ TaxonomySerializer data = null;
+ FileInputStream fis = null;
+ ObjectInputStream in = null;
+ try
+ {
+ fis = new FileInputStream(filename);
+ in = new ObjectInputStream(fis);
+ data = (TaxonomySerializer)in.readObject();
+ in.close();
+ }
+ catch(IOException ex) { ex.printStackTrace(); }
+ catch(ClassNotFoundException ex) { ex.printStackTrace(); }
+
+ //System.out.print(data.lemma_ExtendedAssocWords);
+
+ return data;
+
+ }
+}
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/taxonomy.txt Wed Mar 28 22:16:43 2012
@@ -0,0 +1 @@
[... 3 lines stripped ...]
Added: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java?rev=1306623&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java (added)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java Wed Mar 28 22:16:43 2012
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps.utils;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.log4j.Logger;
+
+
+/**
+ * This class is responsible for saving data to files as well as reading it back.
+ * It can handle both text and binary files.
+ */
+public class FileHandler {
+
+ private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.utils.FileHandler");
+
+
+ /**
+ * Writes data plus a trailing newline to filepath, overwriting or appending per the flag.
+ * NOTE(review): declared to throw IOException, but the exception is caught and logged
+ * inside — callers never observe a failure.
+ */
+ public void writeToTextFile(String data,String filepath,boolean append) throws IOException {
+ try{
+ BufferedWriter out = new BufferedWriter(new FileWriter(filepath, append));
+ out.write(data + "\n");
+ out.close();
+ } catch (IOException e) {
+ LOG.error(e);
+ e.printStackTrace();
+ }
+ }
+ /**
+ * Writes data from an ArrayList<String> to a text file, one list element per line.
+ * NOTE(review): in the overwrite branch only the underlying FileWriter is closed, never the
+ * PrintWriter wrapping it, so buffered output may be lost — verify.
+ * @param list lines to write
+ * @param filePath destination file
+ * @param append when true, lines are appended via appendtofile (first one prefixed by "\n")
+ * @throws Exception
+ */
+ public void writeToTextFile(ArrayList<String> list, String filePath, boolean append) throws Exception {
+ FileWriter outFile = null;
+ Iterator<String> it = list.iterator();
+ if (!append) {
+ outFile = new FileWriter(filePath);
+ PrintWriter out = new PrintWriter(outFile);
+ while (it.hasNext()) {
+ out.println((String) it.next());
+ }
+ outFile.close();
+ } else {
+ int tmp = 0;
+ while (it.hasNext()) {
+ if (tmp == 0) {
+ appendtofile("\n" + (String) it.next(), filePath);
+ } else {
+ appendtofile((String) it.next(), filePath);
+ }
+ tmp++;
+ }
+ }
+ }
+
+ /**
+ * Serializes obj to filepath via Java object serialization, creating the parent folder first.
+ * NOTE(review): the append parameter is ignored, and the stream is never flushed or closed —
+ * the handle leaks and buffered data may be lost.
+ */
+ public void writeObjectToFile(Object obj, String filepath, boolean append) {
+ if(!isFileOrDirectoryExists(getDirPathfromFullPath(filepath))){
+ createFolder(getDirPathfromFullPath(filepath));
+ }
+ ObjectOutputStream outputStream = null;
+ try {
+ outputStream = new ObjectOutputStream(new FileOutputStream(filepath));
+ outputStream.writeObject(obj);
+ } catch (IOException e) {
+ LOG.error(e);
+ }
+ }
+ /**
+ * Reads the first serialized object from the given file.
+ * @return the first object in the file, or null when reading fails (errors are only logged)
+ */
+ public Object readObjectfromFile(String filePath){
+ ObjectInputStream inputStream = null;
+ try {
+ //Construct the ObjectInputStream object
+ inputStream = new ObjectInputStream(new FileInputStream(filePath));
+ Object obj = null;
+ while ((obj = inputStream.readObject()) != null) {
+ return obj;
+ }
+ } catch (EOFException ex) { //This exception will be caught when EOF is reached
+ LOG.error("End of file reached.",ex);
+ } catch (ClassNotFoundException ex) {
+ LOG.error(ex);
+ } catch (FileNotFoundException ex) {
+ LOG.error(ex);
+ } catch (IOException ex) {
+ LOG.error(ex);
+ } finally {
+ //Close the ObjectInputStream
+ try {
+ if (inputStream != null) {
+ inputStream.close();
+ }
+ } catch (IOException ex) {
+ LOG.error(ex);
+ }
+ }
+ return null;
+ }
+ /**
+ * Creates a byte array from any object via Java serialization.
+ *
+ * (Intended for writing objects out to files; not in use right now and may move to another class.)
+ *
+ * @param obj the object to serialize; must implement Serializable
+ * @return the serialized form of obj
+ * @throws java.io.IOException if serialization fails
+ */
+ public byte[] getBytes(Object obj) throws java.io.IOException{
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ ObjectOutputStream oos = new ObjectOutputStream(bos);
+ oos.writeObject(obj);
+ oos.flush();
+ oos.close();
+ bos.close();
+ byte [] data = bos.toByteArray();
+ return data;
+ }
+
+ /**
+ * Fetches all content from a text file and returns it as a single String, normalizing
+ * line endings to the platform line separator.
+ * @param filePath path of the file to read
+ * @return the file contents, or "" when the file could not be read (errors are only logged)
+ */
+ public String readFromTextFile(String filePath) {
+ StringBuilder contents = new StringBuilder();
+ // ...checks on aFile are edited
+ File aFile = new File(filePath);
+
+ try {
+ // use buffering, reading one line at a time
+ // FileReader always assumes default encoding is OK!
+ // TODO be sure that the default encoding is OK!!!!! Otherwise
+ // change it
+
+ BufferedReader input = new BufferedReader(new FileReader(aFile));
+ try {
+ String line = null; // not declared within while loop
+ /*
+ * readLine is a bit quirky : it returns the content of a line
+ * MINUS the newline. it returns null only for the END of the
+ * stream. it returns an empty String if two newlines appear in
+ * a row.
+ */
+ while ((line = input.readLine()) != null) {
+ contents.append(line);
+ contents.append(System.getProperty("line.separator"));
+ }
+ } finally {
+ input.close();
+ }
+ } catch (IOException ex) {
+ LOG.error("fileName: "+filePath,ex);
+ }
+ return contents.toString();
+ }
+ /**
+ * Reads a text file line-wise; each line becomes an element in the resulting list.
+ * @param filePath path of the file to read
+ * @return the lines of the file (empty list when the file could not be read)
+ */
+ public List<String> readLinesFromTextFile(String filePath){
+ List<String> lines= new ArrayList<String>();
+ // ...checks on aFile are edited
+ File aFile = new File(filePath);
+ try {
+ // use buffering, reading one line at a time
+ // FileReader always assumes default encoding is OK!
+ // TODO be sure that the default encoding is OK!!!!! Otherwise
+ // change it
+
+ BufferedReader input = new BufferedReader(new FileReader(aFile));
+ try {
+ String line = null; // not declared within while loop
+ /*
+ * readLine is a bit quirky : it returns the content of a line
+ * MINUS the newline. it returns null only for the END of the
+ * stream. it returns an empty String if two newlines appear in
+ * a row.
+ */
+ while ((line = input.readLine()) != null) {
+ lines.add(line);
+ }
+ } finally {
+ input.close();
+ }
+ } catch (IOException ex) {
+ LOG.error(ex);
+ }
+ return lines;
+ }
+
+
+
+ // Appends one line of text to the file.
+ // NOTE(review): IOException is silently swallowed — the caller cannot detect a failed write.
+ private void appendtofile(String data, String filePath) {
+ try {
+ BufferedWriter out = new BufferedWriter(new FileWriter(filePath,true));
+ out.write(data + "\n");
+ out.close();
+ } catch (IOException e) {
+ }
+ }
+ // Creates the folder (including missing parents) if it does not exist yet.
+ public void createFolder(String path){
+ if(!isFileOrDirectoryExists(path)){
+ File file = new File(path);
+ try{
+ file.mkdirs();
+ }catch (Exception e) {
+ LOG.error("Directory already exists or the file-system is read only",e);
+ }
+ }
+ }
+ // Returns true when a file or directory exists at the given path.
+ public boolean isFileOrDirectoryExists(String path){
+ File file=new File(path);
+ boolean exists = file.exists();
+ return exists;
+ }
+ /**
+ * Separates the directory path from a full file path.
+ * NOTE(review): compares Strings with != (reference identity) and only handles the Windows
+ * separator "\\" — returns "" for paths using "/".
+ * @param filePath full path of a file
+ * @return the containing directory, or "" when it cannot be determined
+ */
+ private String getDirPathfromFullPath(String filePath){
+ String dirPath="";
+ if(filePath!=null){
+ if(filePath!=""&&filePath.contains("\\"))
+ dirPath =filePath.substring(0,filePath.lastIndexOf("\\"));
+ }
+ return dirPath;
+ }
+ /**
+ * Returns the file names of the files in a folder (names only, not paths; not recursive).
+ * NOTE(review): listFiles() returns null when dirPath is not a readable directory, which
+ * would make the loop below throw NullPointerException — verify callers guarantee a directory.
+ * @param dirPath directory to list
+ * @return the names of the plain files directly inside dirPath
+ */
+ public ArrayList<String> getFileNamesInFolder(String dirPath){
+ ArrayList<String> fileNames= new ArrayList<String>();
+
+ File folder = new File(dirPath);
+ File[] listOfFiles = folder.listFiles();
+
+ for (int i = 0; i < listOfFiles.length; i++) {
+ if (listOfFiles[i].isFile()) {
+ fileNames.add(listOfFiles[i].getName());
+ } else if (listOfFiles[i].isDirectory()) {
+ //TODO if I want to use it recursive I should handle this case
+ }
+ }
+ return fileNames;
+ }
+
+ public void deleteAllfilesinDir(String dirName){
+ ArrayList<String> fileNameList=getFileNamesInFolder(dirName);
+ if(fileNameList!=null){
+ for(int i=0; i<fileNameList.size();i++){
+ try{
+ deleteFile(dirName+fileNameList.get(i));
+ }catch(IllegalArgumentException e){
+ LOG.error("No way to delete file: "+dirName+fileNameList.get(i),e);
+ }
+ }
+ }
+ }
+ public void deleteFile(String filePath) throws IllegalArgumentException{
+ // A File object to represent the filename
+ File f = new File(filePath);
+ // Make sure the file or directory exists and isn't write protected
+ if (!f.exists())
+ throw new IllegalArgumentException(
+ "Delete: no such file or directory: " + filePath);
+
+ if (!f.canWrite())
+ throw new IllegalArgumentException("Delete: write protected: "
+ + filePath);
+ // If it is a directory, make sure it is empty
+ if (f.isDirectory()) {
+ String[] files = f.list();
+ if (files.length > 0)
+ throw new IllegalArgumentException(
+ "Delete: directory not empty: " + filePath);
+ }
+ // Attempt to delete it
+ boolean success = f.delete();
+ if (!success)
+ throw new IllegalArgumentException("Delete: deletion failed");
+ }
+
+ public boolean deleteDirectory(File path) {
+ if( path.exists() ) {
+ File[] files = path.listFiles();
+ for(int i=0; i<files.length; i++) {
+ if(files[i].isDirectory()) {
+ deleteDirectory(files[i]);
+ }
+ else {
+ files[i].delete();
+ }
+ }
+ }
+ return( path.delete() );
+ }
+
+ /**
+ * Returns the absolute-file-paths of the files in a directory (not recursive)
+ * @param dirPath
+ * @return
+ */
+ public ArrayList<String> getFilePathsInFolder(String dirPath){
+ ArrayList<String> filePaths= new ArrayList<String>();
+
+ File folder = new File(dirPath);
+ File[] listOfFiles = folder.listFiles();
+ if(listOfFiles==null)
+ return null;
+ for (int i = 0; i < listOfFiles.length; i++) {
+ if (listOfFiles[i].isFile()) {
+ filePaths.add(listOfFiles[i].getAbsolutePath());
+ } else if (listOfFiles[i].isDirectory()) {
+ //TODO if I want to use it recursive I should handle this case
+ }
+ }
+ return filePaths;
+ }
+ /**
+ * Returns the number of individual files in a directory (Not ercursive)
+ * @param dirPath
+ * @return
+ */
+ public int getFileNumInFolder(String dirPath){
+ int num=0;
+ try{
+ num=getFileNamesInFolder(dirPath).size();
+ }catch (Exception e) {
+ num=0;
+ }
+ return num;
+ }
+
+}
Modified: opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java?rev=1306623&r1=1306622&r2=1306623&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java (original)
+++ opennlp/sandbox/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java Wed Mar 28 22:16:43 2012
@@ -74,12 +74,36 @@ public class ParserChunker2MatcherProces
private static final String MODEL_DIR_KEY = "nlp.models.dir";
// TODO config
// this is where resources should live
- private static String MODEL_DIR, MODEL_DIR_REL = "resources/models111";
+ private static String MODEL_DIR, MODEL_DIR_REL = "resources/models";
protected static ParserChunker2MatcherProcessor instance;
private SentenceDetector sentenceDetector;
private Tokenizer tokenizer;
private POSTagger posTagger;
+ public SentenceDetector getSentenceDetector() {
+ return sentenceDetector;
+ }
+
+ public void setSentenceDetector(SentenceDetector sentenceDetector) {
+ this.sentenceDetector = sentenceDetector;
+ }
+
+ public Tokenizer getTokenizer() {
+ return tokenizer;
+ }
+
+ public void setTokenizer(Tokenizer tokenizer) {
+ this.tokenizer = tokenizer;
+ }
+
+ public ChunkerME getChunker() {
+ return chunker;
+ }
+
+ public void setChunker(ChunkerME chunker) {
+ this.chunker = chunker;
+ }
+
private Parser parser;
private ChunkerME chunker;
private final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
@@ -261,17 +285,17 @@ public class ParserChunker2MatcherProces
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length){
- LOG.info("disagreement between toks and tags; sent = '"+sentence + "'\n tags = "+tags +
+ LOG.finest("disagreement between toks and tags; sent = '"+sentence + "'\n tags = "+tags +
"\n will now try this sentence in lower case" );
node = parseSentenceNode(sentence.toLowerCase());
if (node==null){
- LOG.info("Problem parsing sentence '"+sentence);
+ LOG.finest("Problem parsing sentence '"+sentence);
return null;
}
POSlist = node.getOrderedPOSList();
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length){
- LOG.info("AGAIN: disagreement between toks and tags for lower case! ");
+ LOG.finest("AGAIN: disagreement between toks and tags for lower case! ");
if (toks.length>tags.length){
String[] newToks = new String[tags.length];
for(int i = 0; i<tags.length; i++ ){