You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2015/07/28 11:16:26 UTC
svn commit: r1693045 - in /opennlp/sandbox/opennlp-wsd/src:
main/java/opennlp/tools/disambiguator/
main/java/opennlp/tools/disambiguator/DatasetsReader/
main/java/opennlp/tools/disambiguator/ims/
test/java/opennlp/tools/disambiguator/
Author: joern
Date: Tue Jul 28 09:16:25 2015
New Revision: 1693045
URL: http://svn.apache.org/r1693045
Log:
OPENNLP-790
- Fix for the IMS approach to support Semcor3.0 data
- The output format is now [Source SenseKey] so it corresponds to that of Lesk.
- Removed some unused variables.
- Added some parameters to let the user select the source of data to use.
- Implemented the IMS Evaluator.
- Added and clarified some parts of the documentation.
Thanks to Mondher Bouazizi for providing a patch.
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java (with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java (with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java (with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java (with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java (with props)
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java (with props)
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java Tue Jul 28 09:16:25 2015
@@ -19,8 +19,14 @@
package opennlp.tools.disambiguator;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import opennlp.tools.disambiguator.lesk.Lesk;
import net.sf.extjwnl.JWNLException;
@@ -28,6 +34,11 @@ import net.sf.extjwnl.data.POS;
public class Constants {
+ private static String resourcesFolder = "src\\test\\resources\\";
+
+ private static String englishDict = resourcesFolder
+ + "models\\en-lemmatizer.dict";
+
public static String osPathChar = "\\";
// List of all the PoS tags
@@ -133,16 +144,19 @@ public class Constants {
if (results != null) {
if (disambiguator instanceof Lesk) {
+ POS pos;
+ long offset;
+ double score;
String[] parts;
for (String result : results) {
- parts = result.split(" ");
+ parts = result.split("@");
+ pos = POS.getPOSForKey(parts[0]);
+ offset = Long.parseLong(parts[1]);
+ score = Double.parseDouble(parts[3]);
try {
- Constants.print("score : "
- + parts[2]
- + " for : "
- + Loader.getDictionary().getWordBySenseKey(parts[1])
- .getSynset().getGloss());
+ Constants.print("score : " + score + " for : "
+ + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
} catch (JWNLException e) {
e.printStackTrace();
}
@@ -183,7 +197,60 @@ public class Constants {
}
}
- // return the PoS (Class POS) out of the PoS-tag
+ /**
+  * Extracts the list of all English words from a lemmatizer dictionary file.
+  * Every line is split on tabs and only its first column (the word) is kept.
+  *
+  * @param dict
+  *          path of the dictionary file; this file is the same that is used
+  *          in the simple Lemmatizer (i.e., "en-lemmatizer.dict"). When
+  *          {@code null}, the default dictionary location is used instead.
+  *
+  * @return a map whose keys are the words read from the file (every value is
+  *         {@code null}); the map is empty when the file does not exist
+  */
+ public static HashMap<String, Object> getEnglishWords(String dict) {
+
+   HashMap<String, Object> words = new HashMap<String, Object>();
+
+   // Honor the caller-supplied path; only fall back to the built-in default
+   // when no path is given (the original ignored the parameter entirely).
+   File file = new File(dict != null ? dict : englishDict);
+
+   if (!file.exists()) {
+     return words;
+   }
+
+   BufferedReader br = null;
+
+   try {
+     br = new BufferedReader(new FileReader(file));
+     // Read-then-test, so that the first line of the file is processed too.
+     String line;
+     while ((line = br.readLine()) != null) {
+       String word = line.split("\\t")[0];
+       words.put(word, null);
+     }
+   } catch (FileNotFoundException e) {
+     e.printStackTrace();
+   } catch (IOException e) {
+     e.printStackTrace();
+   } finally {
+     if (br != null) {
+       try {
+         br.close();
+       } catch (IOException e) {
+         e.printStackTrace();
+       }
+     }
+   }
+
+   return words;
+ }
+
+ /**
+ * return the PoS (Class POS) out of the PoS-tag
+ *
+ * @param posTag
+ * PoS tag (e.g., "JJS", "NNP", etc.)
+ * @return the Part of Speech (type {@link POS})
+ */
public static POS getPOS(String posTag) {
ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ",
@@ -208,16 +275,73 @@ public class Constants {
}
+ /**
+ * Check whether a PoS Tag is relevant or not. A PoS Tag is considered
+ * relevant when it corresponds to:
+ * <ul>
+ * <li>VERB</li>
+ * <li>ADJECTIVE</li>
+ * <li>ADVERB</li>
+ * <li>NOUN</li>
+ * </ul>
+ *
+ * @param posTag
+ * the PoS Tag to verify the relevance.
+ * @return whether a PoS Tag corresponds to a relevant Part of Speech (type
+ * {@link POS}) or not ({@code true} if it is, {@code false} otherwise)
+ */
public static boolean isRelevant(String posTag) {
return getPOS(posTag) != null;
}
+ /**
+ * Check whether a PoS Tag is relevant or not. A PoS Tag is considered
+ * relevant when it is:
+ * <ul>
+ * <li>VERB</li>
+ * <li>ADJECTIVE</li>
+ * <li>ADVERB</li>
+ * <li>NOUN</li>
+ * </ul>
+ *
+ * @param pos
+ * The Part of Speech of Type {@link POS}
+ * @return whether a Part of Speech is relevant (true) or not (false)
+ */
public static boolean isRelevant(POS pos) {
return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
|| pos.equals(POS.NOUN) || pos.equals(POS.VERB);
}
- // Check whether a list of arrays contains an array
+ /**
+  * Maps a PoS tag to a one-letter part-of-speech abbreviation.
+  *
+  * @param posTag
+  *          the PoS tag (e.g., "JJS", "NNP", etc.); may be {@code null}
+  * @return "a" for tags starting with "JJ", "r" for tags starting with "RB",
+  *         "v" for tags starting with "VB" or equal to "MD", "n" for tags
+  *         starting with "NN"; {@code null} for a {@code null} or any other
+  *         tag
+  */
+ public static String getPOSabbreviation(String posTag) {
+
+   String abbreviation = null;
+
+   if (posTag != null) {
+     boolean isVerb = posTag.startsWith("VB") || posTag.equals("MD");
+     if (posTag.startsWith("JJ")) {
+       abbreviation = "a";
+     } else if (posTag.startsWith("RB")) {
+       abbreviation = "r";
+     } else if (isVerb) {
+       abbreviation = "v";
+     } else if (posTag.startsWith("NN")) {
+       abbreviation = "n";
+     }
+   }
+
+   return abbreviation;
+
+ }
+
+ /**
+ * Check whether a list of arrays contains an array
+ *
+ * @param array
+ * The array To check
+ * @param fullList
+ * The full list of Arrays
+ * @return whether the {@link ArrayList} of arrays contains the array (true)
+ * or not (false)
+ */
public static boolean belongsTo(String[] array, ArrayList<String[]> fullList) {
for (String[] refArray : fullList) {
if (areStringArraysEqual(array, refArray))
@@ -226,7 +350,15 @@ public class Constants {
return false;
}
- // Check whether two arrays of strings are equal
+ /**
+ * Check whether two arrays of strings are equal
+ *
+ * @param array1
+ * first array
+ * @param array2
+ * second array
+ * @return whether the two arrays are identical (true) or not (false)
+ */
public static boolean areStringArraysEqual(String[] array1, String[] array2) {
if (array1.equals(null) || array2.equals(null))
@@ -244,4 +376,5 @@ public class Constants {
return true;
}
+
}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.util.ArrayList;
+
+/**
+ * A paragraph of a dataset document: a paragraph number together with the
+ * ordered list of the sentences it contains.
+ */
+public class IParagraph {
+
+  protected int pnum;
+  protected ArrayList<ISentence> isentences;
+
+  /** Creates a paragraph with no number set and an empty sentence list. */
+  public IParagraph() {
+    super();
+    this.isentences = new ArrayList<ISentence>();
+  }
+
+  /**
+   * Creates an empty paragraph.
+   *
+   * @param pnum
+   *          id of the paragraph
+   */
+  public IParagraph(int pnum) {
+    super();
+    this.pnum = pnum;
+    this.isentences = new ArrayList<ISentence>();
+  }
+
+  /**
+   * Creates a paragraph backed by the given sentence list (the list is stored
+   * as is, not copied).
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param sentences
+   *          the sentences of the paragraph
+   */
+  public IParagraph(int pnum, ArrayList<ISentence> sentences) {
+    super();
+    this.pnum = pnum;
+    this.isentences = sentences;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public ArrayList<ISentence> getSsentences() {
+    return isentences;
+  }
+
+  public void setIsentences(ArrayList<ISentence> isentences) {
+    this.isentences = isentences;
+  }
+
+  public void addIsentence(ISentence isentence) {
+    this.isentences.add(isentence);
+  }
+
+  /**
+   * @return the sentences of the paragraph joined by single spaces; the empty
+   *         string when the paragraph has no sentence
+   */
+  @Override
+  public String toString() {
+    StringBuilder paragraph = new StringBuilder();
+    for (int i = 0; i < this.isentences.size(); i++) {
+      // Separator goes between sentences only, so an empty paragraph yields
+      // "" instead of failing on substring(1, 0).
+      if (i > 0) {
+        paragraph.append(" ");
+      }
+      paragraph.append(this.isentences.get(i).toString());
+    }
+    return paragraph.toString();
+
+  }
+
+  /**
+   * Returns {@code true} if and only if the paragraph contains the word and
+   * it is sense-tagged.
+   *
+   * @param wordTag
+   *          the word to look for, in the "lemma.tag" form expected by
+   *          {@link IWord#isInstanceOf(String)}
+   * @return {@code true} if the word exists in the paragraph and is
+   *         sense-tagged, {@code false} otherwise
+   *
+   */
+  public boolean contains(String wordTag) {
+
+    for (ISentence isentence : this.getSsentences()) {
+      for (IWord iword : isentence.getIwords()) {
+        // Compare against the requested word tag (the previous code compared
+        // each word with itself, which is always true).
+        if (iword.isInstanceOf(wordTag))
+          return true;
+      }
+    }
+
+    return false;
+  }
+
+}
Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.util.ArrayList;
+
+/**
+ * A sentence of a dataset document: its paragraph and sentence numbers
+ * together with the ordered list of its words and punctuation marks.
+ */
+public class ISentence {
+
+  protected int pnum;
+  protected int snum;
+  protected ArrayList<IWord> iwords;
+
+  /** Creates a sentence with no numbers set and an empty word list. */
+  public ISentence() {
+    super();
+    this.iwords = new ArrayList<IWord>();
+  }
+
+  /**
+   * Creates an empty sentence.
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   */
+  public ISentence(int pnum, int snum) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = new ArrayList<IWord>();
+  }
+
+  /**
+   * Creates a sentence backed by the given word list (the list is stored as
+   * is, not copied).
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param iwords
+   *          the words of the sentence
+   */
+  public ISentence(int pnum, int snum, ArrayList<IWord> iwords) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = iwords;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  public ArrayList<IWord> getIwords() {
+    return iwords;
+  }
+
+  public void setIwords(ArrayList<IWord> iwords) {
+    this.iwords = iwords;
+  }
+
+  public void addIword(IWord iword) {
+    this.iwords.add(iword);
+  }
+
+  /**
+   * @return the words of the sentence joined by single spaces; the empty
+   *         string when the sentence has no word
+   */
+  @Override
+  public String toString() {
+    StringBuilder sentence = new StringBuilder();
+    for (int i = 0; i < this.iwords.size(); i++) {
+      // Separator goes between words only, so an empty sentence yields ""
+      // instead of failing on substring(1, 0).
+      if (i > 0) {
+        sentence.append(" ");
+      }
+      sentence.append(this.iwords.get(i).toString());
+    }
+    return sentence.toString();
+
+  }
+
+}
Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import opennlp.tools.disambiguator.Constants;
+
+/**
+ * One token of a dataset sentence: either a word (possibly sense-tagged) or a
+ * punctuation mark, together with its position (paragraph, sentence and word
+ * numbers).
+ */
+public class IWord {
+
+  public static enum Type {
+    WORD(1, "word"), PUNCTUATIONMARK(2, "pm");
+
+    public int code;
+    public String type;
+
+    private Type(int code, String type) {
+      this.code = code;
+      this.type = type;
+    }
+  }
+
+  protected int pnum;
+  protected int snum;
+  protected int wnum;
+
+  // Type refers to the type of word in the sentence
+  protected Type type;
+
+  protected String word;
+  protected String cmd;
+  protected String pos;
+  protected String lemma;
+  protected String wnsn;
+  protected String lexsn;
+
+  public IWord() {
+    super();
+  }
+
+  /**
+   * Creates a word from its lemma and PoS tag only; the raw word is set to
+   * the lemma.
+   *
+   * @param lemma
+   *          The lemma of the word
+   * @param pos
+   *          The PoS Tag of the word
+   */
+  public IWord(String lemma, String pos) {
+    super();
+    this.word = lemma;
+    this.lemma = lemma;
+    this.pos = pos;
+  }
+
+  /**
+   * This serves to create a DISAMBIGUATED word instance
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          the type in this case is {@link Type#WORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   * @param lemma
+   *          The lemma of the word
+   * @param wnsn
+   *          The integer sense number corresponding to the WordNet output
+   *          display
+   * @param lexsn
+   *          The "Sense_key" that indicates the WordNet sense to which word
+   *          should be linked
+   *
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos, String lemma, String wnsn, String lexsn) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.wnum = wnum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+    this.lemma = lemma;
+    this.wnsn = wnsn;
+    this.lexsn = lexsn;
+  }
+
+  /**
+   * This serves to create a NON DISAMBIGUATED word instance
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          the type in this case is {@link Type#WORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   *
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos) {
+    super();
+    this.wnum = wnum;
+    this.pnum = pnum;
+    this.snum = snum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+  }
+
+  /**
+   * This serves to create a punctuation mark instance
+   *
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the punctuation mark in the sentence
+   * @param type
+   *          The type as in {@link Type}
+   * @param word
+   *          The punctuation mark, as it appears in the sentence
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    // The position was previously accepted but silently dropped.
+    this.wnum = wnum;
+    this.type = type;
+    this.word = word;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  public int getWnum() {
+    return wnum;
+  }
+
+  public void setWnum(int wnum) {
+    this.wnum = wnum;
+  }
+
+  public String getWord() {
+    return word;
+  }
+
+  public void setWord(String word) {
+    this.word = word;
+  }
+
+  public Type getType() {
+    return type;
+  }
+
+  public void setType(Type type) {
+    this.type = type;
+  }
+
+  public String getCmd() {
+    return cmd;
+  }
+
+  public void setCmd(String cmd) {
+    this.cmd = cmd;
+  }
+
+  public String getPos() {
+    return pos;
+  }
+
+  public void setPos(String pos) {
+    this.pos = pos;
+  }
+
+  public String getLemma() {
+    return lemma;
+  }
+
+  public void setLemma(String lemma) {
+    this.lemma = lemma;
+  }
+
+  public String getWnsn() {
+    return wnsn;
+  }
+
+  public void setWnsn(String wnsn) {
+    this.wnsn = wnsn;
+  }
+
+  public String getLexsn() {
+    return lexsn;
+  }
+
+  public void setLexsn(String lexsn) {
+    this.lexsn = lexsn;
+  }
+
+  @Override
+  public String toString() {
+    return this.word;
+  }
+
+  /**
+   * Two words are equal when they have the same lemma and the same part of
+   * speech or, when at least one of them has no lemma, the same raw word and
+   * the same part-of-speech abbreviation. Parts of speech that cannot be
+   * resolved (e.g., punctuation marks, which carry no PoS tag) compare as
+   * equal to each other instead of raising a NullPointerException.
+   * <p>
+   * Note that {@code hashCode()} is not overridden by this class.
+   *
+   * @param oword
+   *          the object to compare with
+   * @return {@code true} if both words are considered the same word
+   */
+  @Override
+  public boolean equals(Object oword) {
+
+    if (!(oword instanceof IWord))
+      return false;
+    if (oword == this)
+      return true;
+
+    IWord iword = (IWord) oword;
+
+    if (this.lemma != null && iword.getLemma() != null) {
+      return iword.getLemma().equals(this.getLemma())
+          && sameOrBothNull(Constants.getPOS(iword.getPos()),
+              Constants.getPOS(this.getPos()));
+    }
+
+    return sameOrBothNull(this.word, iword.getWord())
+        && sameOrBothNull(Constants.getPOSabbreviation(this.getPos()),
+            Constants.getPOSabbreviation(iword.getPos()));
+  }
+
+  /**
+   * Checks whether this word is a sense-tagged instance of a given word tag.
+   *
+   * @param wordTag
+   *          the word tag in the form "lemma.tag", where tag is the
+   *          part-of-speech abbreviation
+   * @return {@code true} if the lemma and the part-of-speech abbreviation
+   *         match and this word has a sense key; {@code false} otherwise,
+   *         including when {@code wordTag} is {@code null} or has no "."
+   */
+  public boolean isInstanceOf(String wordTag) {
+
+    if (wordTag == null || this.lemma == null || this.lexsn == null) {
+      return false;
+    }
+
+    String[] parts = wordTag.split("\\.");
+    if (parts.length < 2) {
+      // malformed tag: nothing can match it
+      return false;
+    }
+
+    // tag is null for PoS tags without an abbreviation, hence the operand
+    // order of the second comparison
+    String tag = Constants.getPOSabbreviation(this.getPos());
+
+    return this.lemma.equals(parts[0]) && parts[1].equals(tag);
+  }
+
+  /**
+   * Checks whether another word has the same lemma, the same part of speech
+   * and the same sense key as this one.
+   *
+   * @param oword
+   *          the object to compare with
+   * @return {@code true} if {@code oword} is this very object, or an
+   *         {@link IWord} with the same lemma, part of speech and sense key;
+   *         {@code false} otherwise, including when a lemma or a sense key is
+   *         missing on either side
+   */
+  public boolean senseEquals(Object oword) {
+
+    if (!(oword instanceof IWord))
+      return false;
+    if (oword == this)
+      return true;
+
+    IWord iword = (IWord) oword;
+
+    if (iword.getLemma() == null || iword.getLexsn() == null) {
+      // no lemma or no sense to compare with
+      return false;
+    }
+
+    return iword.getLemma().equals(this.getLemma())
+        && sameOrBothNull(Constants.getPOS(iword.getPos()),
+            Constants.getPOS(this.getPos()))
+        && iword.getLexsn().equals(this.getLexsn());
+  }
+
+  /** Null-safe equality: two nulls are equal, a null never equals a value. */
+  private static boolean sameOrBothNull(Object first, Object second) {
+    return first == null ? second == null : first.equals(second);
+  }
+
+}
Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Reader for the Semcor corpus: it parses one Semcor XML file into
+ * {@link ISentence}s and collects, from one file, one folder or the whole
+ * corpus, the sense-tagged instances of a given word.
+ */
+public class SemcorReaderExtended {
+
+  private static final String ELEMENT_CONTEXTFILE = "contextfile";
+  private static final String ATTRIBUTE_CONCORDANCE = "concordance";
+
+  private static final String ELEMENT_CONTEXT = "context";
+  private static final String ATTRIBUTE_FILENAME = "filename";
+  private static final String ATTRIBUTE_PARAS = "paras";
+
+  private static final String ELEMENT_PARAGRAPH = "p";
+  private static final String ATTRIBUTE_PARAGRAPHNUM = "pnum";
+
+  private static final String ELEMENT_SENTENCE = "s";
+  private static final String ATTRIBUTE_SENTENCENUM = "snum";
+
+  private static final String ELEMENT_WORDFORM = "wf";
+  private static final String ATTRIBUTE_CMD = "cmd";
+  private static final String ATTRIBUTE_RDF = "rdf";
+  private static final String ATTRIBUTE_POS = "pos";
+  private static final String ATTRIBUTE_LEMMA = "lemma";
+  private static final String ATTRIBUTE_WNSN = "wnsn";
+  private static final String ATTRIBUTE_LEXSN = "lexsn";
+
+  private static final String ELEMENT_PUNCTUATION = "punc";
+
+  // Built with the platform separator: identical to the former hard-coded
+  // "\\" paths on Windows, and usable on other platforms as well.
+  private static String path = "src" + File.separator + "test"
+      + File.separator + "resources" + File.separator + "semcor3.0"
+      + File.separator;
+  private static String[] folders = { "brown1", "brown2", "brownv" };
+  private static String tagfiles = File.separator + "tagfiles"
+      + File.separator;
+
+  public SemcorReaderExtended() {
+    super();
+  }
+
+  /**
+   * This serves to read one Semcor XML file
+   *
+   * @param file
+   *          the path of the Semcor XML file
+   * @return the sentences of the file, in document order; on a parsing error
+   *         the stack trace is printed and the sentences read so far are
+   *         returned
+   */
+  public ArrayList<ISentence> readFile(String file) {
+
+    ArrayList<ISentence> result = new ArrayList<ISentence>();
+
+    try {
+
+      File xmlFile = new File(file);
+      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+      Document doc = dBuilder.parse(xmlFile);
+
+      doc.getDocumentElement().normalize();
+
+      NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);
+
+      for (int i = 0; i < paragraphs.getLength(); i++) {
+
+        Node nParagraph = paragraphs.item(i);
+        if (nParagraph.getNodeType() != Node.ELEMENT_NODE) {
+          continue;
+        }
+
+        Element eParagraph = (Element) nParagraph;
+        // THE PARAGRAPH ID
+        int paragraphID = Integer.parseInt(eParagraph
+            .getAttribute(ATTRIBUTE_PARAGRAPHNUM));
+
+        NodeList nSentences = nParagraph.getChildNodes();
+
+        // Start at the first child: non-element nodes (e.g. whitespace text)
+        // are filtered by the node-type test, so nothing has to be skipped
+        // blindly and a leading sentence element is never lost.
+        for (int j = 0; j < nSentences.getLength(); j++) {
+          Node nSentence = nSentences.item(j);
+          if (nSentence.getNodeType() == Node.ELEMENT_NODE) {
+            result.add(readSentence((Element) nSentence, paragraphID));
+          }
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return result;
+  }
+
+  /** Converts one sentence element and its word/punctuation children. */
+  private ISentence readSentence(Element eSentence, int paragraphID) {
+
+    // THE SENTENCE ID
+    int sentenceID = Integer.parseInt(eSentence
+        .getAttribute(ATTRIBUTE_SENTENCENUM));
+    ISentence isentence = new ISentence(paragraphID, sentenceID);
+
+    NodeList nWords = eSentence.getChildNodes();
+
+    int wnum = 0;
+    for (int k = 0; k < nWords.getLength(); k++) {
+      Node nWord = nWords.item(k);
+
+      if (nWord.getNodeType() != Node.ELEMENT_NODE) {
+        continue;
+      }
+
+      if (nWord.getNodeName().equals(ELEMENT_WORDFORM)) {
+
+        Element eWord = (Element) nWord;
+        String word = eWord.getTextContent();
+        String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
+        String pos = eWord.getAttribute(ATTRIBUTE_POS);
+
+        IWord iword;
+        if (cmd.equals("done")) {
+          // if the word is already disambiguated
+          String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
+          String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
+          String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
+          iword = new IWord(paragraphID, sentenceID, wnum, IWord.Type.WORD,
+              word, cmd, pos, lemma, wnsn, lexsn);
+        } else {
+          // if the word is not disambiguated
+          iword = new IWord(paragraphID, sentenceID, wnum, IWord.Type.WORD,
+              word, cmd, pos);
+        }
+        isentence.addIword(iword);
+        wnum++;
+
+      } else if (nWord.getNodeName().equals(ELEMENT_PUNCTUATION)) {
+        Element eWord = (Element) nWord;
+        String word = eWord.getTextContent();
+        IWord iword = new IWord(paragraphID, sentenceID, wnum,
+            IWord.Type.PUNCTUATIONMARK, word);
+        isentence.addIword(iword);
+        wnum++;
+      }
+    }
+
+    return isentence;
+  }
+
+  /**
+   * One Semcor file reader: this reads one Semcor file and returns all the
+   * sense-tagged instances of a specific word. The context of each instance
+   * is a window of up to three sentences: the sentence holding the word plus
+   * its neighbours (the two following ones for the first sentence, the two
+   * preceding ones for the last sentence, one on each side otherwise).
+   *
+   * @param file
+   *          the path of the Semcor XML file
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances
+   */
+  public ArrayList<WordToDisambiguate> getSemcorOneFileData(String file,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+    try {
+
+      ArrayList<ISentence> isentences = readFile(file);
+      int last = isentences.size() - 1;
+
+      for (int j = 0; j < isentences.size(); j++) {
+        ArrayList<IWord> iwords = isentences.get(j).getIwords();
+        for (int k = 0; k < iwords.size(); k++) {
+          IWord iword = iwords.get(k);
+          if (!iword.isInstanceOf(wordTag)) {
+            continue;
+          }
+
+          // Window bounds, clamped so that files with fewer than three
+          // sentences do not index outside of the list.
+          int from;
+          int to;
+          if (j == 0) {
+            // case of the first sentence, we consider the current sentence
+            // and the next two ones
+            from = 0;
+            to = Math.min(2, last);
+          } else if (j == last) {
+            // case of the last sentence, we consider the current sentence and
+            // the previous two ones
+            from = Math.max(0, j - 2);
+            to = j;
+          } else {
+            // case of a sentence in the middle, we consider the previous
+            // sentence + the current one + the next one
+            from = j - 1;
+            to = j + 1;
+          }
+
+          StringBuilder sentence = new StringBuilder();
+          // position of the word inside the window: its position in its own
+          // sentence shifted by the sizes of the sentences placed before it
+          int index = k;
+          for (int s = from; s <= to; s++) {
+            if (s > from) {
+              sentence.append(" ");
+            }
+            sentence.append(isentences.get(s).toString());
+            if (s < j) {
+              index += isentences.get(s).getIwords().size();
+            }
+          }
+
+          ArrayList<String> senses = new ArrayList<String>();
+          String sense = iword.getLexsn();
+          // DOM getAttribute() reports an absent attribute as "", so an empty
+          // sense key means "no sense" just like null does
+          if (sense != null && !sense.isEmpty()) {
+            senses.add(sense);
+          }
+
+          if (!senses.isEmpty()) {
+            WordToDisambiguate wtd = new WordToDisambiguate(sentence
+                .toString().split("\\s"), index, senses);
+            setInstances.add(wtd);
+          }
+        }
+
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return setInstances;
+
+  }
+
+  /**
+   * One Semcor folder reader: This reads all the files in one semcor folder,
+   * and return all the instances in the format {@link WordToDisambiguate} of a
+   * specific word
+   *
+   * @param folder
+   *          the name of the folder. Three folders exist in Semcor3.0, which
+   *          are ["brown1", "brown2", "brownv"]
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances; empty when
+   *         the folder does not exist or cannot be listed
+   */
+  public ArrayList<WordToDisambiguate> getSemcorFolderData(String folder,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    String directory = path + folder + tagfiles;
+    File tempFolder = new File(directory);
+
+    if (tempFolder.isDirectory()) {
+      // listFiles() returns null when the directory cannot be read
+      File[] listOfFiles = tempFolder.listFiles();
+      if (listOfFiles != null) {
+        for (File file : listOfFiles) {
+          // sub-directories are not Semcor documents
+          if (file.isFile()) {
+            ArrayList<WordToDisambiguate> list = getSemcorOneFileData(
+                directory + file.getName(), wordTag);
+            result.addAll(list);
+          }
+        }
+      }
+    }
+
+    return result;
+
+  }
+
+  /**
+   * Semcor reader: This reads all the files in semcor, and return all the
+   * instances in the format {@link WordToDisambiguate} of a specific word
+   *
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances of the word to
+   *         disambiguate
+   */
+  public ArrayList<WordToDisambiguate> getSemcorData(String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    for (String folder : folders) {
+      ArrayList<WordToDisambiguate> list = getSemcorFolderData(folder, wordTag);
+      result.addAll(list);
+    }
+
+    return result;
+
+  }
+
+}
Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+/**
+ * This class handles the extraction of Senseval-3 data from the different files
+ * (training data, dictionary instances, etc.)
+ */
+ public class SensevalReader {
+
+   // NOTE(review): these paths use Windows-style separators and are relative
+   // to the working directory -- confirm before running on another platform.
+   private String resourcesFolder = "src\\test\\resources\\";
+   protected String sensevalDirectory = resourcesFolder + "senseval3\\";
+
+   protected String data = sensevalDirectory + "EnglishLS.train";
+   protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+   protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+
+   // protected String dict = sensevalDirectory + "EnglishLS.dictionary.xml";
+   // protected String map = sensevalDirectory + "EnglishLS.sensemap";
+
+   /**
+    * The XML file of Senseval presents some issues that need to be fixed
+    * first. Not implemented yet: currently always returns {@code null}.
+    */
+   private String fixXmlFile() {
+
+     // TODO fix this !
+
+     return null;
+   }
+
+   public SensevalReader() {
+     super();
+   }
+
+   /**
+    * This extracts the equivalent senses. This serves in the case of the
+    * coarse-grained disambiguation. The senses are read from the file
+    * referenced by {@code sensemapFile}, one set of equivalent senses per
+    * line.
+    *
+    * @return a {@link HashMap} containing the new sense ID ({@link Integer},
+    *         the zero-based line number) and an {@link ArrayList} of the
+    *         equivalent senses original IDs. If reading fails, the stack
+    *         trace is printed and the senses read so far are returned.
+    */
+   public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
+
+     HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();
+
+     try (BufferedReader reader = new BufferedReader(new FileReader(
+         sensemapFile))) {
+
+       int index = 0;
+
+       String line;
+
+       while ((line = reader.readLine()) != null) {
+
+         ArrayList<String> tempSenses = new ArrayList<String>();
+
+         for (String sense : line.split("\\s")) {
+           // Tokens of at most one character are dropped.
+           if (sense.length() > 1) {
+             tempSenses.add(sense);
+           }
+         }
+
+         mappedSenses.put(index, tempSenses);
+         index++;
+
+       }
+
+     } catch (IOException e) {
+       e.printStackTrace();
+     }
+
+     return mappedSenses;
+
+   }
+
+   /**
+    * This returns the list of words available in the Senseval data. A word is
+    * the first whitespace-delimited token of a line of the file referenced by
+    * {@code wordList}; blank lines are ignored and each word is reported once,
+    * in order of first appearance.
+    *
+    * @return {@link ArrayList} of the words available on the current Senseval
+    *         set. If reading fails, the stack trace is printed and the words
+    *         read so far are returned.
+    */
+   public ArrayList<String> getSensevalWords() {
+
+     ArrayList<String> wordTags = new ArrayList<String>();
+
+     try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {
+
+       String line;
+
+       while ((line = br.readLine()) != null) {
+
+         // A whitespace-only line splits into an empty array, so indexing
+         // [0] on it would throw; trimming first also keeps the empty
+         // string from being reported as a word.
+         String trimmed = line.trim();
+         if (trimmed.isEmpty()) {
+           continue;
+         }
+
+         String word = trimmed.split("\\s")[0];
+
+         if (!wordTags.contains(word)) {
+           wordTags.add(word);
+         }
+
+       }
+
+     } catch (IOException e) {
+       e.printStackTrace();
+     }
+
+     return wordTags;
+
+   }
+
+   /**
+    * Main Senseval Reader: This parses the XML file referenced by
+    * {@code data}, looks for the {@code lexelt} elements whose {@code item}
+    * attribute equals the given word tag, and extracts one
+    * {@link WordToDisambiguate} instance per child element.
+    *
+    * @param wordTag
+    *          The word, of which we are looking for the instances, as it
+    *          appears in the {@code item} attribute (e.g. "activate.v")
+    * @return the list of the {@link WordToDisambiguate} instances of the word
+    *         to disambiguate. If parsing fails, the stack trace is printed
+    *         and the instances extracted so far are returned.
+    */
+   public ArrayList<WordToDisambiguate> getSensevalData(String wordTag) {
+
+     ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+     try {
+
+       File xmlFile = new File(data);
+       DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+       DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+       Document doc = dBuilder.parse(xmlFile);
+
+       doc.getDocumentElement().normalize();
+
+       NodeList lexelts = doc.getElementsByTagName("lexelt");
+
+       for (int i = 0; i < lexelts.getLength(); i++) {
+
+         Node nLexelt = lexelts.item(i);
+
+         if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
+           continue;
+         }
+
+         Element eLexelt = (Element) nLexelt;
+
+         if (!eLexelt.getAttribute("item").equals(wordTag)) {
+           continue;
+         }
+
+         // The part before the first dot of the item attribute is the word.
+         String word = eLexelt.getAttribute("item").split("\\.")[0];
+
+         NodeList nInstances = nLexelt.getChildNodes();
+
+         // Start at 0: non-element nodes are filtered by type, so no child
+         // has to be skipped by position.
+         for (int j = 0; j < nInstances.getLength(); j++) {
+
+           Node nInstance = nInstances.item(j);
+
+           if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
+             setInstances.add(readInstance((Element) nInstance, word));
+           }
+         }
+
+       }
+
+     } catch (Exception e) {
+       e.printStackTrace();
+     }
+
+     return setInstances;
+
+   }
+
+   /**
+    * Builds one {@link WTDIMS} from an instance element: collects the
+    * {@code senseid} of every {@code answer} child, and takes the text and the
+    * {@code head} word of the {@code context} child.
+    *
+    * @param eInstance
+    *          the instance element
+    * @param word
+    *          the word the instance belongs to
+    * @return the extracted instance
+    */
+   private WTDIMS readInstance(Element eInstance, String word) {
+
+     ArrayList<String> answers = new ArrayList<String>();
+     String sentence = "";
+     String rawWord = "";
+
+     NodeList nChildren = eInstance.getChildNodes();
+
+     for (int k = 0; k < nChildren.getLength(); k++) {
+
+       Node nChild = nChildren.item(k);
+
+       if (nChild.getNodeType() != Node.ELEMENT_NODE) {
+         continue;
+       }
+
+       Element eChild = (Element) nChild;
+
+       if (eChild.getNodeName().equals("answer")) {
+         // Looked up by name: the order of a NamedNodeMap is unspecified,
+         // so a positional item(1) is not guaranteed to be the sense id.
+         if (eChild.hasAttribute("senseid")) {
+           answers.add(eChild.getAttribute("senseid"));
+         }
+       } else if (eChild.getNodeName().equals("context")) {
+         sentence = eChild.getTextContent();
+
+         // Looked up by tag: the head element is not necessarily the second
+         // child (e.g. when the context starts with the head word).
+         NodeList heads = eChild.getElementsByTagName("head");
+         if (heads.getLength() > 0) {
+           rawWord = heads.item(0).getTextContent();
+         }
+       }
+
+     }
+
+     return new WTDIMS(word, answers, sentence, rawWord);
+   }
+
+ }
Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java Tue Jul 28 09:16:25 2015
@@ -44,17 +44,13 @@ import opennlp.tools.disambiguator.ims.W
* check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details
* about this approach
*/
-
public class FeaturesExtractor {
- /**
- * Constructor
- */
public FeaturesExtractor() {
super();
}
- // IMS approach
+ // IMS
private String[] extractPosOfSurroundingWords(String[] sentence,
int wordIndex, int windowSize) {
@@ -230,4 +226,4 @@ public class FeaturesExtractor {
// SST approach
-}
\ No newline at end of file
+}
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java Tue Jul 28 09:16:25 2015
@@ -30,6 +30,7 @@ import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;
import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
import opennlp.tools.lemmatizer.SimpleLemmatizer;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
@@ -44,7 +45,7 @@ import opennlp.tools.util.InvalidFormatE
public class Loader {
- private static DataExtractor dExtractor = new DataExtractor();
+ private static SensevalReader dExtractor = new SensevalReader();
private static String modelsDir = "src\\test\\resources\\models\\";
@@ -64,7 +65,6 @@ public class Loader {
private static HashMap<String, Object> englishWords;
- // Constructor
public Loader() {
super();
load();
@@ -102,8 +102,8 @@ public class Loader {
public static HashMap<String, Object> getEnglishWords() {
if (englishWords == null || englishWords.keySet().isEmpty()) {
- englishWords = dExtractor.getEnglishWords(modelsDir
- + "en-lemmatizer.dict");
+ englishWords = Constants
+ .getEnglishWords(modelsDir + "en-lemmatizer.dict");
}
return englishWords;
}
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java Tue Jul 28 09:16:25 2015
@@ -20,6 +20,7 @@
package opennlp.tools.disambiguator;
import java.util.ArrayList;
+import java.util.Arrays;
import net.sf.extjwnl.data.POS;
@@ -53,6 +54,22 @@ public class WordToDisambiguate {
this.sense = sense;
}
+ /**
+  * Creates an instance from a tokenized sentence, the index of the word to
+  * disambiguate and its sense IDs. The POS tags are obtained by calling
+  * {@code PreProcessor.tag(sentence)}.
+  *
+  * @param sentence
+  *          the tokens of the sentence
+  * @param wordIndex
+  *          the index, in {@code sentence}, of the word to disambiguate
+  * @param senseIDs
+  *          the sense IDs of the word
+  * @throws IllegalArgumentException
+  *           if {@code wordIndex} is negative or not smaller than the
+  *           sentence length
+  */
+ public WordToDisambiguate(String[] sentence, int wordIndex,
+     ArrayList<String> senseIDs) throws IllegalArgumentException {
+   super();
+
+   // Valid indices are 0 .. sentence.length - 1; an index equal to the
+   // length is already out of bounds.
+   if (wordIndex < 0 || wordIndex >= sentence.length) {
+     throw new IllegalArgumentException("The index is out of bounds !");
+   }
+
+   this.sentence = sentence;
+   this.posTags = PreProcessor.tag(sentence);
+
+   this.wordIndex = wordIndex;
+
+   this.senseIDs = senseIDs;
+ }
+
public WordToDisambiguate(String[] sentence, int wordIndex) {
this(sentence, wordIndex, -1);
}
@@ -66,7 +83,7 @@ public class WordToDisambiguate {
this.sense = -1;
}
-
+
// Sentence
public String[] getSentence() {
return sentence;
@@ -97,15 +114,17 @@ public class WordToDisambiguate {
String ref = "";
- if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
- ref = wordBaseForm + ".v";
- } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
- ref = wordBaseForm + ".n";
- } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADJECTIVE)) {
- ref = wordBaseForm + ".a";
- } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
- ref = wordBaseForm + ".r";
- } else {
+ if ((Constants.getPOS(this.posTags[wordIndex]) != null)) {
+ if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
+ ref = wordBaseForm + ".v";
+ } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
+ ref = wordBaseForm + ".n";
+ } else if (Constants.getPOS(this.posTags[wordIndex])
+ .equals(POS.ADJECTIVE)) {
+ ref = wordBaseForm + ".a";
+ } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
+ ref = wordBaseForm + ".r";
+ }
}
@@ -147,5 +166,11 @@ public class WordToDisambiguate {
public String toString() {
return (wordIndex + "\t" + getWord() + "\n" + sentence);
}
+
+ /**
+  * Prints, through {@code Constants.print}, the sentence, the index of the
+  * word, the word itself and its first sense ID ({@code null} when no sense
+  * ID is available).
+  */
+ public void print() {
+   // Guard the lookup: get(0) throws when the list is missing or empty.
+   Constants.print("Sentence: " + Arrays.asList(sentence) + "\n" +
+       "Index: " + wordIndex + "\n" +
+       "Word: "+ getWord() + "\n" +
+       "Sense ID: "
+       + (senseIDs == null || senseIDs.isEmpty() ? null : senseIDs.get(0)));
+ }
}
-
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java Tue Jul 28 09:16:25 2015
@@ -41,24 +41,25 @@ import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidParameterException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.zip.GZIPInputStream;
-import opennlp.tools.disambiguator.DictionaryInstance;
+import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.data.Word;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.disambiguator.Constants;
-import opennlp.tools.disambiguator.DataExtractor;
import opennlp.tools.disambiguator.FeaturesExtractor;
-import opennlp.tools.disambiguator.PreProcessor;
import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.DatasetsReader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
/**
* Implementation of the <b>It Makes Sense</b> approach originally proposed in
@@ -80,85 +81,108 @@ public class IMS implements WSDisambigua
private final IMSContextGenerator cg;
private FeaturesExtractor fExtractor = new FeaturesExtractor();
- private DataExtractor dExtractor = new DataExtractor();
+ /**
+ * Sets the input parameters to the default ones
+ *
+ * @throws InvalidParameterException
+ */
public IMS() {
super();
+ // Loader loader = new Loader();
this.parameters = new IMSParameters();
- ;
this.cg = parameters.createContextGenerator();
}
+ /**
+ * Initializes the loader object and sets the input parameters
+ *
+ * @param parameters
+ * The parameters to be used
+ * @throws InvalidParameterException
+ */
public IMS(IMSParameters parameters) {
super();
this.parameters = parameters;
this.cg = this.parameters.createContextGenerator();
}
- // Internal Methods
- private String getTrainingFileName(WTDIMS wtd) {
-
- String wordBaseForm = PreProcessor
- .lemmatize(wtd.getWord(), wtd.getPosTag());
+ /**
+  * Gives access to the parameter settings currently held by this IMS object.
+  *
+  * @return the parameter settings
+  */
+ @Override
+ public WSDParameters getParams() {
+   return parameters;
+ }
- String ref = "";
+ /**
+  * Gives access to the parameter settings currently held by this IMS object,
+  * typed as {@link IMSParameters}.
+  *
+  * @return the parameter settings
+  */
+ public IMSParameters getParameters() {
+   return parameters;
+ }
- if (Constants.getPOS(wtd.getPosTag()).equals(POS.VERB)) {
- ref = wordBaseForm + ".v";
- } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.NOUN)) {
- ref = wordBaseForm + ".n";
- } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.ADJECTIVE)) {
- ref = wordBaseForm + ".a";
- } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.ADVERB)) {
- ref = wordBaseForm + ".r";
+ /**
+ * If the parameters are null, set the default ones. Otherwise, only set them
+ * if they valid. Invalid parameters will return a exception (and set the
+ * parameters to the default ones)
+ *
+ * @param Input
+ * parameters
+ * @throws InvalidParameterException
+ */
+ @Override
+ public void setParams(WSDParameters parameters)
+ throws InvalidParameterException {
+ if (parameters == null) {
+ this.parameters = new IMSParameters();
} else {
-
+ if (parameters.isValid()) {
+ this.parameters = (IMSParameters) parameters;
+ } else {
+ this.parameters = new IMSParameters();
+ throw new InvalidParameterException("wrong parameters");
+ }
}
- return ref;
}
- private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingData,
- String wordTag) {
-
- ArrayList<String> surrWords = fExtractor
- .extractTrainingSurroundingWords(trainingData);
-
- File file = new File(parameters.getTrainingDataDirectory() + wordTag
- + ".sw");
- if (!file.exists()) {
- try {
-
- file.createNewFile();
-
- FileWriter fw = new FileWriter(file.getAbsoluteFile());
- BufferedWriter bw = new BufferedWriter(fw);
-
- for (String surrWord : surrWords) {
- bw.write(surrWord);
- bw.newLine();
- }
-
- bw.close();
-
- System.out.println("Done");
-
- } catch (IOException e) {
- e.printStackTrace();
+ /**
+ * If the parameters are null, set the default ones. Otherwise, only set them
+ * if they valid. Invalid parameters will return a exception (and set the
+ * parameters to the default ones)
+ *
+ * @param Input
+ * parameters
+ * @throws InvalidParameterException
+ */
+ public void setParams(IMSParameters parameters)
+ throws InvalidParameterException {
+ if (parameters == null) {
+ this.parameters = new IMSParameters();
+ } else {
+ if (parameters.isValid()) {
+ this.parameters = parameters;
+ } else {
+ this.parameters = new IMSParameters();
+ throw new InvalidParameterException("wrong parameters");
}
-
}
-
}
+ // Internal Methods
private ArrayList<String> getAllSurroundingWords(String wordTag) {
ArrayList<String> surrWords = new ArrayList<String>();
BufferedReader br = null;
- File file = new File(parameters.getTrainingDataDirectory() + wordTag
- + ".sw");
+ File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
if (file.exists()) {
@@ -191,47 +215,47 @@ public class IMS implements WSDisambigua
}
- private ArrayList<WTDIMS> extractTrainingData(String wordTrainingXmlFile,
- HashMap<String, ArrayList<DictionaryInstance>> senses) {
+ private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingInstances,
+ String wordTag) {
- /**
- * word tag has to be in the format "word.t" (e.g., "activate.v", "smart.a",
- * etc.)
- */
-
- ArrayList<WTDIMS> trainingData = dExtractor
- .extractWSDInstances(wordTrainingXmlFile);
-
- for (WTDIMS word : trainingData) {
- for (String senseId : word.getSenseIDs()) {
- for (String dictKey : senses.keySet()) {
- for (DictionaryInstance instance : senses.get(dictKey)) {
- if (senseId.equals(instance.getId())) {
- word.setSense(Integer.parseInt(dictKey.split("_")[1]));
- break;
- }
- }
- }
+ ArrayList<String> surrWords = fExtractor
+ .extractTrainingSurroundingWords(trainingInstances);
+
+ File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
+ if (!file.exists()) {
+
+ try {
+ file.createNewFile();
+ } catch (IOException e) {
+ System.out
+ .println("Unable to create the List of Surrounding Words file !");
}
}
- return trainingData;
- }
+ try {
+ FileWriter fw = new FileWriter(file.getAbsoluteFile());
+ BufferedWriter bw = new BufferedWriter(fw);
- private void extractFeature(WTDIMS word) {
+ for (String surrWord : surrWords) {
+ bw.write(surrWord);
+ bw.newLine();
+ }
- fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
- this.parameters.getNgram());
+ bw.close();
+ } catch (IOException e) {
+ System.out
+ .println("Unable to create the List of Surrounding Words file !");
+ e.printStackTrace();
+ }
- }
+ System.out.println("Done");
- private HashMap<String, String> getWordDictionaryInstance(WTDIMS wtd) {
+ }
- String dict = parameters.getDict();
- String map = parameters.getMap();
+ private void extractFeature(WTDIMS word) {
- return dExtractor.getDictionaryInstance(dict, map,
- this.getTrainingFileName(wtd));
+ fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
+ this.parameters.getNgram());
}
@@ -240,19 +264,37 @@ public class IMS implements WSDisambigua
String word = wordToDisambiguate.getRawWord();
POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
- WordPOS wordPOS = new WordPOS(word, pos);
+ if (pos != null) {
- ArrayList<Synset> synsets = wordPOS.getSynsets();
+ WordPOS wordPOS = new WordPOS(word, pos);
- int size = synsets.size();
+ ArrayList<Synset> synsets = wordPOS.getSynsets();
- String[] senses = new String[size];
+ int size = synsets.size();
- for (int i = 0; i < size; i++) {
- senses[i] = synsets.get(i).getGloss();
- }
+ String[] senses = new String[size];
- return senses;
+ for (int i = 0; i < size; i++) {
+ String senseKey = null;
+ for (Word wd : synsets.get(i).getWords()) {
+ if (wd.getLemma().equals(
+ wordToDisambiguate.getRawWord().split("\\.")[0])) {
+ try {
+ senseKey = wd.getSenseKey();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ senses[i] = senseKey;
+ break;
+ }
+ }
+
+ }
+ return senses;
+ } else {
+ System.out.println("The word has no definitions in WordNet !");
+ return null;
+ }
}
@@ -260,78 +302,60 @@ public class IMS implements WSDisambigua
* Method for training a model
*
* @param wordTag
- * : the word to disambiguate. It should be written in the format
+ * the word to disambiguate. It should be written in the format
* "word.p" (Exp: "write.v", "well.r", "smart.a", "go.v"
* @param trainParams
- * : the parameters used for training
+ * the parameters used for training
+ * @param trainingInstances
+ * the training data in the format {@link WTDIMS}
*/
- public void train(String wordTag, TrainingParameters trainParams) {
-
- String dict = parameters.getDict();
- String map = parameters.getMap();
+ public void train(String wordTag, TrainingParameters trainParams,
+ ArrayList<WTDIMS> trainingInstances) {
- String wordTrainingxmlFile = parameters.getRawDataDirectory() + wordTag
- + ".xml";
- String wordTrainingbinFile = parameters.getTrainingDataDirectory()
- + wordTag + ".gz";
-
- File bf = new File(wordTrainingxmlFile);
+ String wordTrainingbinFile = IMSParameters.trainingDataDirectory + wordTag
+ + ".gz";
ObjectStream<Event> IMSes = null;
- if (bf.exists() && !bf.isDirectory()) {
-
- HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
- .extractWordSenses(dict, map, wordTag);
-
- ArrayList<WTDIMS> instances = extractTrainingData(wordTrainingxmlFile,
- senses);
-
- for (WTDIMS wtd : instances) {
- extractFeature(wtd);
- }
-
- saveAllSurroundingWords(instances, wordTag);
-
- for (WTDIMS wtd : instances) {
- extractFeature(wtd);
- }
+ for (WTDIMS wtd : trainingInstances) {
+ extractFeature(wtd);
+ }
- ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
+ saveAllSurroundingWords(trainingInstances, wordTag);
- for (WTDIMS wtd : instances) {
- fExtractor.serializeIMSFeatures(wtd, surrWords);
- }
+ ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
- ArrayList<Event> events = new ArrayList<Event>();
+ for (WTDIMS wtd : trainingInstances) {
+ fExtractor.serializeIMSFeatures(wtd, surrWords);
+ }
- for (WTDIMS wtd : instances) {
+ ArrayList<Event> events = new ArrayList<Event>();
- int sense = wtd.getSense();
+ for (WTDIMS wtd : trainingInstances) {
- String[] context = cg.getContext(wtd);
+ String sense = wtd.getSenseIDs().get(0);
- Event ev = new Event(sense + "", context);
+ String[] context = cg.getContext(wtd);
- events.add(ev);
+ Event ev = new Event(sense + "", context);
- IMSes = ObjectStreamUtils.createObjectStream(events);
+ events.add(ev);
- }
+ IMSes = ObjectStreamUtils.createObjectStream(events);
- DataIndexer indexer;
- try {
- indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
- MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
- File outFile = new File(wordTrainingbinFile);
- AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
- (AbstractModel) trainedMaxentModel, outFile);
- writer.persist();
+ }
- } catch (IOException e) {
- e.printStackTrace();
- }
+ DataIndexer indexer;
+ try {
+ indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
+ MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
+ File outFile = new File(wordTrainingbinFile);
+ AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
+ (AbstractModel) trainedMaxentModel, outFile);
+ writer.persist();
+ } catch (IOException e) {
+ e.printStackTrace();
}
}
@@ -339,17 +363,17 @@ public class IMS implements WSDisambigua
/**
* Load an existing model
*
- * @param binFile
- * : Location of the already trained model
+ * @param trainedModel
+ * Name of the file of the already trained model
* @return the model trained
*/
- public MaxentModel load(String binFile) {
+ public MaxentModel load(String trainedModel) {
MaxentModel loadedMaxentModel = null;
FileInputStream inputStream;
try {
- inputStream = new FileInputStream(binFile);
+ inputStream = new FileInputStream(trainedModel);
InputStream decodedInputStream = new GZIPInputStream(inputStream);
DataReader modelReader = new PlainTextFileDataReader(decodedInputStream);
loadedMaxentModel = new GISModelReader(modelReader).getModel();
@@ -373,25 +397,29 @@ public class IMS implements WSDisambigua
@Override
public String[] disambiguate(String[] inputText, int inputWordIndex) {
- String rawDataDirectory = this.parameters.getRawDataDirectory();
- String trainingDataDirectory = this.parameters.getTrainingDataDirectory();
+ String trainingDataDirectory = IMSParameters.trainingDataDirectory;
+
+ File file = new File(trainingDataDirectory);
+
+ if (!file.exists()) {
+ file.mkdirs();
+ }
WTDIMS word = new WTDIMS(inputText, inputWordIndex);
fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
this.parameters.getNgram());
- String wordTag = getTrainingFileName(word);
+ String wordTag = word.getWordTag();
- String wordTrainingxmlFile = rawDataDirectory + wordTag + ".xml";
String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
File bf = new File(wordTrainingbinFile);
MaxentModel loadedMaxentModel = null;
String outcome = "";
+
if (bf.exists() && !bf.isDirectory()) {
- // if the model file exists already
- // System.out.println("the model file was found !");
+ // If the trained model exists
ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
fExtractor.serializeIMSFeatures(word, surrWords);
@@ -402,11 +430,40 @@ public class IMS implements WSDisambigua
outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
} else {
- bf = new File(wordTrainingxmlFile);
- if (bf.exists() && !bf.isDirectory()) {
- // if the xml file exists already
- // System.out.println("the xml file was found !");
- train(wordTag, null);
+ // Depending on the source, go fetch the training data
+ ArrayList<WTDIMS> trainingInstances = new ArrayList<WTDIMS>();
+ switch (this.parameters.getSource().code) {
+ case 1: {
+ SemcorReaderExtended sReader = new SemcorReaderExtended();
+ for (WordToDisambiguate ti : sReader.getSemcorData(wordTag)) {
+ WTDIMS imsIT = new WTDIMS(ti);
+ extractFeature(imsIT);
+ trainingInstances.add(imsIT);
+ }
+ break;
+ }
+
+ case 2: {
+ SensevalReader sReader = new SensevalReader();
+ for (WordToDisambiguate ti : sReader.getSensevalData(wordTag)) {
+ WTDIMS imsIT = (WTDIMS) ti;
+ extractFeature(imsIT);
+ trainingInstances.add(imsIT);
+ }
+ break;
+ }
+
+ case 3: {
+ // TODO check the case when the user selects his own data set (make an
+ // interface to collect training data)
+ break;
+ }
+ }
+
+ if (!trainingInstances.isEmpty()) {
+
+ train(wordTag, null, trainingInstances);
+
ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
fExtractor.serializeIMSFeatures(word, surrWords);
@@ -418,21 +475,22 @@ public class IMS implements WSDisambigua
double[] outcomeProbs = loadedMaxentModel.eval(context);
outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
}
+
}
if (!outcome.equals("")) {
- HashMap<String, String> senses = getWordDictionaryInstance(word);
+ // System.out.println("The sense is [" + outcome + "] : " /*+
+ // Loader.getDictionary().getWordBySenseKey(outcome.split("%")[1]).getSynset().getGloss()*/);
- String index = wordTag + "_" + outcome;
+ outcome = "WordNet " + wordTag.split("\\.")[0] + "%" + outcome;
- String[] s = { senses.get(index) };
+ String[] s = { outcome };
return s;
} else {
// if no training data exist
- // System.out.println("No training data available, the MFS is returned !");
String[] s = getMostFrequentSense(word);
return s;
}
@@ -454,17 +512,4 @@ public class IMS implements WSDisambigua
return null;
}
- // TODO fix the conflicts in parameters with Anthony's code
- @Override
- public WSDParameters getParams() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void setParams(WSDParameters params) throws InvalidParameterException {
- // TODO Auto-generated method stub
-
- }
-
}
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java Tue Jul 28 09:16:25 2015
@@ -19,6 +19,8 @@
package opennlp.tools.disambiguator.ims;
+import java.io.File;
+
import opennlp.tools.disambiguator.WSDParameters;
/**
@@ -27,40 +29,63 @@ import opennlp.tools.disambiguator.WSDPa
*/
public class IMSParameters extends WSDParameters {
+ public static enum Source {
+ SEMCOR(1, "semcor"), SEMEVAL(2, "semeval"), OTHER(3, "other");
+
+ public int code;
+ public String src;
+
+ private Source(int code, String src) {
+ this.code = code;
+ this.src = src;
+ }
+ }
+
protected String languageCode;
protected int windowSize;
protected int ngram;
+ protected Source source;
- protected String resourcesFolder = "src\\test\\resources\\supervised\\";
-
- protected String rawDataDirectory = resourcesFolder + "raw\\";
- protected String trainingDataDirectory = resourcesFolder + "models\\";
- protected String dictionaryDirectory = resourcesFolder + "dictionary\\";
-
- protected String dict = dictionaryDirectory + "EnglishLS.dictionary.xml";
- protected String map = dictionaryDirectory + "EnglishLS.sensemap";
-
- public IMSParameters() {
- super();
- this.languageCode = "En";
- this.windowSize = 3;
- this.ngram = 2;
- }
+ public static final String resourcesFolder = "src\\test\\resources\\";
+ public static final String trainingDataDirectory = resourcesFolder
+ + "supervised\\models\\";
/**
+ * This constructor takes only two parameters. The default language used is
+ * <i>English</i>
*
* @param windowSize
- * : the size of the window used for the extraction of the features
+ * the size of the window used for the extraction of the features
* qualified of Surrounding Words
* @param ngram
- * : the number words used for the extraction of features qualified
- * of Local Collocations
+ * the number words used for the extraction of features qualified of
+ * Local Collocations
+ * @param source
+ * the source of the training data
*/
- public IMSParameters(int windowSize, int ngram) {
+ public IMSParameters(int windowSize, int ngram, Source source) {
super();
this.languageCode = "En";
this.windowSize = windowSize;
this.ngram = ngram;
+ this.source = source;
+ this.isCoarseSense = false;
+
+ File folder = new File(trainingDataDirectory);
+ if (!folder.exists())
+ folder.mkdirs();
+ }
+
+ public IMSParameters() {
+ this(3, 2, Source.SEMCOR);
+ }
+
+ public IMSParameters(Source source) {
+ this(3, 2, source);
+ }
+
+ public IMSParameters(int windowSize, int ngram) {
+ this(windowSize, ngram, Source.SEMCOR);
}
public String getLanguageCode() {
@@ -87,52 +112,12 @@ public class IMSParameters extends WSDPa
this.ngram = ngram;
}
- public String getRawDataDirectory() {
- return rawDataDirectory;
- }
-
- public void setRawDataDirectory(String rawDataDirectory) {
- this.rawDataDirectory = rawDataDirectory;
- }
-
- public String getTrainingDataDirectory() {
- return trainingDataDirectory;
- }
-
- public void setTrainingDataDirectory(String trainingDataDirectory) {
- this.trainingDataDirectory = trainingDataDirectory;
- }
-
- public String getDictionaryDirectory() {
- return dictionaryDirectory;
- }
-
- public void setDictionaryDirectory(String dictionaryDirectory) {
- this.dictionaryDirectory = dictionaryDirectory;
- }
-
- public String getDict() {
- return dict;
- }
-
- public void setDict(String dict) {
- this.dict = dict;
- }
-
- public String getMap() {
- return map;
- }
-
- public void setMap(String map) {
- this.map = map;
- }
-
- public String getResourcesFolder() {
- return resourcesFolder;
+ public Source getSource() {
+ return source;
}
- public void setResourcesFolder(String resourcesFolder) {
- this.resourcesFolder = resourcesFolder;
+ public void setSource(Source source) {
+ this.source = source;
}
void init() {
@@ -149,7 +134,7 @@ public class IMSParameters extends WSDPa
@Override
public boolean isValid() {
// TODO Auto-generated method stub
- return false;
+ return true;
}
}
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java Tue Jul 28 09:16:25 2015
@@ -21,6 +21,8 @@ package opennlp.tools.disambiguator.ims;
import java.util.ArrayList;
+import net.sf.extjwnl.data.POS;
+import opennlp.tools.disambiguator.Constants;
import opennlp.tools.disambiguator.PreProcessor;
import opennlp.tools.disambiguator.WordToDisambiguate;
@@ -41,8 +43,8 @@ public class WTDIMS extends WordToDisamb
super(sentence, word);
}
- public WTDIMS(String xmlWord, ArrayList<String> xmlAnswers,
- String xmlSentence, String xmlrawWord) {
+ public WTDIMS(String xmlWord, ArrayList<String> senseIDs, String xmlSentence,
+ String xmlrawWord) {
super();
// this.word = xmlWord;
@@ -57,10 +59,15 @@ public class WTDIMS extends WordToDisamb
}
}
- this.senseIDs = xmlAnswers;
+ this.senseIDs = senseIDs;
}
+ /** Builds a WTDIMS from the sentence, word index, sense and sense IDs of {@code wtd}. */
+ public WTDIMS(WordToDisambiguate wtd) {
+ super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
+ this.senseIDs = wtd.getSenseIDs();
+ }
public String[] getPosOfSurroundingWords() {
return posOfSurroundingWords;
}
@@ -93,4 +100,25 @@ public class WTDIMS extends WordToDisamb
this.features = features;
}
+ /**
+ * Builds the "baseform.pos" tag of this word.
+ *
+ * @return the base form returned by {@code PreProcessor.lemmatize} followed by
+ * ".v", ".n", ".a" or ".r", or an empty string when the POS tag maps
+ * to none of those four categories (or to no POS at all)
+ */
+ public String getWordTag() {
+ String wordBaseForm = PreProcessor.lemmatize(this.getWord(), this.getPosTag());
+ // Look the POS up once; equals() on the constant also covers a null POS.
+ POS pos = Constants.getPOS(this.getPosTag());
+ if (POS.VERB.equals(pos)) {
+ return wordBaseForm + ".v";
+ } else if (POS.NOUN.equals(pos)) {
+ return wordBaseForm + ".n";
+ } else if (POS.ADJECTIVE.equals(pos)) {
+ return wordBaseForm + ".a";
+ } else if (POS.ADVERB.equals(pos)) {
+ return wordBaseForm + ".r";
+ }
+ return "";
+ }
}
Added: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.ims.IMSParameters;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+import org.junit.Test;
+
+/** Runs the IMS disambiguator on the Senseval words and prints each evaluation. */
+public class IMSEvaluatorTest {
+
+ static SensevalReader seReader = new SensevalReader();
+
+ /** JUnit entry point: JUnit 4 only accepts public, non-static, no-arg tests. */
+ @Test
+ public void testEvaluation() {
+ main(new String[0]);
+ }
+
+ /**
+ * Evaluates every non-verb Senseval word with a default-configured IMS and
+ * prints the evaluator's report for each of them; {@code args} is ignored.
+ */
+ public static void main(String[] args) {
+ Constants.print("Evaluation Started");
+
+ IMS ims = new IMS();
+ IMSParameters imsParams = new IMSParameters();
+ ims.setParams(imsParams);
+
+ ArrayList<String> words = seReader.getSensevalWords();
+
+ for (String word : words) {
+ WSDEvaluator evaluator = new WSDEvaluator(ims);
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+ ArrayList<WTDIMS> instances = getTestData(word);
+ if (instances != null) {
+ Constants.print("------------------" + word + "------------------");
+ for (WordToDisambiguate instance : instances) {
+ evaluator.evaluateSample(instance);
+ }
+ Constants.print(evaluator.toString());
+ } else {
+ Constants.print("null instances");
+ }
+ }
+ }
+ }
+
+ /**
+ * For a specific word, return the corresponding Senseval instances in form
+ * of {@link WTDIMS}
+ *
+ * @param wordTag
+ * the word of which the instances are to be collected. wordTag has
+ * to be in the format "word.POS" (e.g., "activate.v", "smart.a",
+ * etc.)
+ * @return list of {@link WTDIMS} instances of the wordTag
+ */
+ protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+ ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+ for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+ instances.add(new WTDIMS(wtd));
+ }
+ return instances;
+ }
+}
Propchange: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain