You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2015/06/24 22:19:05 UTC
svn commit: r1687358 - in /opennlp/sandbox/opennlp-wsd: ./ src/ src/main/
src/main/java/ src/main/java/opennlp/ src/main/java/opennlp/tools/
src/main/java/opennlp/tools/disambiguator/
src/main/java/opennlp/tools/disambiguator/ims/ src/main/java/opennlp...
Author: joern
Date: Wed Jun 24 20:19:05 2015
New Revision: 1687358
URL: http://svn.apache.org/r1687358
Log:
Added initial version of the wsd component. Thanks to Anthony Beylerian and Mondher Bouazizi for the contribution.
Added:
opennlp/sandbox/opennlp-wsd/
opennlp/sandbox/opennlp-wsd/src/
opennlp/sandbox/opennlp-wsd/src/main/
opennlp/sandbox/opennlp-wsd/src/main/java/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/WTDLesk.java
opennlp/sandbox/opennlp-wsd/src/test/
opennlp/sandbox/opennlp-wsd/src/test/java/
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,134 @@
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import net.sf.extjwnl.data.POS;
+
+
+public class Constants {
+
+ public static String osPathChar = "\\";
+
+ // List of all the PoS tags
+ public static String[] allPOS = { "CC", "CD", "DT", "EX", "FW", "IN", "JJ",
+ "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
+ "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB",
+ "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB" };
+
+ // List of the PoS tags of which the senses are to be extracted
+ public static String[] relevantPOS = { "JJ", "JJR", "JJS", "NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" };
+
+
+ // List of Negation Words
+ public static ArrayList<String> negationWords = new ArrayList<String>(
+ Arrays.asList("not", "no", "never", "none", "nor", "non"));
+
+ // List of Stop Words
+ public static ArrayList<String> stopWords = new ArrayList<String>(Arrays.asList( "a", "able", "about", "above", "according", "accordingly", "across", "actually", "after",
+ "afterwards", "again", "against", "ain't", "all", "allow", "allows", "almost", "alone", "along", "already", "also",
+ "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything",
+ "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "aren't", "around", "as", "aside", "ask",
+ "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been",
+ "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both",
+ "brief", "but", "by", "came", "can", "cannot", "cant", "can't", "cause", "causes", "certain", "certainly", "changes", "clearly",
+ "c'mon", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
+ "contains", "corresponding", "could", "couldn't", "course", "c's", "currently", "definitely", "described", "despite", "did", "didn't",
+ "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "during", "each", "edu", "eg", "eight",
+ "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone",
+ "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following",
+ "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given",
+ "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadn't", "happens", "hardly", "has", "hasn't",
+ "have", "haven't", "having", "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "here's", "hereupon",
+ "hers", "herself", "he's", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "i", "i'd", "ie", "if",
+ "ignored", "i'll", "i'm", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
+ "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "its", "it's", "itself", "i've", "just", "keep", "keeps", "kept",
+ "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "let's", "like",
+ "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
+ "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly",
+ "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone",
+ "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on",
+ "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
+ "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably",
+ "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards",
+ "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing",
+ "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several",
+ "shall", "she", "should", "shouldn't", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime",
+ "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure",
+ "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "that's", "the", "their", "theirs",
+ "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "theres", "there's",
+ "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "think", "third", "this", "thorough", "thoroughly",
+ "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards",
+ "tried", "tries", "truly", "try", "trying", "t's", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until",
+ "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs",
+ "want", "wants", "was", "wasn't", "way", "we", "we'd", "welcome", "well", "we'll", "went", "were", "we're", "weren't", "we've",
+ "what", "whatever", "what's", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "where's",
+ "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "who's", "whose", "why", "will",
+ "willing", "wish", "with", "within", "without", "wonder", "won't", "would", "wouldn't", "yes", "yet", "you", "you'd", "you'll",
+ "your", "you're", "yours", "yourself", "yourselves", "you've", "zero"));
+
+ // Print a text in the console
+ public static void print(Object in) {
+ System.out.println(in);
+ }
+
+ public static void print(Object[] array) {
+ System.out.println(Arrays.asList(array));
+ }
+
+ public static void print(Object[][] array) {
+ System.out.print("[");
+ for (int i = 0; i < array.length; i++) {
+ print(array[i]);
+ if (i != array.length - 1) {
+ System.out.print("\n");
+ }
+ print("]");
+ }
+ }
+
+ // return the PoS (Class POS) out of the PoS-tag
+ public static POS getPOS(String posTag) {
+
+ ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ", "JJR", "JJS"));
+ ArrayList<String> adverb = new ArrayList<String>(Arrays.asList("RB", "RBR", "RBS"));
+ ArrayList<String> noun = new ArrayList<String>(Arrays.asList("NN", "NNS", "NNP", "NNPS"));
+ ArrayList<String> verb = new ArrayList<String>(Arrays.asList("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"));
+
+ if (adjective.contains(posTag)) return POS.ADJECTIVE;
+ else if (adverb.contains(posTag)) return POS.ADVERB;
+ else if (noun.contains(posTag)) return POS.NOUN;
+ else if (verb.contains(posTag)) return POS.VERB;
+ else return null;
+
+ }
+
+ // Check whether a list of arrays contains an array
+ public static boolean belongsTo(String[] array, ArrayList<String[]> fullList) {
+ for (String[] refArray : fullList) {
+ if (areStringArraysEqual(array, refArray))
+ return true;
+ }
+ return false;
+ }
+
+ // Check whether two arrays of strings are equal
+ public static boolean areStringArraysEqual(String[] array1, String[] array2) {
+
+ if (array1.equals(null) || array2.equals(null))
+ return false;
+
+ if (array1.length != array2.length) {
+ return false;
+ }
+ for (int i = 0; i < array1.length; i++) {
+ if (!array1[i].equals(array2[i])) {
+ return false;
+ }
+ }
+
+ return true;
+
+ }
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,227 @@
+package opennlp.tools.disambiguator;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashMap;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.dictionary.Dictionary;
+import net.sf.extjwnl.dictionary.MorphologicalProcessor;
+import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.lemmatizer.SimpleLemmatizer;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.InvalidFormatException;
+
+public class Loader {
+
+ private static String modelsDir = "src\\test\\resources\\opennlp\\tools\\disambiguator\\";
+
+ private static SentenceDetectorME sdetector;
+ private static Tokenizer tokenizer;
+ private static POSTaggerME tagger;
+ private static NameFinderME nameFinder;
+ private static SimpleLemmatizer lemmatizer;
+
+ private static Dictionary dictionary;
+ private static MorphologicalProcessor morph;
+ private static boolean IsInitialized = false;
+
+ // local caches for faster lookup
+ private static HashMap<String,Object> stemCache;
+ private static HashMap<String,Object> stopCache;
+ private static HashMap<String,Object> relvCache;
+
+
+
+ // Constructor
+ public Loader(){
+ super();
+ load();
+ }
+
+ public static HashMap<String,Object> getRelvCache(){
+ if (relvCache==null || relvCache.keySet().isEmpty()){
+ relvCache = new HashMap<String, Object>();
+ for (String t : Constants.relevantPOS){
+ relvCache.put(t, null);
+ }
+ }
+ return relvCache;
+ }
+
+ public static HashMap<String,Object> getStopCache(){
+ if (stopCache==null || stopCache.keySet().isEmpty()){
+ stopCache = new HashMap<String, Object>();
+ for (String s : Constants.stopWords){
+ stopCache.put(s, null);
+ }
+ }
+ return stopCache;
+ }
+
+ public static HashMap<String,Object> getStemCache(){
+ if (stemCache==null || stemCache.keySet().isEmpty()){
+ stemCache = new HashMap<String,Object>();
+ for (Object pos : POS.getAllPOS()){
+ stemCache.put(((POS)pos).getKey(),new HashMap());
+ }
+ }
+ return stemCache;
+ }
+
+ public static MorphologicalProcessor getMorph(){
+ if (morph==null){
+ morph = dictionary.getMorphologicalProcessor();
+ }
+ return morph;
+ }
+
+ public static Dictionary getDictionary(){
+ if (dictionary==null){
+ try {
+ dictionary = Dictionary.getDefaultResourceInstance();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ return dictionary;
+ }
+
+ public static SimpleLemmatizer getLemmatizer(){
+ if (lemmatizer==null){
+ try {
+ lemmatizer = new SimpleLemmatizer (new FileInputStream(modelsDir + "en-lemmatizer.dict"));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ return lemmatizer;
+ }
+
+ public static NameFinderME getNameFinder(){
+ if (nameFinder==null){
+ TokenNameFinderModel nameFinderModel;
+ try {
+ nameFinderModel = new TokenNameFinderModel(new FileInputStream(modelsDir + "en-ner-person.bin"));
+ nameFinder = new NameFinderME(nameFinderModel);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ return nameFinder;
+ }
+
+ public static POSTaggerME getTagger(){
+ if (tagger==null){
+ POSModel posTaggerModel = new POSModelLoader().load(new File(modelsDir + "en-pos-maxent.bin"));
+ tagger = new POSTaggerME(posTaggerModel);
+ }
+ return tagger;
+ }
+
+ public static SentenceDetectorME getSDetector(){
+ if (sdetector==null){
+ try {
+ SentenceModel enSentModel = new SentenceModel(new FileInputStream(modelsDir + "en-sent.bin"));
+ sdetector = new SentenceDetectorME(enSentModel);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ return sdetector;
+ }
+
+ public static Tokenizer getTokenizer(){
+ if (tokenizer == null){
+ try {
+ TokenizerModel tokenizerModel = new TokenizerModel(new FileInputStream(modelsDir + "en-token.bin"));
+ tokenizer = new TokenizerME(tokenizerModel);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+ return tokenizer;
+ }
+
+ public static boolean isInitialized(){
+ return (dictionary !=null
+ && morph !=null
+ && stemCache !=null
+ && stopCache !=null
+ && relvCache !=null);
+ }
+
+ public void load(){
+ try {
+ SentenceModel enSentModel = new SentenceModel(new FileInputStream(modelsDir + "en-sent.bin"));
+ sdetector = new SentenceDetectorME(enSentModel);
+
+ TokenizerModel TokenizerModel = new TokenizerModel(new FileInputStream(modelsDir + "en-token.bin"));
+ tokenizer = new TokenizerME(TokenizerModel);
+
+
+ POSModel posTaggerModel = new POSModelLoader().load(new File(modelsDir + "en-pos-maxent.bin"));
+ tagger = new POSTaggerME(posTaggerModel);
+
+ TokenNameFinderModel nameFinderModel = new TokenNameFinderModel(new FileInputStream(modelsDir + "en-ner-person.bin"));
+ nameFinder = new NameFinderME(nameFinderModel);
+
+ lemmatizer = new SimpleLemmatizer (new FileInputStream(modelsDir + "en-lemmatizer.dict"));
+
+ dictionary = Dictionary.getDefaultResourceInstance();
+ morph = dictionary.getMorphologicalProcessor();
+
+ // loading lookup caches
+ stemCache = new HashMap();
+ for (Object pos : POS.getAllPOS()){
+ stemCache.put(((POS)pos).getKey(),new HashMap());
+ }
+
+ stopCache = new HashMap<String, Object>();
+ for (String s : Constants.stopWords){
+ stopCache.put(s, null);
+ }
+ relvCache = new HashMap<String, Object>();
+ for (String t : Constants.relevantPOS){
+ relvCache.put(t, null);
+ }
+
+
+ if (isInitialized()){
+ Constants.print("loading was succesfull");
+ }else{
+ Constants.print("loading was unsuccesfull");
+ }
+
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (InvalidFormatException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static void unload ()
+ {
+ dictionary.close();
+ }
+
+
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,158 @@
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.PointerUtils;
+import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.data.Word;
+import net.sf.extjwnl.data.list.PointerTargetNode;
+import net.sf.extjwnl.data.list.PointerTargetNodeList;
+
+
+
+/**
+ * Convenience class to access some features.
+ */
+
+public class Node {
+
+ public Synset parent;
+ public Synset synset;
+
+ protected ArrayList<WordPOS> senseRelevantWords;
+
+ public ArrayList<Synset> hypernyms = new ArrayList<Synset>();
+ public ArrayList<Synset> hyponyms = new ArrayList<Synset>();
+ public ArrayList<Synset> meronyms = new ArrayList<Synset>();
+ public ArrayList<Synset> holonyms = new ArrayList<Synset>();
+
+ public ArrayList<WordPOS> synonyms = new ArrayList<WordPOS>();
+
+
+ public Node(Synset parent, Synset synSet, ArrayList<WordPOS> senseRelevantWords) {
+ this.parent = parent;
+ this.synset = synSet;
+ this.senseRelevantWords = senseRelevantWords;
+ }
+
+ public Node(Synset synSet, ArrayList<WordPOS> senseRelevantWords) {
+ this.synset = synSet;
+ this.senseRelevantWords = senseRelevantWords;
+ }
+
+
+ public ArrayList<WordPOS> getSenseRelevantWords() {
+ return senseRelevantWords;
+ }
+
+ public void setSenseRelevantWords(ArrayList<WordPOS> senseRelevantWords) {
+ this.senseRelevantWords = senseRelevantWords;
+ }
+
+ public String getSense() {
+ return this.synset.getGloss().toString();
+ }
+
+
+ public void setHypernyms() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList phypernyms = new PointerTargetNodeList();
+ try {
+ phypernyms = PointerUtils.getDirectHypernyms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the hypernyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < phypernyms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) phypernyms.get(i);
+ this.hypernyms.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setMeronyms() {
+ //PointerUtils pointerUtils = PointerUtils.getInstance();
+ PointerTargetNodeList pmeronyms = new PointerTargetNodeList();
+ try {
+ pmeronyms = PointerUtils.getMeronyms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the meronyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pmeronyms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pmeronyms.get(i);
+ this.meronyms.add(ptn.getSynset());
+ }
+ }
+
+ public void setHolonyms() {
+ // PointerUtils pointerUtils = PointerUtils.getInstance();
+ PointerTargetNodeList pholonyms = new PointerTargetNodeList();
+ try {
+ pholonyms = PointerUtils.getHolonyms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the holonyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pholonyms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pholonyms.get(i);
+ this.holonyms.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setHyponyms() {
+ // PointerUtils pointerUtils = PointerUtils.getInstance();
+ PointerTargetNodeList phyponyms = new PointerTargetNodeList();
+ try {
+ phyponyms = PointerUtils.getDirectHyponyms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the hyponyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < phyponyms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) phyponyms.get(i);
+ this.hyponyms.add(ptn.getSynset());
+ }
+ }
+
+ public void setSynonyms()
+ {
+ for (Word word : synset.getWords())
+ synonyms.add(new WordPOS(word.toString(),word.getPOS()));
+ }
+
+ public ArrayList<Synset> getHypernyms() {
+ return hypernyms;
+ }
+
+ public ArrayList<Synset> getHyponyms() {
+ return hyponyms;
+ }
+
+ public ArrayList<Synset> getMeronyms() {
+ return meronyms;
+ }
+ public ArrayList<Synset> getHolonyms() {
+ return holonyms;
+ }
+
+ public ArrayList<WordPOS> getSynonyms()
+ {
+ return synonyms;
+ }
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,163 @@
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.IndexWord;
+import net.sf.extjwnl.data.POS;
+import opennlp.tools.util.Span;
+
+
+
+public class PreProcessor {
+
+ public PreProcessor() {
+ super();
+ }
+
+ public static String[] split(String text) {
+ return Loader.getSDetector().sentDetect(text);
+ }
+
+ public static String[] tokenize(String sentence) {
+ return Loader.getTokenizer().tokenize(sentence);
+ }
+
+ public static String[] tag(String[] tokenizedSentence) {
+ return Loader.getTagger().tag(tokenizedSentence);
+ }
+
+ public static String lemmatize(String word, String posTag) {
+ return Loader.getLemmatizer().lemmatize(word, posTag);
+ }
+
+ public static boolean isName(String word) {
+ Span nameSpans[] = Loader.getNameFinder().find(new String[] { word });
+ return (nameSpans.length != 0);
+ }
+
+ public static ArrayList<WordPOS> getAllRelevantWords(String[] sentence) {
+
+ ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+
+ String[] tags = tag(sentence);
+
+ for (int i = 0; i<sentence.length; i++) {
+ if (!Loader.getStopCache().containsKey(sentence[i])) {
+ if (Loader.getRelvCache().containsKey(tags[i])) {
+ relevantWords.add(new WordPOS(sentence[i],Constants.getPOS(tags[i])));
+ }
+
+ }
+ }
+ return relevantWords;
+ }
+
+
+ public static ArrayList<WordPOS> getAllRelevantWords(WordToDisambiguate word) {
+ return getAllRelevantWords(word.getSentence());
+ }
+
+
+ public static ArrayList<WordPOS> getRelevantWords(WordToDisambiguate word, int winBackward, int winForward) {
+
+ ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+
+ String[] sentence = word.getSentence();
+ String[] tags = tag(sentence);
+
+ int index = word.getWordIndex();
+
+ for (int i = index - winBackward; i<=index + winForward; i++) {
+
+ if (i >= 0 && i < sentence.length && i != index) {
+ if (!Loader.getStopCache().containsKey(sentence[i])) {
+
+ if (Loader.getRelvCache().containsKey(tags[i])) {
+ relevantWords.add(new WordPOS(sentence[i],Constants.getPOS(tags[i])));
+ }
+
+ }
+ }
+ }
+ return relevantWords;
+ }
+
+
+ /**
+ * Stem a single word with WordNet dictionnary
+ *
+ * @param wordToStem
+ * word to be stemmed
+ * @return stemmed list of words
+ */
+ public static List StemWordWithWordNet(WordPOS wordToStem) {
+ if (!Loader.isInitialized()
+ || wordToStem == null)
+ return null;
+ ArrayList<String> stems = new ArrayList();
+ try {
+ for (Object pos : POS.getAllPOS()){
+ stems.addAll(Loader.getMorph().lookupAllBaseForms((POS)pos, wordToStem.getWord())) ;
+ }
+
+ if (stems.size()>0)
+ return stems;
+ else{
+ return null;
+ }
+
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ /**
+ * Stem a single word tries to look up the word in the stemCache HashMap If
+ * the word is not found it is stemmed with WordNet and put into stemCache
+ *
+ * @param wordToStem
+ * word to be stemmed
+ * @return stemmed word list, null means the word is incorrect
+ */
+ public static List Stem(WordPOS wordToStem) {
+
+ // check if we already cached the stem map
+ HashMap posMap = (HashMap) Loader.getStemCache().get(wordToStem.getPOS().getKey());
+
+ // don't check words with digits in them
+ if (containsNumbers(wordToStem.getWord())){
+ return null;
+ }
+
+ List stemList = (List) posMap.get(wordToStem.getWord());
+ if (stemList != null){ // return it if we already cached it
+ return stemList;
+
+ } else { // unCached list try to stem it
+ stemList = StemWordWithWordNet(wordToStem);
+ if (stemList != null) {
+ // word was recognized and stemmed with wordnet:
+ // add it to cache and return the stemmed list
+ posMap.put(wordToStem.getWord(),stemList);
+ Loader.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
+ return stemList;
+ }else{ // could not be stemmed add it anyway (as incorrect with null list)
+ posMap.put(wordToStem.getWord(), null);
+ Loader.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
+ return null;
+ }
+ }
+ }
+
+ public static boolean containsNumbers(String word) {
+ // checks if the word is or contains a number
+ return word.matches(".*[0-9].*");
+ }
+
+
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,15 @@
+package opennlp.tools.disambiguator;
+
+import opennlp.tools.util.Span;
+
+/**
+ * The interface for word sense disambiguators.
+ */
+public interface WSDisambiguator {
+
+ public String[] disambiguate(String[] inputText,int inputWordIndex);
+
+ public String[] disambiguate(String[] inputText, Span[] inputWordSpans);
+
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,94 @@
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.IndexWord;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.dictionary.Dictionary;
+
+
+public class WordPOS {
+
+ private String word;
+ private List stems;
+ private POS pos;
+
+ // Constructor
+ public WordPOS(String word, POS pos) throws IllegalArgumentException{
+ if (word==null || pos ==null){
+ throw new IllegalArgumentException("Args are null");
+ }
+ this.word = word;
+ this.pos = pos;
+ }
+
+ public String getWord() {
+ return word;
+ }
+
+ public POS getPOS() {
+ return pos;
+ }
+
+ public List getStems() {
+ if (stems==null){
+ return PreProcessor.Stem(this);
+ }else{
+ return stems;
+ }
+ }
+
+
+ // Return the synsets (thus the senses) of the current word
+ public ArrayList<Synset> getSynsets() {
+
+ IndexWord indexWord;
+ try {
+ indexWord = Loader.getDictionary().lookupIndexWord(pos, word);
+ List<Synset> synsets = indexWord.getSenses();
+ return (new ArrayList<Synset>(synsets));
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ // uses Stemming to check if two words are equivalent
+ public boolean isStemEquivalent(WordPOS wordToCompare) {
+ // check if there is intersection in the stems;
+ List originalList = this.getStems();
+ List listToCompare = wordToCompare.getStems();
+
+// Constants.print("+++++++++++++++++++++ ::: "+ this.getWord());
+// Constants.print("+++++++++++++++++++++ ::: "+ wordToCompare.getWord());
+// Constants.print("the first list is \n"+originalList.toString());
+// Constants.print("the second list is \n"+listToCompare.toString());
+
+ if(originalList==null || listToCompare==null){ // any of the two requested words do not exist
+ return false;
+ }else{
+ return !Collections.disjoint(originalList, listToCompare);
+ }
+
+ }
+
+
+ // uses Lemma to check if two words are equivalent
+ public boolean isLemmaEquivalent(WordPOS wordToCompare) {
+ // TODO use lemmatizer to compare with lemmas
+
+ ArrayList<String> lemmas_word = new ArrayList();
+ ArrayList<String> lemmas_wordToCompare = new ArrayList();
+
+ for (String pos : Constants.allPOS){
+ Loader.getLemmatizer().lemmatize(wordToCompare.getWord(), pos);
+ }
+ return false;
+ }
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,75 @@
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.concurrent.Semaphore;
+
+import opennlp.tools.disambiguator.lesk.WTDLesk;
+
+public class WordSense implements Comparable{
+
+ protected WTDLesk WTDLesk;
+ protected Node node;
+ protected int id;
+ protected double score;
+
+
+ public WordSense(WTDLesk WTDLesk, Node node) {
+ super();
+ this.WTDLesk = WTDLesk;
+ this.node = node;
+ }
+
+ public WordSense() {
+ super();
+ }
+
+
+ public WTDLesk getWTDLesk() {
+ return WTDLesk;
+ }
+
+ public void setWTDLesk(WTDLesk WTDLesk) {
+ this.WTDLesk = WTDLesk;
+ }
+
+
+ public Node getNode() {
+ return node;
+ }
+
+ public void setNode(Node node) {
+ this.node = node;
+ }
+
+
+ public double getScore() {
+ return score;
+ }
+
+ public void setScore(double score) {
+ this.score = score;
+ }
+
+ public int getId() {
+ return id;
+ }
+
+ public void setId(int id) {
+ this.id = id;
+ }
+
+
+ public int compareTo(Object o) {
+ return (this.score-((WordSense)o).score)<0?1:-1;
+ }
+
+
+ public String getSense() {
+ return node.getSense();
+ }
+
+
+
+}
+
+
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,95 @@
+package opennlp.tools.disambiguator;
+
+
+
+public class WordToDisambiguate {
+
+ protected String [] sentence;
+ protected int wordIndex;
+ protected String posTag;
+
+ protected int sense;
+
+
+
+ /**
+ * Constructor
+ */
+
+
+ public WordToDisambiguate(String[] sentence, int wordIndex, int sense) throws IllegalArgumentException{
+ super();
+
+ if (wordIndex>sentence.length){
+ throw new IllegalArgumentException("The index is out of bounds !");
+ }
+ this.sentence = sentence;
+ this.wordIndex = wordIndex;
+ String[] posTags = PreProcessor.tag(sentence);
+ this.posTag = posTags[wordIndex];
+ this.sense = sense;
+ }
+
+ public WordToDisambiguate(String[] sentence, int wordIndex) {
+ this(sentence,wordIndex,-1);
+ }
+
+
+
+ /**
+ * Getters and Setters
+ */
+
+
+
+ // sentence
+ public String[] getSentence() {
+ return sentence;
+ }
+
+ public void setSentence(String[] sentence) {
+ this.sentence = sentence;
+ }
+
+
+ // word
+ public int getWordIndex() {
+ return wordIndex;
+ }
+
+ public void setWordIndex(int wordIndex) {
+ this.wordIndex = wordIndex;
+ }
+
+ public String getWord(){
+ return sentence[wordIndex];
+ }
+
+
+ // posTag
+ public String getPosTag() {
+ return posTag;
+ }
+
+ public void setPosTag(String posTag) {
+ this.posTag = posTag;
+ }
+
+
+ // sense
+ public int getSense() {
+ return sense;
+ }
+
+ public void setSense(int sense) {
+ this.sense = sense;
+ }
+
+
+
+
+
+
+
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,112 @@
+package opennlp.tools.disambiguator.ims;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.Loader;
+
+
+public class FeaturesExtractor {
+
+
+
+ public FeaturesExtractor() {
+ super();
+ }
+
+
+ /**
+ * @Algorithm: IMS (It Makes Sense)
+ *
+ * The following methods serve to extract the features for the algorithm IMS.
+ */
+
+ public String[] extractPosOfSurroundingWords (String[] sentence, int wordIndex, int numberOfWords) {
+
+ String[] taggedSentence = Loader.getTagger().tag(sentence);
+
+ String[] tags = new String[2*numberOfWords+1];
+
+ int j = 0;
+
+ for (int i = wordIndex - numberOfWords; i < wordIndex + numberOfWords ; i++) {
+ if (i < 0 || i >= sentence.length) {
+ tags[j] = "null";
+ } else {
+ tags[j] = taggedSentence[i];
+ }
+ j++;
+ }
+
+ return tags;
+ }
+
+
+ public String[] extractSurroundingWords(String[] sentence, int wordIndex) {
+
+ String[] posTags = Loader.getTagger().tag(sentence);
+
+ Constants.print(posTags);
+
+ ArrayList<String> contextWords = new ArrayList<String>();
+
+ for (int i = 0; i < sentence.length; i++) {
+
+ if (!Constants.stopWords.contains(sentence[i].toLowerCase())
+ && (wordIndex != i)) {
+
+ String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();
+
+ if (!word.equals("")) {
+ String lemma = Loader.getLemmatizer().lemmatize(sentence[i], posTags[i]);
+ contextWords.add(lemma);
+ }
+
+
+
+
+ }
+ }
+
+ return contextWords.toArray(new String[contextWords.size()]);
+ }
+
+
+ public ArrayList<String[]> extractLocalCollocations(String[] sentence, int wordIndex, int range) {
+ /**
+ * Here the author used only 11 features of this type. the range was set to 3 (bigrams extracted in a way that they are at max separated
+ * by 1 word).
+ */
+
+ ArrayList<String[]> localCollocations = new ArrayList<String[]>();
+
+ for (int i = wordIndex - range; i <= wordIndex + range ; i++) {
+
+ if (!(i < 0 || i > sentence.length - 2)) {
+ if ((i != wordIndex) && (i+1 != wordIndex) && (i+1 < wordIndex + range)) {
+ String[] lc = {sentence[i], sentence[i+1]};
+ localCollocations.add(lc);
+ }
+ if ((i != wordIndex) && (i+2 != wordIndex) && (i+2 < wordIndex + range)) {
+ String[] lc = {sentence[i], sentence[i+2]};
+ localCollocations.add(lc);
+ }
+ }
+
+ }
+
+ return localCollocations;
+ }
+
+
+ /**
+ * @Algorithm: SST
+ *
+ * The following methods serve to extract the features for the algorithm SST.
+ */
+
+
+
+
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,99 @@
+package opennlp.tools.disambiguator.ims;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.util.Span;
+
+public class IMS implements WSDisambiguator{
+
+ FeaturesExtractor fExtractor = new FeaturesExtractor();
+
+ /**
+ * PARAMETERS
+ */
+
+ int numberOfSurroundingWords;
+ int ngram;
+
+
+
+ /**
+ * Constructors
+ */
+
+ public IMS() {
+ super();
+ numberOfSurroundingWords = 3;
+ ngram = 2;
+ }
+
+ public IMS(int numberOfSurroundingWords, int ngram) {
+ super();
+ this.numberOfSurroundingWords = numberOfSurroundingWords;
+ this.ngram = ngram;
+ }
+
+
+
+ /**
+ * INTERNAL METHODS
+ */
+
+ private void extractFeature(ArrayList<WTDIMS> words) {
+
+ for (WTDIMS word : words) {
+
+ word.setPosOfSurroundingWords(fExtractor.extractPosOfSurroundingWords(word.getSentence(), word.getWordIndex(), numberOfSurroundingWords));
+
+ word.setSurroundingWords(fExtractor.extractSurroundingWords(word.getSentence(), word.getWordIndex()));
+
+ word.setLocalCollocations(fExtractor.extractLocalCollocations(word.getSentence(), word.getWordIndex(), ngram));
+
+ }
+
+ }
+
+ private ArrayList<WTDIMS> extractTrainingData(String xmlFile) {
+
+ ArrayList<WTDIMS> trainingData = new ArrayList<WTDIMS>();
+
+ /**
+ * TODO Processing of the xml File here (To check the format of the data)
+ */
+
+ return trainingData;
+ }
+
+
+ public void train(String trainingSetFile) { // TODO To revise after finihsing the implementation of the collector
+
+ ArrayList<WTDIMS> instances = extractTrainingData(trainingSetFile);
+
+ extractFeature(instances);
+
+
+
+ }
+
+
+ public void load (String binFile) {
+ // TODO After finishing training the training data
+
+ }
+
+
+ @Override
+ public String[] disambiguate(String[] inputText, int inputWordIndex) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ @Override
+ public String[] disambiguate(String[] inputText, Span[] inputWordSpans) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+
+}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,56 @@
+package opennlp.tools.disambiguator.ims;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+
/**
 * A word to disambiguate for the IMS algorithm: extends the generic
 * {@link WordToDisambiguate} with the three IMS feature groups (POS tags of
 * surrounding words, surrounding content words, and local collocations).
 */
public class WTDIMS extends WordToDisambiguate {

  // POS tags of the words around the target (see FeaturesExtractor).
  protected String[] posOfSurroundingWords;
  // Lemmas of the non-stop-word context words.
  protected String[] surroundingWords;
  // Ordered word pairs (bigrams, possibly with one gap) near the target.
  protected ArrayList<String[]> localCollocations;



  /**
   * Constructor
   */
  public WTDIMS(String[] sentence, int word, int sense) {
    super(sentence, word, sense);
  }



  /**
   * Getters and Setters
   */


  public String[] getPosOfSurroundingWords() {
    return posOfSurroundingWords;
  }

  public void setPosOfSurroundingWords(String[] posOfSurroundingWords) {
    this.posOfSurroundingWords = posOfSurroundingWords;
  }


  public String[] getSurroundingWords() {
    return surroundingWords;
  }

  public void setSurroundingWords(String[] surroundingWords) {
    this.surroundingWords = surroundingWords;
  }


  public ArrayList<String[]> getLocalCollocations() {
    return localCollocations;
  }

  public void setLocalCollocations(ArrayList<String[]> localCollocations) {
    this.localCollocations = localCollocations;
  }


}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,685 @@
+package opennlp.tools.disambiguator.lesk;
+
+import java.security.InvalidParameterException;
+import java.util.ArrayList;
+
+
+
+import java.util.Collections;
+import java.util.Map;
+
+import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.Loader;
+import opennlp.tools.disambiguator.Node;
+import opennlp.tools.disambiguator.PreProcessor;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.WordPOS;
+import opennlp.tools.disambiguator.WordSense;
+import opennlp.tools.util.Span;
+import net.sf.extjwnl.data.Synset;
+
+
+/**
+ * Class for the Lesk algorithm and variants.
+ */
+
+public class Lesk implements WSDisambiguator{
+
+ protected LeskParameters params;
+
+ public Loader loader;
+
+ public Lesk(){
+ this(null);
+ }
+
+ public Lesk(LeskParameters params) throws InvalidParameterException{
+ loader = new Loader();
+ this.setParams(params);
+ }
+
+ public void setParams(LeskParameters params) throws InvalidParameterException{
+ if(params==null){
+ this.params = new LeskParameters();
+ }
+ else{
+ if (params.isValid()){
+ this.params = params;
+ }else{
+ throw new InvalidParameterException("wrong params");
+ }
+ }
+ }
+
+ public ArrayList<WordSense> basic(WTDLesk wtd) {
+
+ ArrayList<WordPOS> relvWords = PreProcessor.getAllRelevantWords(wtd);
+ WordPOS word = new WordPOS(wtd.getWord(), Constants.getPOS(wtd.getPosTag()));
+
+ ArrayList<Synset> synsets = word.getSynsets();
+ ArrayList<Node> nodes = new ArrayList<Node>();
+
+ for (Synset synset : synsets) {
+ Node node = new Node(synset, relvWords);
+ nodes.add(node);
+ }
+
+ ArrayList<WordSense> scoredSenses = updateSenses(nodes);
+
+ for (WordSense wordSense : scoredSenses) {
+ wordSense.setWTDLesk(wtd);
+ int count = 0;
+ for (WordPOS senseWordPOS : wordSense.getNode().getSenseRelevantWords()) {
+ ArrayList stems = (ArrayList)PreProcessor.Stem(senseWordPOS);
+ for (WordPOS sentenceWordPOS : relvWords) {
+ // TODO change to lemma check
+ if (sentenceWordPOS.isStemEquivalent(senseWordPOS)) {
+ count = count + 1;
+ }
+ }
+ }
+ wordSense.setScore(count);
+ }
+
+ return scoredSenses;
+ }
+
  /** Contextual Lesk with the default symmetric window size. */
  public ArrayList<WordSense> basicContextual(WTDLesk wtd) {
    return this.basicContextual(wtd,LeskParameters.DFLT_WIN_SIZE);
  }

  /** Contextual Lesk with a symmetric window of {@code windowSize} words. */
  public ArrayList<WordSense> basicContextual(WTDLesk wtd, int windowSize) {
    return this.basicContextual(wtd, windowSize,windowSize);
  }
+
  /**
   * Contextual Lesk: like {@link #basic} but only the relevant words within a
   * window of {@code windowBackward} words before and {@code windowForward}
   * words after the target contribute to the overlap score. Returns the
   * senses sorted by descending score.
   */
  public ArrayList<WordSense> basicContextual(WTDLesk wtd, int windowBackward, int windowForward) {

    ArrayList<WordPOS> relvWords = PreProcessor.getRelevantWords(wtd, windowBackward, windowForward);
    WordPOS word = new WordPOS(wtd.getWord(), Constants.getPOS(wtd.getPosTag()));

    ArrayList<Synset> synsets = word.getSynsets();
    ArrayList<Node> nodes = new ArrayList<Node>();


    // One node per candidate synset, carrying the context words.
    for (Synset synset : synsets) {
      Node node = new Node(synset, relvWords);
      nodes.add(node);
    }

    ArrayList<WordSense> scoredSenses = updateSenses(nodes);


    // Score = number of stem-equivalent (gloss word, context word) pairs.
    for (WordSense wordSense : scoredSenses) {
      wordSense.setWTDLesk(wtd);

      int count = 0;
      for (WordPOS senseWordPOS : wordSense.getNode().getSenseRelevantWords()) {

        for (WordPOS sentenceWordPOS : relvWords) {
          // TODO change to lemma check
          if (sentenceWordPOS.isStemEquivalent(senseWordPOS)) {
            count = count + 1;
          }
        }

      }
      wordSense.setScore(count);

    }

    // Descending by score (WordSense.compareTo orders high scores first).
    Collections.sort(scoredSenses);

    return scoredSenses;
  }
+
+ public ArrayList<WordSense> extended(WTDLesk wtd,
+ int depth, double depthScoreWeight, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+
+ return extendedContextual(wtd, 0, depth,
+ depthScoreWeight, includeSynonyms, includeHypernyms,
+ includeHyponyms, includeMeronyms, includeHolonyms);
+
+ }
+
+ public ArrayList<WordSense> extendedContextual(WTDLesk wtd,
+ int depth, double depthScoreWeight,
+ boolean includeSynonyms, boolean includeHypernyms,
+ boolean includeHyponyms, boolean includeMeronyms,
+ boolean includeHolonyms){
+
+ return extendedContextual(wtd, LeskParameters.DFLT_WIN_SIZE,
+ depth, depthScoreWeight, includeSynonyms, includeHypernyms,
+ includeHyponyms, includeMeronyms, includeHolonyms);
+
+ }
+
+ public ArrayList<WordSense> extendedContextual(WTDLesk wtd,
+ int windowSize, int depth, double depthScoreWeight,
+ boolean includeSynonyms, boolean includeHypernyms,
+ boolean includeHyponyms, boolean includeMeronyms,
+ boolean includeHolonyms) {
+
+ return extendedContextual(wtd, windowSize, windowSize,
+ depth, depthScoreWeight, includeSynonyms, includeHypernyms,
+ includeHyponyms, includeMeronyms, includeHolonyms);
+ }
+
+ public ArrayList<WordSense> extendedContextual(WTDLesk wtd,
+ int windowBackward, int windowForward, int depth,
+ double depthScoreWeight, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+
+ ArrayList<WordPOS> relvWords = PreProcessor.getRelevantWords(wtd,windowBackward,windowForward);
+ WordPOS word = new WordPOS(wtd.getWord(), Constants.getPOS(wtd.getPosTag()));
+
+ ArrayList<Synset> synsets = word.getSynsets();
+ ArrayList<Node> nodes = new ArrayList<Node>();
+
+ for (Synset synset : synsets) {
+ Node node = new Node(synset, relvWords);
+ nodes.add(node);
+ }
+
+ ArrayList<WordSense> scoredSenses = basicContextual(wtd,windowBackward, windowForward);
+
+ for (WordSense wordSense : scoredSenses) {
+
+ if (includeSynonyms) {
+ wordSense.setScore(wordSense.getScore()
+ + depthScoreWeight
+ * assessSynonyms(wordSense.getNode().getSynonyms(),relvWords));
+ }
+
+ if (includeHypernyms) {
+ fathomHypernyms(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, depthScoreWeight);
+ }
+
+ if (includeHyponyms) {
+
+ fathomHyponyms(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, depthScoreWeight);
+ }
+
+ if (includeMeronyms) {
+
+ fathomMeronyms(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, depthScoreWeight);
+
+ }
+
+ if (includeHolonyms) {
+
+ fathomHolonyms(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, depthScoreWeight);
+
+ }
+
+ }
+
+ return scoredSenses;
+
+ }
+
+ public ArrayList<WordSense> extendedExponential(WTDLesk wtd,
+ int depth,
+ double intersectionExponent,double depthExponent, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+
+ return extendedExponentialContextual(wtd, 0, depth,
+ intersectionExponent, depthExponent, includeSynonyms,
+ includeHypernyms, includeHyponyms,
+ includeMeronyms, includeHolonyms);
+
+ }
+
+ public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
+ int depth,
+ double intersectionExponent,double depthExponent, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+
+ return extendedExponentialContextual(wtd, LeskParameters.DFLT_WIN_SIZE,
+ depth, intersectionExponent,depthExponent, includeSynonyms, includeHypernyms,
+ includeHyponyms, includeMeronyms, includeHolonyms);
+ }
+
+ public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
+ int windowSize, int depth,
+ double intersectionExponent,double depthExponent, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+
+ return extendedExponentialContextual(wtd, windowSize, windowSize,
+ depth, intersectionExponent,depthExponent, includeSynonyms, includeHypernyms,
+ includeHyponyms, includeMeronyms, includeHolonyms);
+ }
+
+ public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
+ int windowBackward, int windowForward, int depth,
+ double intersectionExponent,double depthExponent, boolean includeSynonyms,
+ boolean includeHypernyms, boolean includeHyponyms,
+ boolean includeMeronyms, boolean includeHolonyms) {
+ ArrayList<WordPOS> relvWords = PreProcessor.getRelevantWords(wtd,windowBackward,windowForward);
+ WordPOS word = new WordPOS(wtd.getWord(), Constants.getPOS(wtd.getPosTag()));
+
+ ArrayList<Synset> synsets = word.getSynsets();
+ ArrayList<Node> nodes = new ArrayList<Node>();
+
+ for (Synset synset : synsets) {
+ Node node = new Node(synset, relvWords);
+ nodes.add(node);
+ }
+
+ ArrayList<WordSense> scoredSenses = basicContextual(wtd, windowForward, windowBackward);
+
+ for (WordSense wordSense : scoredSenses) {
+
+
+ if (includeSynonyms) {
+ wordSense.setScore(wordSense.getScore() + Math.pow(assessSynonyms(wordSense.getNode().getSynonyms(),
+ relvWords),intersectionExponent));
+ }
+
+ if (includeHypernyms) {
+ fathomHypernymsExponential(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth,intersectionExponent, depthExponent);
+ }
+
+ if (includeHyponyms) {
+
+ fathomHyponymsExponential(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, intersectionExponent,depthExponent);
+ }
+
+ if (includeMeronyms) {
+
+ fathomMeronymsExponential(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, intersectionExponent,depthExponent);
+
+ }
+
+ if (includeHolonyms) {
+
+ fathomHolonymsExponential(wordSense, wordSense.getNode().synset,
+ relvWords, depth, depth, intersectionExponent,depthExponent);
+
+ }
+
+ }
+
+ return scoredSenses;
+
+ }
+
  /**
   * Recursively adds to the sense score the overlap between the context words
   * and the hypernyms of {@code child}, weighted geometrically by depth
   * (depthScoreWeight^(levels descended)). Recursion stops at depth 0.
   * Note: deeper levels are matched against the PARENT gloss's relevant
   * words (relvGlossWords), not the original context.
   */
  private void fathomHypernyms(WordSense wordSense, Synset child,
      ArrayList<WordPOS> relvWords, int depth, int maxDepth,
      double depthScoreWeight) {
    if (depth == 0)
      return;

    String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
    ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);


    Node childNode = new Node(child, relvGlossWords);

    childNode.setHypernyms();
    // Weight decays with each level: maxDepth - depth + 1 levels descended.
    wordSense.setScore(wordSense.getScore()
        + Math.pow(depthScoreWeight, maxDepth - depth + 1)
        * assessFeature(childNode.getHypernyms(), relvWords));
    for (Synset hypernym : childNode.getHypernyms()) {
      fathomHypernyms(wordSense, hypernym, relvGlossWords, depth - 1, maxDepth,
          depthScoreWeight);
    }
  }
+
+ private void fathomHypernymsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setHypernyms();
+ wordSense
+ .setScore(wordSense.getScore()
+ + Math.pow(
+ assessFeature(childNode.getHypernyms(),
+ relvWords), intersectionExponent)
+ / Math.pow(depth, depthScoreExponent));
+ for (Synset hypernym : childNode.getHypernyms()) {
+
+ fathomHypernymsExponential(wordSense, hypernym, relvGlossWords, depth - 1, maxDepth,
+ intersectionExponent, depthScoreExponent);
+ }
+ }
+
+ private void fathomHyponyms(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setHyponyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getHyponyms(), relvWords));
+ for (Synset hyponym : childNode.getHyponyms()) {
+
+ fathomHyponyms(wordSense, hyponym, relvGlossWords, depth - 1, maxDepth,
+ depthScoreWeight);
+ }
+ }
+
+ private void fathomHyponymsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setHyponyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(
+ assessFeature(childNode.getHyponyms(), relvWords),
+ intersectionExponent)
+ / Math.pow(depth, depthScoreExponent));
+ for (Synset hyponym : childNode.getHyponyms()) {
+
+ fathomHyponymsExponential(wordSense, hyponym, relvGlossWords, depth - 1, maxDepth,
+ intersectionExponent, depthScoreExponent);
+ }
+ }
+
+ private void fathomMeronyms(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setMeronyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getMeronyms(), relvWords));
+ for (Synset meronym : childNode.getMeronyms()) {
+
+ fathomMeronyms(wordSense, meronym, relvGlossWords, depth - 1, maxDepth,
+ depthScoreWeight);
+ }
+ }
+
+ private void fathomMeronymsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setMeronyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(
+ assessFeature(childNode.getMeronyms(), relvWords),
+ intersectionExponent)
+ / Math.pow(depth, depthScoreExponent));
+ for (Synset meronym : childNode.getMeronyms()) {
+
+ fathomMeronymsExponential(wordSense, meronym, relvGlossWords, depth - 1, maxDepth,
+ intersectionExponent, depthScoreExponent);
+ }
+ }
+
+ private void fathomHolonyms(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+
+ childNode.setHolonyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getHolonyms(), relvWords));
+ for (Synset holonym : childNode.getHolonyms()) {
+
+ fathomHolonyms(wordSense, holonym, relvGlossWords, depth - 1, maxDepth,
+ depthScoreWeight);
+ }
+ }
+
+ private void fathomHolonymsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = Loader.getTokenizer().tokenize(child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = PreProcessor.getAllRelevantWords(tokenizedGloss);
+
+ Node childNode = new Node(child, relvGlossWords);
+
+ childNode.setHolonyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(
+ assessFeature(childNode.getHolonyms(), relvWords),
+ intersectionExponent)
+ / Math.pow(depth, depthScoreExponent));
+ for (Synset holonym : childNode.getHolonyms()) {
+
+ fathomHolonymsExponential(wordSense, holonym, relvGlossWords, depth - 1, maxDepth,
+ intersectionExponent, depthScoreExponent);
+ }
+ }
+
  /**
   * Counts stem-equivalent pairs between the relevant words of each feature
   * synset's sense text and the given context words. Used by the fathom*
   * methods to score hypernym/hyponym/meronym/holonym overlap.
   */
  private int assessFeature(ArrayList<Synset> featureSynsets,
      ArrayList<WordPOS> relevantWords) {
    int count = 0;
    for (Synset synset : featureSynsets) {
      Node subNode = new Node(synset, relevantWords);

      String[] tokenizedSense = Loader.getTokenizer().tokenize(subNode.getSense());
      ArrayList<WordPOS> relvSenseWords = PreProcessor.getAllRelevantWords(tokenizedSense);

      for (WordPOS senseWord : relvSenseWords) {
        for (WordPOS sentenceWord : relevantWords) {
          if (sentenceWord.isStemEquivalent(senseWord)) {
            count = count + 1;
          }
        }
      }
    }
    return count;
  }
+
+ private int assessSynonyms(ArrayList<WordPOS> synonyms,
+ ArrayList<WordPOS> relevantWords) {
+ int count = 0;
+
+ for (WordPOS synonym : synonyms) {
+ for (WordPOS sentenceWord : relevantWords) {
+ // TODO try to switch to lemmatizer
+ if (sentenceWord.isStemEquivalent(synonym)) {
+ count = count + 1;
+ }
+ }
+
+ }
+
+ return count;
+ }
+
+ public ArrayList<WordSense> updateSenses(ArrayList<Node> nodes) {
+
+ ArrayList<WordSense> scoredSenses = new ArrayList<WordSense>();
+
+ for (int i=0; i< nodes.size(); i++ ) {
+ ArrayList<WordPOS> sensesComponents = PreProcessor.getAllRelevantWords(PreProcessor.tokenize(nodes.get(i).getSense()));
+ WordSense wordSense = new WordSense();
+ nodes.get(i).setSenseRelevantWords(sensesComponents);
+ wordSense.setNode(nodes.get(i));
+ wordSense.setId(i);
+ scoredSenses.add(wordSense);
+ }
+ return scoredSenses;
+
+ }
+
+ // disambiguates a WTDLesk and returns an array of sense indexes from WordNet ordered by their score
+ @Override
+ public String[] disambiguate(String[] inputText, int inputWordIndex) {
+ WTDLesk wtd = new WTDLesk(inputText,inputWordIndex);
+ ArrayList<WordSense> wsenses = null;
+
+ switch(this.params.leskType){
+ case LESK_BASIC:
+ wsenses = basic(wtd);
+ break;
+ case LESK_BASIC_CTXT :
+ wsenses = basicContextual(wtd);
+ break;
+ case LESK_BASIC_CTXT_WIN :
+ wsenses = basicContextual(wtd, this.params.win_b_size);
+ break;
+ case LESK_BASIC_CTXT_WIN_BF :
+ wsenses = basicContextual(wtd, this.params.win_b_size, this.params.win_f_size);
+ break;
+ case LESK_EXT :
+ wsenses = extended(wtd,
+ this.params.depth,
+ this.params.depth_weight,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_CTXT :
+ wsenses = extendedContextual(wtd,
+ this.params.depth,
+ this.params.depth_weight,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_CTXT_WIN :
+ wsenses = extendedContextual(wtd,
+ this.params.win_b_size,
+ this.params.depth,
+ this.params.depth_weight,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_CTXT_WIN_BF :
+ wsenses = extendedContextual(wtd,
+ this.params.win_b_size,
+ this.params.win_f_size,
+ this.params.depth,
+ this.params.depth_weight,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_EXP :
+ wsenses = extendedExponential(wtd,
+ this.params.depth,
+ this.params.iexp,
+ this.params.dexp,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_EXP_CTXT :
+ wsenses = extendedExponentialContextual(wtd,
+ this.params.depth,
+ this.params.iexp,
+ this.params.dexp,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_EXP_CTXT_WIN :
+ wsenses = extendedExponentialContextual(wtd,
+ this.params.win_b_size,
+ this.params.depth,
+ this.params.iexp,
+ this.params.dexp,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ case LESK_EXT_EXP_CTXT_WIN_BF :
+ wsenses = extendedExponentialContextual(wtd,
+ this.params.win_b_size,
+ this.params.win_f_size,
+ this.params.depth,
+ this.params.iexp,
+ this.params.dexp,
+ this.params.fathom_synonyms,
+ this.params.fathom_hypernyms,
+ this.params.fathom_hyponyms,
+ this.params.fathom_meronyms,
+ this.params.fathom_holonyms);
+ break;
+ }
+
+ wsenses = extendedExponentialContextual(wtd, LeskParameters.DFLT_WIN_SIZE,LeskParameters.DFLT_DEPTH,LeskParameters.DFLT_IEXP,LeskParameters.DFLT_DEXP, true,true,true,true,true );
+ Collections.sort(wsenses);
+
+ String[] senses = new String[wsenses.size()];
+ for (int i = 0; i < wsenses.size() ; i++) {
+ senses[i] = wsenses.get(i).getSense();
+ }
+ return senses;
+ }
+
  /** Span-based disambiguation is not implemented yet; always returns null. */
  @Override
  public String[] disambiguate(String[] inputText, Span[] inputWordSpans) {
    // TODO need to work on spans
    return null;
  }
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,107 @@
+package opennlp.tools.disambiguator.lesk;
+
/**
 * Parameter holder for the {@code Lesk} disambiguator.
 * <p>
 * All fields are public and mutated directly by callers (see {@code Tester}),
 * so the public-field interface is kept as-is. A freshly constructed instance
 * carries the documented defaults; {@link #setDefaults()} restores them at any
 * time.
 */
public class LeskParameters {

  /**
   * The supported variations of the Lesk algorithm.
   * Naming scheme: BASIC vs EXT(ended relations) vs EXT_EXP(onential decay),
   * optionally restricted to the surrounding CTXT, a symmetric WIN(dow), or
   * separate B(ackward)/F(orward) window sizes.
   */
  public enum LESK_TYPE {
    LESK_BASIC,
    LESK_BASIC_CTXT,
    LESK_BASIC_CTXT_WIN,
    LESK_BASIC_CTXT_WIN_BF,
    LESK_EXT,
    LESK_EXT_CTXT,
    LESK_EXT_CTXT_WIN,
    LESK_EXT_CTXT_WIN_BF,
    LESK_EXT_EXP,
    LESK_EXT_EXP_CTXT,
    LESK_EXT_EXP_CTXT_WIN,
    LESK_EXT_EXP_CTXT_WIN_BF,
  }

  // DEFAULTS
  protected static final LESK_TYPE DFLT_LESK_TYPE = LESK_TYPE.LESK_EXT_EXP_CTXT_WIN;
  protected static final int DFLT_WIN_SIZE = 4;
  protected static final int DFLT_DEPTH = 3;
  protected static final double DFLT_IEXP = 0.3;
  protected static final double DFLT_DEXP = 0.3;
  // Default relation-depth weight; previously setDefaults() never reset
  // depth_weight, leaving a caller-modified value in place.
  protected static final double DFLT_DEPTH_WEIGHT = 0;

  /** Which Lesk variation {@code Lesk.disambiguate} should run. */
  public LESK_TYPE leskType;

  /** Forward context-window size (tokens after the target word). */
  public int win_f_size;
  /** Backward context-window size (tokens before the target word). */
  public int win_b_size;
  /** How deep to fathom the WordNet relation graph. */
  public int depth;

  // Which WordNet relations to fathom when building extended glosses.
  public boolean fathom_synonyms;
  public boolean fathom_hypernyms;
  public boolean fathom_hyponyms;
  public boolean fathom_meronyms;
  public boolean fathom_holonyms;

  /** Linear weight applied per relation-depth level (EXT variants). */
  public double depth_weight;
  /** Exponent for the intersection score (EXT_EXP variants). */
  public double iexp;
  /** Exponent for the depth decay (EXT_EXP variants). */
  public double dexp;

  /** Creates parameters initialized to the documented defaults. */
  public LeskParameters() {
    this.setDefaults();
  }

  /**
   * Restores every tunable field to its default value, including
   * {@code depth_weight} (previously left untouched, so a prior caller
   * mutation survived a "reset").
   */
  public void setDefaults() {
    this.leskType = LeskParameters.DFLT_LESK_TYPE;
    this.win_f_size = LeskParameters.DFLT_WIN_SIZE;
    this.win_b_size = LeskParameters.DFLT_WIN_SIZE;
    this.depth = LeskParameters.DFLT_DEPTH;
    this.iexp = LeskParameters.DFLT_IEXP;
    this.dexp = LeskParameters.DFLT_DEXP;
    this.depth_weight = LeskParameters.DFLT_DEPTH_WEIGHT;
    this.fathom_holonyms = true;
    this.fathom_hypernyms = true;
    this.fathom_hyponyms = true;
    this.fathom_meronyms = true;
    this.fathom_synonyms = true;
  }

  /**
   * Validates the fields actually used by the selected {@link #leskType}:
   * window sizes, depth, and the depth/exponent weights must be non-negative,
   * and the symmetric-window variant additionally requires equal backward and
   * forward sizes.
   *
   * @return {@code true} if the current parameter combination is usable
   */
  // TODO make isSet for semantic feature booleans
  public boolean isValid() {

    switch (this.leskType) {
    case LESK_BASIC:
    case LESK_BASIC_CTXT:
      // No tunables involved.
      return true;
    case LESK_BASIC_CTXT_WIN:
      // Symmetric window: both sides must match and be non-negative.
      return (this.win_b_size == this.win_f_size) && this.win_b_size >= 0;
    case LESK_BASIC_CTXT_WIN_BF:
      return (this.win_b_size >= 0) && (this.win_f_size >= 0);
    case LESK_EXT:
    case LESK_EXT_CTXT:
      return (this.depth >= 0) && (this.depth_weight >= 0);
    case LESK_EXT_CTXT_WIN:
    case LESK_EXT_CTXT_WIN_BF:
      return (this.depth >= 0) && (this.depth_weight >= 0)
          && (this.win_b_size >= 0) && (this.win_f_size >= 0);
    case LESK_EXT_EXP:
    case LESK_EXT_EXP_CTXT:
      return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0);
    case LESK_EXT_EXP_CTXT_WIN:
    case LESK_EXT_EXP_CTXT_WIN_BF:
      return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0)
          && (this.win_b_size >= 0) && (this.win_f_size >= 0);
    default:
      return false;
    }
  }

}
Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/WTDLesk.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/WTDLesk.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/WTDLesk.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/WTDLesk.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,15 @@
+package opennlp.tools.disambiguator.lesk;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+
+public class WTDLesk extends WordToDisambiguate{
+
+ public WTDLesk(String[] sentence, int wordIndex) {
+ super(sentence,wordIndex,-1);
+ }
+
+
+
+
+}
\ No newline at end of file
Added: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java?rev=1687358&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java Wed Jun 24 20:19:05 2015
@@ -0,0 +1,83 @@
+package opennlp.tools.disambiguator;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.Loader;
+import opennlp.tools.disambiguator.WordSense;
+import opennlp.tools.disambiguator.ims.FeaturesExtractor;
+import opennlp.tools.disambiguator.lesk.Lesk;
+import opennlp.tools.disambiguator.lesk.LeskParameters;
+import opennlp.tools.disambiguator.lesk.WTDLesk;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+
+public class Tester {
+
+ @Test
+ public static void main(String[] args) {
+
+
+ String sentence = "I went fishing for some sea bass.";
+ TokenizerModel TokenizerModel;
+
+ try {
+ TokenizerModel = new TokenizerModel(new FileInputStream("src\\test\\resources\\opennlp\\tools\\disambiguator\\en-token.bin"));
+ Tokenizer tokenizer = new TokenizerME(TokenizerModel);
+
+ String[] words = tokenizer.tokenize(sentence);
+
+ POSModel posTaggerModel = new POSModelLoader().load(new File("src\\test\\resources\\opennlp\\tools\\disambiguator\\en-pos-maxent.bin"));
+ POSTagger tagger = new POSTaggerME(posTaggerModel);
+
+
+ Constants.print("\ntokens :");
+ Constants.print(words);
+ Constants.print(tagger.tag(words));
+
+ Constants.print("\ntesting default lesk :");
+ Lesk lesk = new Lesk();
+ Constants.print(lesk.disambiguate(words, 6));
+
+ Constants.print("\ntesting with null params :");
+ lesk.setParams(null);
+ Constants.print(lesk.disambiguate(words, 6));
+
+ Constants.print("\ntesting with default params");
+ lesk.setParams(new LeskParameters());
+ Constants.print(lesk.disambiguate(words, 6));
+
+ Constants.print("\ntesting with custom params :");
+ LeskParameters leskParams = new LeskParameters();
+ leskParams.leskType = LeskParameters.LESK_TYPE.LESK_BASIC_CTXT_WIN_BF;
+ leskParams.win_b_size = 4;
+ leskParams.depth = 3;
+ lesk.setParams(leskParams);
+ Constants.print(lesk.disambiguate(words, 6));
+
+ /*
+ Constants.print("\ntesting with wrong params should throw exception :");
+ LeskParameters leskWrongParams = new LeskParameters();
+ leskWrongParams.depth = -1;
+ lesk.setParams(leskWrongParams);
+ Constants.print(lesk.disambiguate(words, 6));
+ */
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+
+ }
+
+}