You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2015/07/28 11:16:26 UTC

svn commit: r1693045 - in /opennlp/sandbox/opennlp-wsd/src: main/java/opennlp/tools/disambiguator/ main/java/opennlp/tools/disambiguator/DatasetsReader/ main/java/opennlp/tools/disambiguator/ims/ test/java/opennlp/tools/disambiguator/

Author: joern
Date: Tue Jul 28 09:16:25 2015
New Revision: 1693045

URL: http://svn.apache.org/r1693045
Log:
OPENNLP-790 
- Fix for the IMS approach to support Semcor3.0 data
- The output format is now [Source SenseKey] so it corresponds to that of Lesk.
- Removed some unused variables.
- Added some parameters to let the user select the source of data they want to use.
- Implemented the IMS Evaluator.
- Added and clarified some parts of the documentation.

Thanks to Mondher Bouazizi for providing a patch.

Added:
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java   (with props)
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java   (with props)
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java   (with props)
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java   (with props)
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java   (with props)
    opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java   (with props)
Modified:
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java Tue Jul 28 09:16:25 2015
@@ -19,8 +19,14 @@
 
 package opennlp.tools.disambiguator;
 
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 
 import opennlp.tools.disambiguator.lesk.Lesk;
 import net.sf.extjwnl.JWNLException;
@@ -28,6 +34,11 @@ import net.sf.extjwnl.data.POS;
 
 public class Constants {
 
+  private static String resourcesFolder = "src\\test\\resources\\";
+
+  private static String englishDict = resourcesFolder
+      + "models\\en-lemmatizer.dict";
+
   public static String osPathChar = "\\";
 
   // List of all the PoS tags
@@ -133,16 +144,19 @@ public class Constants {
     if (results != null) {
 
       if (disambiguator instanceof Lesk) {
+        POS pos;
+        long offset;
+        double score;
         String[] parts;
 
         for (String result : results) {
-          parts = result.split(" ");
+          parts = result.split("@");
+          pos = POS.getPOSForKey(parts[0]);
+          offset = Long.parseLong(parts[1]);
+          score = Double.parseDouble(parts[3]);
           try {
-            Constants.print("score : "
-                + parts[2]
-                + " for : "
-                + Loader.getDictionary().getWordBySenseKey(parts[1])
-                    .getSynset().getGloss());
+            Constants.print("score : " + score + " for : "
+                + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
           } catch (JWNLException e) {
             e.printStackTrace();
           }
@@ -183,7 +197,60 @@ public class Constants {
     }
   }
 
-  // return the PoS (Class POS) out of the PoS-tag
+  /**
+   * Extract the list of ALL English words (first column of every line).
+   * 
+   * @param dict
+   *          path of the tab-separated dictionary used by the simple
+   *          Lemmatizer (i.e.,"en-lemmatizer.dict"); when it is null or does
+   *          not name an existing file the default dictionary is read instead
+   * 
+   * @return a map whose keys are the English words (values are always null)
+   */
+  public static HashMap<String, Object> getEnglishWords(String dict) {
+
+    HashMap<String, Object> words = new HashMap<String, Object>();
+
+    // Honour the caller's path (it used to be ignored), but keep the old
+    // default as a fallback so existing callers see the same file as before.
+    File file = (dict == null) ? new File(englishDict) : new File(dict);
+    if (!file.exists()) {
+      file = new File(englishDict);
+    }
+
+    if (file.exists()) {
+      BufferedReader br = null;
+      try {
+        br = new BufferedReader(new FileReader(file));
+        // Read-then-test so the first line is no longer skipped.
+        for (String line = br.readLine(); line != null; line = br.readLine()) {
+          words.put(line.split("\\t")[0], null);
+        }
+      } catch (FileNotFoundException e) {
+        e.printStackTrace();
+      } catch (IOException e) {
+        e.printStackTrace();
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+        }
+      }
+    }
+
+    return words;
+  }
+
+  /**
+   * return the PoS (Class POS) out of the PoS-tag
+   * 
+   * @param posTag
+   *          PoS tag (e.g., "JJS", "NNP", etc.)
+   * @return the Part of Speech (type {@link POS})
+   */
   public static POS getPOS(String posTag) {
 
     ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ",
@@ -208,16 +275,73 @@ public class Constants {
 
   }
 
+  /**
+   * Check whether a PoS Tag is relevant of not. A PoS Tag is considered
+   * relevant when it corresponds to:
+   * <ul>
+   * <li>VERB</li>
+   * <li>ADJECTIVE</li>
+   * <li>ADVERB</li>
+   * <li>NOUN</li>
+   * </ul>
+   * 
+   * @param posTag
+   *          the PoS Tag to verify the relevance.
+   * @return whether a PoS Tag corresponds to a relevant Part of Speech (type
+   *         {@link POS}) or not ( true} if it is, false} otherwise)
+   */
   public static boolean isRelevant(String posTag) {
     return getPOS(posTag) != null;
   }
 
+  /**
+   * Check whether a PoS Tag is relevant of not. A PoS Tag is considered
+   * relevant when it is:
+   * <ul>
+   * <li>VERB</li>
+   * <li>ADJECTIVE</li>
+   * <li>ADVERB</li>
+   * <li>NOUN</li>
+   * </ul>
+   * 
+   * @param pos
+   *          The Part of Speech of Type {@link POS}
+   * @return whether a Part of Speech is relevant (true) or not (false)
+   */
   public static boolean isRelevant(POS pos) {
     return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
         || pos.equals(POS.NOUN) || pos.equals(POS.VERB);
   }
 
-  // Check whether a list of arrays contains an array
+  /** Maps a PoS tag to "a", "r", "v" or "n"; null when none of them fits. */
+  public static String getPOSabbreviation(String posTag) {
+    if (posTag == null) {
+      return null;
+    }
+
+    String abbreviation = null;
+    if (posTag.startsWith("JJ")) {
+      abbreviation = "a";
+    } else if (posTag.startsWith("RB")) {
+      abbreviation = "r";
+    } else if (posTag.startsWith("VB") || posTag.equals("MD")) {
+      abbreviation = "v";
+    } else if (posTag.startsWith("NN")) {
+      abbreviation = "n";
+    }
+    return abbreviation;
+  }
+
+  /**
+   * Check whether a list of arrays contains an array
+   * 
+   * @param array
+   *          The array To check
+   * @param fullList
+   *          The full list of Arrays
+   * @return whether the {@link ArrayList} of arrays contains the array (true)
+   *         or not (false)
+   */
   public static boolean belongsTo(String[] array, ArrayList<String[]> fullList) {
     for (String[] refArray : fullList) {
       if (areStringArraysEqual(array, refArray))
@@ -226,7 +350,15 @@ public class Constants {
     return false;
   }
 
-  // Check whether two arrays of strings are equal
+  /**
+   * Check whether two arrays of strings are equal
+   * 
+   * @param array1
+   *          first array
+   * @param array2
+   *          second array
+   * @return whether the two arrays are identical (true) or not (false)
+   */
   public static boolean areStringArraysEqual(String[] array1, String[] array2) {
 
     if (array1.equals(null) || array2.equals(null))
@@ -244,4 +376,5 @@ public class Constants {
     return true;
 
   }
+
 }

Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.util.ArrayList;
+
+public class IParagraph {
+
+  protected int pnum;
+  protected ArrayList<ISentence> isentences;
+
+  public IParagraph() {
+    super();
+    this.isentences = new ArrayList<ISentence>();
+  }
+
+  public IParagraph(int pnum) {
+    super();
+    this.pnum = pnum;
+    this.isentences = new ArrayList<ISentence>();
+  }
+
+  public IParagraph(int pnum, ArrayList<ISentence> sentences) {
+    super();
+    this.pnum = pnum;
+    this.isentences = sentences;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public ArrayList<ISentence> getSsentences() {
+    return isentences;
+  }
+
+  public void setIsentences(ArrayList<ISentence> isentences) {
+    this.isentences = isentences;
+  }
+
+  public void addIsentence(ISentence isentence) {
+    this.isentences.add(isentence);
+  }
+
+  /** Joins the sentences with single spaces; an empty paragraph gives "". */
+  @Override
+  public String toString() {
+    StringBuilder paragraph = new StringBuilder();
+    for (int i = 0; i < this.isentences.size(); i++) {
+      paragraph.append(i == 0 ? "" : " ").append(this.isentences.get(i));
+    }
+    return paragraph.toString();
+  }
+
+  /**
+   * This returns TRUE if and only if the paragraph contains the word and it is
+   * sense-tagged
+   * 
+   * @param wordTag
+   *          the word to look for, in the "lemma.tag" form (tag: a, r, v or n)
+   * @return {@code true} if the word exists in the paragraph and is
+   *         sense-tagged
+   */
+  public boolean contains(String wordTag) {
+
+    for (ISentence isentence : this.getSsentences()) {
+      for (IWord iword : isentence.getIwords()) {
+        // was iword.equals(iword), which is true for any non-empty paragraph
+        if (iword.isInstanceOf(wordTag))
+          return true;
+      }
+    }
+    return false;
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.util.ArrayList;
+
+public class ISentence {
+
+  protected int pnum;
+  protected int snum;
+  protected ArrayList<IWord> iwords;
+
+  public ISentence() {
+    super();
+    this.iwords = new ArrayList<IWord>();
+  }
+
+  public ISentence(int pnum, int snum) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = new ArrayList<IWord>();
+  }
+
+  public ISentence(int pnum, int snum, ArrayList<IWord> iwords) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = iwords;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  public ArrayList<IWord> getIwords() {
+    return iwords;
+  }
+
+  public void setIwords(ArrayList<IWord> iwords) {
+    this.iwords = iwords;
+  }
+
+  public void addIword(IWord iword) {
+    this.iwords.add(iword);
+  }
+
+  /** Joins the words with single spaces; an empty sentence gives "". */
+  @Override
+  public String toString() {
+    StringBuilder sentence = new StringBuilder();
+    for (int i = 0; i < this.iwords.size(); i++) {
+      sentence.append(i == 0 ? "" : " ").append(this.iwords.get(i));
+    }
+    return sentence.toString();
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import opennlp.tools.disambiguator.Constants;
+
+public class IWord {
+
+  public static enum Type {
+    WORD(1, "word"), PUNCTUATIONMARK(2, "pm");
+
+    public int code;
+    public String type;
+
+    private Type(int code, String type) {
+      this.code = code;
+      this.type = type;
+    }
+  }
+
+  protected int pnum;
+  protected int snum;
+  protected int wnum;
+
+  // Type refers to the type of word in the sentence
+  protected Type type;
+
+  protected String word;
+  protected String cmd;
+  protected String pos;
+  protected String lemma;
+  protected String wnsn;
+  protected String lexsn;
+
+  public IWord() {
+    super();
+  }
+
+  public IWord(String lemma, String pos) {
+    super();
+    this.word = lemma;
+    this.lemma = lemma;
+    this.pos = pos;
+  }
+
+  /**
+   * This serves to create a DISAMBIGUATED word instance
+   * 
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          the type in this case is {@link Type.DWORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   * @param lemma
+   *          The lemma of the word
+   * @param wnsn
+   *          The integer sense number corresponding to the WordNet output
+   *          display
+   * @param lexsn
+   *          The "Sense_key" that indicates the WordNet sense to which word
+   *          should be linked
+   * 
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos, String lemma, String wnsn, String lexsn) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.wnum = wnum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+    this.lemma = lemma;
+    this.wnsn = wnsn;
+    this.lexsn = lexsn;
+  }
+
+  /**
+   * This serves to create a NON DISAMBIGUATED word instance
+   * 
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param type
+   *          the type in this case is {@link Type.DWORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   * 
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos) {
+    super();
+    this.wnum = wnum;
+    this.pnum = pnum;
+    this.snum = snum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+  }
+
+  /**
+   * Creates a punctuation instance located by paragraph, sentence and token id
+   * 
+   * @param type
+   *          The type as in {@link Type}
+   * @param word
+   *          The punctuation mark, as it appears in the sentence
+   */
+  public IWord(int pnum, int snum, int wnum, Type type, String word) {
+    this.pnum = pnum;
+    this.snum = snum;
+    this.wnum = wnum; // was never stored, so getWnum() always returned 0
+    this.type = type;
+    this.word = word;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  public int getWnum() {
+    return wnum;
+  }
+
+  public void setWnum(int wnum) {
+    this.wnum = wnum;
+  }
+
+  public String getWord() {
+    return word;
+  }
+
+  public void setWord(String word) {
+    this.word = word;
+  }
+
+  public Type getType() {
+    return type;
+  }
+
+  public void setType(Type type) {
+    this.type = type;
+  }
+
+  public String getCmd() {
+    return cmd;
+  }
+
+  public void setCmd(String cmd) {
+    this.cmd = cmd;
+  }
+
+  public String getPos() {
+    return pos;
+  }
+
+  public void setPos(String pos) {
+    this.pos = pos;
+  }
+
+  public String getLemma() {
+    return lemma;
+  }
+
+  public void setLemma(String lemma) {
+    this.lemma = lemma;
+  }
+
+  public String getWnsn() {
+    return wnsn;
+  }
+
+  public void setWnsn(String wnsn) {
+    this.wnsn = wnsn;
+  }
+
+  public String getLexsn() {
+    return lexsn;
+  }
+
+  public void setLexsn(String lexsn) {
+    this.lexsn = lexsn;
+  }
+
+  @Override
+  public String toString() {
+    return this.word;
+  }
+
+  /**
+   * Lemma + PoS equality when both lemmas are set, else raw word + PoS
+   * abbreviation; a missing word/PoS now only matches a missing one (no NPE).
+   */
+  @Override
+  public boolean equals(Object oword) {
+    if (oword == this)
+      return true;
+    if (!(oword instanceof IWord))
+      return false;
+    IWord iword = (IWord) oword;
+    if (this.lemma != null && iword.getLemma() != null) {
+      if (!this.lemma.equals(iword.getLemma()))
+        return false;
+      Object pos = Constants.getPOS(this.getPos());
+      Object opos = Constants.getPOS(iword.getPos());
+      return pos == null ? opos == null : pos.equals(opos);
+    }
+    String tag = Constants.getPOSabbreviation(this.getPos());
+    String otag = Constants.getPOSabbreviation(iword.getPos());
+    return (this.word == null ? iword.getWord() == null
+        : this.word.equals(iword.getWord()))
+        && (tag == null ? otag == null : tag.equals(otag));
+  }
+
+  /**
+   * Tells whether this is a sense-tagged occurrence of {@code wordTag}, given
+   * as "lemma.tag" (tag as returned by Constants.getPOSabbreviation).
+   */
+  public boolean isInstanceOf(String wordTag) {
+    if (wordTag == null || this.lemma == null || this.lexsn == null) {
+      return false;
+    }
+    String[] parts = wordTag.split("\\.");
+    if (parts.length < 2) {
+      return false; // malformed tag: used to raise an index error
+    }
+    // constant-first equals: the abbreviation is null outside a/r/v/n
+    return this.lemma.equals(parts[0])
+        && parts[1].equals(Constants.getPOSabbreviation(this.getPos()));
+  }
+
+  /**
+   * Same lemma, PoS and sense key; a word lacking any of them never matches a
+   * different instance (this used to end in a NullPointerException).
+   */
+  public boolean senseEquals(Object oword) {
+    if (oword == this)
+      return true;
+    if (!(oword instanceof IWord))
+      return false;
+    IWord iword = (IWord) oword;
+    if (this.lemma == null || this.lexsn == null
+        || !this.lemma.equals(iword.getLemma())
+        || !this.lexsn.equals(iword.getLexsn())) {
+      return false;
+    }
+    Object pos = Constants.getPOS(this.getPos());
+    return pos != null && pos.equals(Constants.getPOS(iword.getPos()));
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * This reads one semcor file. It requires the
+ *
+ */
+public class SemcorReaderExtended {
+
+  private static final String ELEMENT_CONTEXTFILE = "contextfile";
+  private static final String ATTRIBUTE_CONCORDANCE = "concordance";
+
+  private static final String ELEMENT_CONTEXT = "context";
+  private static final String ATTRIBUTE_FILENAME = "filename";
+  private static final String ATTRIBUTE_PARAS = "paras";
+
+  private static final String ELEMENT_PARAGRAPH = "p";
+  private static final String ATTRIBUTE_PARAGRAPHNUM = "pnum";
+
+  private static final String ELEMENT_SENTENCE = "s";
+  private static final String ATTRIBUTE_SENTENCENUM = "snum";
+
+  private static final String ELEMENT_WORDFORM = "wf";
+  private static final String ATTRIBUTE_CMD = "cmd";
+  private static final String ATTRIBUTE_RDF = "rdf";
+  private static final String ATTRIBUTE_POS = "pos";
+  private static final String ATTRIBUTE_LEMMA = "lemma";
+  private static final String ATTRIBUTE_WNSN = "wnsn";
+  private static final String ATTRIBUTE_LEXSN = "lexsn";
+
+  private static final String ELEMENT_PUNCTUATION = "punc";
+
+  private static String path = "src\\test\\resources\\semcor3.0\\";
+  private static String[] folders = { "brown1", "brown2", "brownv" };
+  private static String tagfiles = "\\tagfiles\\";
+
+  public SemcorReaderExtended() {
+    super();
+  }
+
+  /**
+   * This serves to read one Semcor XML file
+   */
+  public ArrayList<ISentence> readFile(String file) {
+
+    ArrayList<ISentence> result = new ArrayList<ISentence>();
+
+    try {
+
+      File xmlFile = new File(file);
+      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+      Document doc = dBuilder.parse(xmlFile);
+
+      doc.getDocumentElement().normalize();
+
+      NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);
+
+      for (int i = 0; i < paragraphs.getLength(); i++) {
+
+        Node nParagraph = paragraphs.item(i);
+
+        if (nParagraph.getNodeType() == Node.ELEMENT_NODE) {
+
+          Element eParagraph = (Element) nParagraph;
+          // THE PARAGRAPH ID
+          int paragraphID = Integer.parseInt(eParagraph
+              .getAttribute(ATTRIBUTE_PARAGRAPHNUM));
+
+          NodeList nSentences = nParagraph.getChildNodes();
+
+          for (int j = 1; j < nSentences.getLength(); j++) {
+
+            Node nSentence = nSentences.item(j);
+            if (nSentence.getNodeType() == Node.ELEMENT_NODE) {
+
+              Element eSentence = (Element) nSentence;
+              // THE SENTENCE ID
+              int sentenceID = Integer.parseInt(eSentence
+                  .getAttribute(ATTRIBUTE_SENTENCENUM));
+              ISentence isentence = new ISentence(paragraphID, sentenceID);
+
+              NodeList nWords = nSentence.getChildNodes();
+
+              int wnum = 0;
+              for (int k = 0; k < nWords.getLength(); k++) {
+                Node nWord = nWords.item(k);
+
+                if (nWord.getNodeType() == Node.ELEMENT_NODE) {
+
+                  if (nWord.getNodeName().equals(ELEMENT_WORDFORM)) {
+
+                    Element eWord = (Element) nWord;
+
+                    if (eWord.getAttribute(ATTRIBUTE_CMD).equals("done")) {
+                      // if the word is already disambiguated
+                      String word = eWord.getTextContent();
+                      String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
+                      String pos = eWord.getAttribute(ATTRIBUTE_POS);
+                      String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
+                      String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
+                      String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
+
+                      IWord iword = new IWord(paragraphID, sentenceID, wnum,
+                          IWord.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn);
+                      isentence.addIword(iword);
+                      wnum++;
+
+                      // System.out.println("*** " + iword.toString() + " ***");
+
+                    } else {
+                      // if the word is not disambiguated
+                      String word = eWord.getTextContent();
+                      String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
+                      String pos = eWord.getAttribute(ATTRIBUTE_POS);
+
+                      IWord iword = new IWord(paragraphID, sentenceID, wnum,
+                          IWord.Type.WORD, word, cmd, pos);
+                      isentence.addIword(iword);
+                      wnum++;
+                    }
+
+                  } else if (nWord.getNodeName().equals(ELEMENT_PUNCTUATION)) {
+                    Element eWord = (Element) nWord;
+                    String word = eWord.getTextContent();
+                    IWord iword = new IWord(paragraphID, sentenceID, wnum,
+                        IWord.Type.PUNCTUATIONMARK, word);
+                    isentence.addIword(iword);
+                    wnum++;
+                  }
+
+                }
+
+              }
+              result.add(isentence);
+            }
+          }
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return result;
+  }
+
+  /**
+   * Collects the sense-tagged occurrences of a word in one Semcor file. Each
+   * instance is given up to three sentences of context: the previous, the
+   * current and the next one, shifted at the start/end of the file.
+   * 
+   * @param file
+   *          path of the Semcor XML file
+   * @param wordTag
+   *          the target, passed to {@link IWord#isInstanceOf(String)}
+   * @return the {@link WordToDisambiguate} instances found (possibly empty)
+   */
+  public ArrayList<WordToDisambiguate> getSemcorOneFileData(String file,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+    try {
+
+      ArrayList<ISentence> isentences = readFile(file);
+      int total = isentences.size();
+      for (int j = 0; j < total; j++) {
+        ArrayList<IWord> iwords = isentences.get(j).getIwords();
+        for (int k = 0; k < iwords.size(); k++) {
+          IWord iword = iwords.get(k);
+          if (!iword.isInstanceOf(wordTag)) {
+            continue;
+          }
+
+          // Window [first, last): j-1..j+1 in the middle, the first or last
+          // three sentences at the edges, and simply every sentence when the
+          // file has fewer than three (this used to raise an index error
+          // that aborted the file before any instance was collected).
+          int first = Math.max(0, Math.min(j - 1, total - 3));
+          int last = Math.min(total, first + 3);
+
+          StringBuilder sentence = new StringBuilder();
+          int index = k;
+          for (int s = first; s < last; s++) {
+            if (s > first) {
+              sentence.append(" ");
+            }
+            sentence.append(isentences.get(s).toString());
+            if (s < j) {
+              // tokens of the preceding sentences shift the target position
+              index += isentences.get(s).getIwords().size();
+            }
+          }
+
+          String sense = iword.getLexsn();
+          if (sense != null) {
+            ArrayList<String> senses = new ArrayList<String>();
+            senses.add(sense);
+            WordToDisambiguate wtd = new WordToDisambiguate(
+                sentence.toString().split("\\s"), index, senses);
+            setInstances.add(wtd);
+          }
+        }
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return setInstances;
+  }
+
+  /**
+   * One Semcor folder reader: This reads all the files in one semcor folder,
+   * and returns all the instances in the format {@link WordToDisambiguate} of
+   * a specific word
+   * 
+   * @param folder
+   *          the name of the folder. Three folders exist in Semcor3.0, which
+   *          are ["brown1", "brown2", "brownv"]
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances
+   */
+  public ArrayList<WordToDisambiguate> getSemcorFolderData(String folder,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    String directory = path + folder + tagfiles;
+    // listFiles() yields null when the folder is missing or unreadable
+    File[] listOfFiles = new File(directory).listFiles();
+
+    if (listOfFiles != null) {
+      for (File file : listOfFiles) {
+        if (!file.isFile()) {
+          continue; // sub-directories cannot be parsed as Semcor XML
+        }
+        result.addAll(getSemcorOneFileData(directory + file.getName(),
+            wordTag));
+      }
+    }
+
+    return result;
+
+  }
+
+  /**
+   * Semcor reader: This reads all the files in semcor, and return all the
+   * instances in the format {@link WordToDisambiguate} of a specific word
+   * 
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances of the word to
+   *         disambiguate
+   */
+  public ArrayList<WordToDisambiguate> getSemcorData(String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    for (String folder : folders) {
+      ArrayList<WordToDisambiguate> list = getSemcorFolderData(folder, wordTag);
+      result.addAll(list);
+    }
+
+    return result;
+
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.DatasetsReader;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+/**
+ * This class handles the extraction of Senseval-3 data from the different files
+ * (training data, dictionary instances, etc.)
+ */
+public class SensevalReader {
+
+  private String resourcesFolder = "src\\test\\resources\\";
+  protected String sensevalDirectory = resourcesFolder + "senseval3\\";
+
+  protected String data = sensevalDirectory + "EnglishLS.train";
+  protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+  protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+
+  // protected String dict = sensevalDirectory + "EnglishLS.dictionary.xml";
+  // protected String map = sensevalDirectory + "EnglishLS.sensemap";
+
+  /**
+   * The XML file of Senseval presents some issues that need to be fixed first
+   */
+  private String fixXmlFile() {
+
+    // TODO fix this !
+
+    return null;
+  }
+
+  public SensevalReader() {
+    super();
+  }
+
+  /**
+   * This extracts the equivalent senses. This serves in the case of the
+   * coarse-grained disambiguation
+   * 
+   * The senses are read from the file named by the {@code sensemapFile}
+   * field: the file containing the equivalent senses, each set of equivalent
+   * senses per line.
+   * @return a {@link HashMap} containing the new sense ID ({@link Integer}) and
+   *         an {@link ArrayList} of the equivalent senses original IDs
+   */
+  public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
+
+    HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();
+
+    try (BufferedReader wordsList = new BufferedReader(new FileReader(
+        sensemapFile))) {
+
+      int index = 0;
+
+      String line;
+
+      while ((line = wordsList.readLine()) != null) {
+
+        String[] temp = line.split("\\s");
+
+        ArrayList<String> tempSenses = new ArrayList<String>();
+
+        for (String sense : temp) {
+          if (sense.length() > 1) {
+            tempSenses.add(sense);
+          }
+        }
+
+        mappedSenses.put(index, tempSenses);
+        index++;
+
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    return mappedSenses;
+
+  }
+
+  /**
+   * This returns the list of words available in the Senseval data
+   * 
+   * @return {@link ArrayList} of the words available on the current Senseval
+   *         set
+   */
+  public ArrayList<String> getSensevalWords() {
+
+    ArrayList<String> wordTags = new ArrayList<String>();
+
+    try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {
+
+      String line;
+
+      while ((line = br.readLine()) != null) {
+
+        String word = line.split("\\s")[0];
+
+        if (!wordTags.contains(word)) {
+          wordTags.add(word);
+        }
+
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    return wordTags;
+
+  }
+
+  /**
+   * Main Senseval Reader: This checks if the data corresponding to the words to
+   * disambiguate exist in the folder, and extracts the
+   * {@link WordToDisambiguate} instances
+   * 
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances of the word to
+   *         disambiguate
+   */
+  public ArrayList<WordToDisambiguate> getSensevalData(String wordTag) {
+
+    ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+    try {
+
+      File xmlFile = new File(data);
+      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+      Document doc = dBuilder.parse(xmlFile);
+
+      doc.getDocumentElement().normalize();
+
+      NodeList lexelts = doc.getElementsByTagName("lexelt");
+
+      for (int i = 0; i < lexelts.getLength(); i++) {
+
+        Node nLexelt = lexelts.item(i);
+
+        if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
+          Element eLexelt = (Element) nLexelt;
+
+          if (eLexelt.getAttribute("item").equals(wordTag)) {
+
+            NodeList nInstances = nLexelt.getChildNodes();
+
+            for (int j = 1; j < nInstances.getLength(); j++) {
+
+              Node nInstance = nInstances.item(j);
+
+              if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
+
+                Element eInstance = (Element) nInstance;
+
+                String[] wordPos = eLexelt.getAttribute("item").split("\\.");
+                String word = wordPos[0]; // Word
+                String tag; // Part of Speech
+
+                if (wordPos[1].equals("n")) {
+                  tag = "noun";
+                } else if (wordPos[1].equals("v")) {
+                  tag = "verb";
+                } else if (wordPos[1].equals("a")) {
+                  tag = "adjective";
+                } else {
+                  tag = "adverb";
+                }
+
+                String id = eInstance.getAttribute("id");
+                String source = eInstance.getAttribute("docsrc");
+
+                ArrayList<String> answers = new ArrayList<String>();
+                String sentence = "";
+                String rawWord = "";
+
+                NodeList nChildren = nInstance.getChildNodes();
+
+                for (int k = 1; k < nChildren.getLength(); k++) {
+                  Node nChild = nChildren.item(k);
+
+                  if (nChild.getNodeName().equals("answer")) {
+                    // String answer =
+                    // nChild.getAttributes().item(0).getTextContent();
+                    String senseid = nChild.getAttributes().item(1)
+                        .getTextContent();
+
+                    String temp = senseid;
+                    // String[] temp = { answer, senseid };
+                    answers.add(temp);
+                  }
+
+                  if (nChild.getNodeName().equals("context")) {
+                    sentence = ((Element) nChild).getTextContent();
+
+                    if (nChild.hasChildNodes()) {
+                      // textbefore =
+                      // nChild.getChildNodes().item(0).getTextContent();
+                      rawWord = nChild.getChildNodes().item(1).getTextContent();
+                      // textAfter =
+                      // nChild.getChildNodes().item(2).getTextContent();
+                    }
+                  }
+
+                }
+
+                WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
+                    rawWord);
+                setInstances.add(wordToDisambiguate);
+              }
+            }
+
+          }
+
+        }
+
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return setInstances;
+
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java Tue Jul 28 09:16:25 2015
@@ -44,17 +44,13 @@ import opennlp.tools.disambiguator.ims.W
  * check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details
  * about this approach
  */
-
 public class FeaturesExtractor {
 
-  /**
-   * Constructor
-   */
   public FeaturesExtractor() {
     super();
   }
 
-  // IMS approach
+  // IMS
 
   private String[] extractPosOfSurroundingWords(String[] sentence,
       int wordIndex, int windowSize) {
@@ -230,4 +226,4 @@ public class FeaturesExtractor {
 
   // SST approach
 
-}
\ No newline at end of file
+}

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java Tue Jul 28 09:16:25 2015
@@ -30,6 +30,7 @@ import net.sf.extjwnl.data.POS;
 import net.sf.extjwnl.dictionary.Dictionary;
 import net.sf.extjwnl.dictionary.MorphologicalProcessor;
 import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
 import opennlp.tools.lemmatizer.SimpleLemmatizer;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
@@ -44,7 +45,7 @@ import opennlp.tools.util.InvalidFormatE
 
 public class Loader {
 
-  private static DataExtractor dExtractor = new DataExtractor();
+  private static SensevalReader dExtractor = new SensevalReader();
 
   private static String modelsDir = "src\\test\\resources\\models\\";
 
@@ -64,7 +65,6 @@ public class Loader {
 
   private static HashMap<String, Object> englishWords;
 
-  // Constructor
   public Loader() {
     super();
     load();
@@ -102,8 +102,8 @@ public class Loader {
 
   public static HashMap<String, Object> getEnglishWords() {
     if (englishWords == null || englishWords.keySet().isEmpty()) {
-      englishWords = dExtractor.getEnglishWords(modelsDir
-          + "en-lemmatizer.dict");
+      englishWords = Constants
+          .getEnglishWords(modelsDir + "en-lemmatizer.dict");
     }
     return englishWords;
   }

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java Tue Jul 28 09:16:25 2015
@@ -20,6 +20,7 @@
 package opennlp.tools.disambiguator;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import net.sf.extjwnl.data.POS;
 
@@ -53,6 +54,22 @@ public class WordToDisambiguate {
     this.sense = sense;
   }
 
+  public WordToDisambiguate(String[] sentence, int wordIndex,
+      ArrayList<String> senseIDs) throws IllegalArgumentException {
+    super();
+
+    if (wordIndex > sentence.length) {
+      throw new IllegalArgumentException("The index is out of bounds !");
+    }
+
+    this.sentence = sentence;
+    this.posTags = PreProcessor.tag(sentence);
+
+    this.wordIndex = wordIndex;
+
+    this.senseIDs = senseIDs;
+  }
+
   public WordToDisambiguate(String[] sentence, int wordIndex) {
     this(sentence, wordIndex, -1);
   }
@@ -66,7 +83,7 @@ public class WordToDisambiguate {
     this.sense = -1;
 
   }
-   
+
   // Sentence
   public String[] getSentence() {
     return sentence;
@@ -97,15 +114,17 @@ public class WordToDisambiguate {
 
     String ref = "";
 
-    if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
-      ref = wordBaseForm + ".v";
-    } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
-      ref = wordBaseForm + ".n";
-    } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADJECTIVE)) {
-      ref = wordBaseForm + ".a";
-    } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
-      ref = wordBaseForm + ".r";
-    } else {
+    if ((Constants.getPOS(this.posTags[wordIndex]) != null)) {
+      if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
+        ref = wordBaseForm + ".v";
+      } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
+        ref = wordBaseForm + ".n";
+      } else if (Constants.getPOS(this.posTags[wordIndex])
+          .equals(POS.ADJECTIVE)) {
+        ref = wordBaseForm + ".a";
+      } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
+        ref = wordBaseForm + ".r";
+      }
 
     }
 
@@ -147,5 +166,11 @@ public class WordToDisambiguate {
   public String toString() {
     return (wordIndex + "\t" + getWord() + "\n" + sentence);
   }
+  
+  public void print() {
+    Constants.print("Sentence:  " + Arrays.asList(sentence) + "\n" + 
+        "Index: " + wordIndex + "\n" + 
+        "Word: "+ getWord() + "\n" +
+        "Sense ID: " + senseIDs.get(0));
+  }
 }
-

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java Tue Jul 28 09:16:25 2015
@@ -41,24 +41,25 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.security.InvalidParameterException;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.zip.GZIPInputStream;
 
-import opennlp.tools.disambiguator.DictionaryInstance;
+import net.sf.extjwnl.JWNLException;
 import net.sf.extjwnl.data.POS;
 import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.data.Word;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.disambiguator.Constants;
-import opennlp.tools.disambiguator.DataExtractor;
 import opennlp.tools.disambiguator.FeaturesExtractor;
-import opennlp.tools.disambiguator.PreProcessor;
 import opennlp.tools.disambiguator.WSDParameters;
 import opennlp.tools.disambiguator.WordPOS;
 import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.DatasetsReader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
 
 /**
  * Implementation of the <b>It Makes Sense</b> approach originally proposed in
@@ -80,85 +81,108 @@ public class IMS implements WSDisambigua
   private final IMSContextGenerator cg;
 
   private FeaturesExtractor fExtractor = new FeaturesExtractor();
-  private DataExtractor dExtractor = new DataExtractor();
 
+  /**
+   * Sets the input parameters to the default ones
+   * 
+   * @throws InvalidParameterException
+   */
   public IMS() {
     super();
+    // Loader loader = new Loader();
     this.parameters = new IMSParameters();
-    ;
     this.cg = parameters.createContextGenerator();
   }
 
+  /**
+   * Sets the input parameters and creates the context generator from them
+   * 
+   * @param parameters
+   *          The parameters to be used
+   * @throws InvalidParameterException
+   */
   public IMS(IMSParameters parameters) {
     super();
     this.parameters = parameters;
     this.cg = this.parameters.createContextGenerator();
   }
 
-  // Internal Methods
-  private String getTrainingFileName(WTDIMS wtd) {
-
-    String wordBaseForm = PreProcessor
-        .lemmatize(wtd.getWord(), wtd.getPosTag());
+  /**
+   * Returns the parameter settings of the IMS object.
+   * 
+   * @return the parameter settings
+   */
+  @Override
+  public WSDParameters getParams() {
+    return this.parameters;
+  }
 
-    String ref = "";
+  /**
+   * Returns the parameter settings of the IMS object. The returned parameters
+   * are of type {@link IMSParameters}
+   * 
+   * @return the parameter settings
+   */
+  public IMSParameters getParameters() {
+    return this.parameters;
+  }
 
-    if (Constants.getPOS(wtd.getPosTag()).equals(POS.VERB)) {
-      ref = wordBaseForm + ".v";
-    } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.NOUN)) {
-      ref = wordBaseForm + ".n";
-    } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.ADJECTIVE)) {
-      ref = wordBaseForm + ".a";
-    } else if (Constants.getPOS(wtd.getPosTag()).equals(POS.ADVERB)) {
-      ref = wordBaseForm + ".r";
+  /**
+   * If the parameters are null, set the default ones. Otherwise, only set them
+   * if they are valid. Invalid parameters will throw an exception (and set the
+   * parameters to the default ones)
+   * 
+   * @param parameters
+   *          the input parameters
+   * @throws InvalidParameterException
+   */
+  @Override
+  public void setParams(WSDParameters parameters)
+      throws InvalidParameterException {
+    if (parameters == null) {
+      this.parameters = new IMSParameters();
     } else {
-
+      if (parameters.isValid()) {
+        this.parameters = (IMSParameters) parameters;
+      } else {
+        this.parameters = new IMSParameters();
+        throw new InvalidParameterException("wrong parameters");
+      }
     }
 
-    return ref;
   }
 
-  private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingData,
-      String wordTag) {
-
-    ArrayList<String> surrWords = fExtractor
-        .extractTrainingSurroundingWords(trainingData);
-
-    File file = new File(parameters.getTrainingDataDirectory() + wordTag
-        + ".sw");
-    if (!file.exists()) {
-      try {
-
-        file.createNewFile();
-
-        FileWriter fw = new FileWriter(file.getAbsoluteFile());
-        BufferedWriter bw = new BufferedWriter(fw);
-
-        for (String surrWord : surrWords) {
-          bw.write(surrWord);
-          bw.newLine();
-        }
-
-        bw.close();
-
-        System.out.println("Done");
-
-      } catch (IOException e) {
-        e.printStackTrace();
+  /**
+   * If the parameters are null, set the default ones. Otherwise, only set them
+   * if they are valid. Invalid parameters will throw an exception (and set the
+   * parameters to the default ones)
+   * 
+   * @param parameters
+   *          the input parameters
+   * @throws InvalidParameterException
+   */
+  public void setParams(IMSParameters parameters)
+      throws InvalidParameterException {
+    if (parameters == null) {
+      this.parameters = new IMSParameters();
+    } else {
+      if (parameters.isValid()) {
+        this.parameters = parameters;
+      } else {
+        this.parameters = new IMSParameters();
+        throw new InvalidParameterException("wrong parameters");
       }
-
     }
-
   }
 
+  // Internal Methods
   private ArrayList<String> getAllSurroundingWords(String wordTag) {
 
     ArrayList<String> surrWords = new ArrayList<String>();
 
     BufferedReader br = null;
 
-    File file = new File(parameters.getTrainingDataDirectory() + wordTag
-        + ".sw");
+    File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
 
     if (file.exists()) {
 
@@ -191,47 +215,47 @@ public class IMS implements WSDisambigua
 
   }
 
-  private ArrayList<WTDIMS> extractTrainingData(String wordTrainingXmlFile,
-      HashMap<String, ArrayList<DictionaryInstance>> senses) {
+  private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingInstances,
+      String wordTag) {
 
-    /**
-     * word tag has to be in the format "word.t" (e.g., "activate.v", "smart.a",
-     * etc.)
-     */
-
-    ArrayList<WTDIMS> trainingData = dExtractor
-        .extractWSDInstances(wordTrainingXmlFile);
-
-    for (WTDIMS word : trainingData) {
-      for (String senseId : word.getSenseIDs()) {
-        for (String dictKey : senses.keySet()) {
-          for (DictionaryInstance instance : senses.get(dictKey)) {
-            if (senseId.equals(instance.getId())) {
-              word.setSense(Integer.parseInt(dictKey.split("_")[1]));
-              break;
-            }
-          }
-        }
+    ArrayList<String> surrWords = fExtractor
+        .extractTrainingSurroundingWords(trainingInstances);
+
+    File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
+    if (!file.exists()) {
+
+      try {
+        file.createNewFile();
+      } catch (IOException e) {
+        System.out
+            .println("Unable to create the List of Surrounding Words file !");
       }
     }
 
-    return trainingData;
-  }
+    try {
+      FileWriter fw = new FileWriter(file.getAbsoluteFile());
+      BufferedWriter bw = new BufferedWriter(fw);
 
-  private void extractFeature(WTDIMS word) {
+      for (String surrWord : surrWords) {
+        bw.write(surrWord);
+        bw.newLine();
+      }
 
-    fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
-        this.parameters.getNgram());
+      bw.close();
+    } catch (IOException e) {
+      System.out
+          .println("Unable to create the List of Surrounding Words file !");
+      e.printStackTrace();
+    }
 
-  }
+    System.out.println("Done");
 
-  private HashMap<String, String> getWordDictionaryInstance(WTDIMS wtd) {
+  }
 
-    String dict = parameters.getDict();
-    String map = parameters.getMap();
+  private void extractFeature(WTDIMS word) {
 
-    return dExtractor.getDictionaryInstance(dict, map,
-        this.getTrainingFileName(wtd));
+    fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
+        this.parameters.getNgram());
 
   }
 
@@ -240,19 +264,37 @@ public class IMS implements WSDisambigua
     String word = wordToDisambiguate.getRawWord();
     POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
 
-    WordPOS wordPOS = new WordPOS(word, pos);
+    if (pos != null) {
 
-    ArrayList<Synset> synsets = wordPOS.getSynsets();
+      WordPOS wordPOS = new WordPOS(word, pos);
 
-    int size = synsets.size();
+      ArrayList<Synset> synsets = wordPOS.getSynsets();
 
-    String[] senses = new String[size];
+      int size = synsets.size();
 
-    for (int i = 0; i < size; i++) {
-      senses[i] = synsets.get(i).getGloss();
-    }
+      String[] senses = new String[size];
 
-    return senses;
+      for (int i = 0; i < size; i++) {
+        String senseKey = null;
+        for (Word wd : synsets.get(i).getWords()) {
+          if (wd.getLemma().equals(
+              wordToDisambiguate.getRawWord().split("\\.")[0])) {
+            try {
+              senseKey = wd.getSenseKey();
+            } catch (JWNLException e) {
+              e.printStackTrace();
+            }
+            senses[i] = senseKey;
+            break;
+          }
+        }
+
+      }
+      return senses;
+    } else {
+      System.out.println("The word has no definitions in WordNet !");
+      return null;
+    }
 
   }
 
@@ -260,78 +302,60 @@ public class IMS implements WSDisambigua
    * Method for training a model
    * 
    * @param wordTag
-   *          : the word to disambiguate. It should be written in the format
+   *          the word to disambiguate. It should be written in the format
    *          "word.p" (Exp: "write.v", "well.r", "smart.a", "go.v"
    * @param trainParams
-   *          : the parameters used for training
+   *          the parameters used for training
+   * @param trainingInstances
+   *          the training data in the format {@link WTDIMS}
    */
-  public void train(String wordTag, TrainingParameters trainParams) {
-
-    String dict = parameters.getDict();
-    String map = parameters.getMap();
+  public void train(String wordTag, TrainingParameters trainParams,
+      ArrayList<WTDIMS> trainingInstances) {
 
-    String wordTrainingxmlFile = parameters.getRawDataDirectory() + wordTag
-        + ".xml";
-    String wordTrainingbinFile = parameters.getTrainingDataDirectory()
-        + wordTag + ".gz";
-
-    File bf = new File(wordTrainingxmlFile);
+    String wordTrainingbinFile = IMSParameters.trainingDataDirectory + wordTag
+        + ".gz";
 
     ObjectStream<Event> IMSes = null;
 
-    if (bf.exists() && !bf.isDirectory()) {
-
-      HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
-          .extractWordSenses(dict, map, wordTag);
-
-      ArrayList<WTDIMS> instances = extractTrainingData(wordTrainingxmlFile,
-          senses);
-
-      for (WTDIMS wtd : instances) {
-        extractFeature(wtd);
-      }
-
-      saveAllSurroundingWords(instances, wordTag);
-
-      for (WTDIMS wtd : instances) {
-        extractFeature(wtd);
-      }
+    for (WTDIMS wtd : trainingInstances) {
+      extractFeature(wtd);
+    }
 
-      ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
+    saveAllSurroundingWords(trainingInstances, wordTag);
 
-      for (WTDIMS wtd : instances) {
-        fExtractor.serializeIMSFeatures(wtd, surrWords);
-      }
+    ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
 
-      ArrayList<Event> events = new ArrayList<Event>();
+    for (WTDIMS wtd : trainingInstances) {
+      fExtractor.serializeIMSFeatures(wtd, surrWords);
+    }
 
-      for (WTDIMS wtd : instances) {
+    ArrayList<Event> events = new ArrayList<Event>();
 
-        int sense = wtd.getSense();
+    for (WTDIMS wtd : trainingInstances) {
 
-        String[] context = cg.getContext(wtd);
+      String sense = wtd.getSenseIDs().get(0);
 
-        Event ev = new Event(sense + "", context);
+      String[] context = cg.getContext(wtd);
 
-        events.add(ev);
+      Event ev = new Event(sense + "", context);
 
-        IMSes = ObjectStreamUtils.createObjectStream(events);
+      events.add(ev);
 
-      }
+      IMSes = ObjectStreamUtils.createObjectStream(events);
 
-      DataIndexer indexer;
-      try {
-        indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
-        MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
-        File outFile = new File(wordTrainingbinFile);
-        AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
-            (AbstractModel) trainedMaxentModel, outFile);
-        writer.persist();
+    }
 
-      } catch (IOException e) {
-        e.printStackTrace();
-      }
+    DataIndexer indexer;
+    try {
+      indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
+      MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
+      File outFile = new File(wordTrainingbinFile);
+      AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
+          (AbstractModel) trainedMaxentModel, outFile);
+      writer.persist();
 
+    } catch (IOException e) {
+      e.printStackTrace();
     }
 
   }
@@ -339,17 +363,17 @@ public class IMS implements WSDisambigua
   /**
    * Load an existing model
    * 
-   * @param binFile
-   *          : Location of the already trained model
+   * @param trainedModel
+   *          Name of the file of the already trained model
    * @return the model trained
    */
-  public MaxentModel load(String binFile) {
+  public MaxentModel load(String trainedModel) {
 
     MaxentModel loadedMaxentModel = null;
 
     FileInputStream inputStream;
     try {
-      inputStream = new FileInputStream(binFile);
+      inputStream = new FileInputStream(trainedModel);
       InputStream decodedInputStream = new GZIPInputStream(inputStream);
       DataReader modelReader = new PlainTextFileDataReader(decodedInputStream);
       loadedMaxentModel = new GISModelReader(modelReader).getModel();
@@ -373,25 +397,29 @@ public class IMS implements WSDisambigua
   @Override
   public String[] disambiguate(String[] inputText, int inputWordIndex) {
 
-    String rawDataDirectory = this.parameters.getRawDataDirectory();
-    String trainingDataDirectory = this.parameters.getTrainingDataDirectory();
+    String trainingDataDirectory = IMSParameters.trainingDataDirectory;
+
+    File file = new File(trainingDataDirectory);
+
+    if (!file.exists()) {
+      file.mkdirs();
+    }
 
     WTDIMS word = new WTDIMS(inputText, inputWordIndex);
     fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
         this.parameters.getNgram());
 
-    String wordTag = getTrainingFileName(word);
+    String wordTag = word.getWordTag();
 
-    String wordTrainingxmlFile = rawDataDirectory + wordTag + ".xml";
     String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
 
     File bf = new File(wordTrainingbinFile);
 
     MaxentModel loadedMaxentModel = null;
     String outcome = "";
+
     if (bf.exists() && !bf.isDirectory()) {
-      // if the model file exists already
-      // System.out.println("the model file was found !");
+      // If the trained model exists
       ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
       fExtractor.serializeIMSFeatures(word, surrWords);
 
@@ -402,11 +430,40 @@ public class IMS implements WSDisambigua
       outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
 
     } else {
-      bf = new File(wordTrainingxmlFile);
-      if (bf.exists() && !bf.isDirectory()) {
-        // if the xml file exists already
-        // System.out.println("the xml file was found !");
-        train(wordTag, null);
+      // Depending on the source, go fetch the training data
+      ArrayList<WTDIMS> trainingInstances = new ArrayList<WTDIMS>();
+      switch (this.parameters.getSource().code) {
+      case 1: {
+        SemcorReaderExtended sReader = new SemcorReaderExtended();
+        for (WordToDisambiguate ti : sReader.getSemcorData(wordTag)) {
+          WTDIMS imsIT = new WTDIMS(ti);
+          extractFeature(imsIT);
+          trainingInstances.add(imsIT);
+        }
+        break;
+      }
+
+      case 2: {
+        SensevalReader sReader = new SensevalReader();
+        for (WordToDisambiguate ti : sReader.getSensevalData(wordTag)) {
+          WTDIMS imsIT = (WTDIMS) ti;
+          extractFeature(imsIT);
+          trainingInstances.add(imsIT);
+        }
+        break;
+      }
+
+      case 3: {
+        // TODO check the case when the user selects his own data set (make an
+        // interface to collect training data)
+        break;
+      }
+      }
+
+      if (!trainingInstances.isEmpty()) {
+
+        train(wordTag, null, trainingInstances);
+
         ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
 
         fExtractor.serializeIMSFeatures(word, surrWords);
@@ -418,21 +475,22 @@ public class IMS implements WSDisambigua
         double[] outcomeProbs = loadedMaxentModel.eval(context);
         outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
       }
+
     }
 
     if (!outcome.equals("")) {
 
-      HashMap<String, String> senses = getWordDictionaryInstance(word);
+      // System.out.println("The sense is [" + outcome + "] : " /*+
+      // Loader.getDictionary().getWordBySenseKey(outcome.split("%")[1]).getSynset().getGloss()*/);
 
-      String index = wordTag + "_" + outcome;
+      outcome = "WordNet " + wordTag.split("\\.")[0] + "%" + outcome;
 
-      String[] s = { senses.get(index) };
+      String[] s = { outcome };
 
       return s;
 
     } else {
       // if no training data exist
-      // System.out.println("No training data available, the MFS is returned !");
       String[] s = getMostFrequentSense(word);
       return s;
     }
@@ -454,17 +512,4 @@ public class IMS implements WSDisambigua
     return null;
   }
 
-  // TODO fix the conflicts in parameters with Anthony's code
-  @Override
-  public WSDParameters getParams() {
-    // TODO Auto-generated method stub
-    return null;
-  }
-
-  @Override
-  public void setParams(WSDParameters params) throws InvalidParameterException {
-    // TODO Auto-generated method stub
-
-  }
-
 }

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java Tue Jul 28 09:16:25 2015
@@ -19,6 +19,8 @@
 
 package opennlp.tools.disambiguator.ims;
 
+import java.io.File;
+
 import opennlp.tools.disambiguator.WSDParameters;
 
 /**
@@ -27,40 +29,63 @@ import opennlp.tools.disambiguator.WSDPa
  */
 public class IMSParameters extends WSDParameters {
 
+  public static enum Source {
+    SEMCOR(1, "semcor"), SEMEVAL(2, "semeval"), OTHER(3, "other");
+
+    public int code;
+    public String src;
+
+    private Source(int code, String src) {
+      this.code = code;
+      this.src = src;
+    }
+  }
+
   protected String languageCode;
   protected int windowSize;
   protected int ngram;
+  protected Source source;
 
-  protected String resourcesFolder = "src\\test\\resources\\supervised\\";
-
-  protected String rawDataDirectory = resourcesFolder + "raw\\";
-  protected String trainingDataDirectory = resourcesFolder + "models\\";
-  protected String dictionaryDirectory = resourcesFolder + "dictionary\\";
-
-  protected String dict = dictionaryDirectory + "EnglishLS.dictionary.xml";
-  protected String map = dictionaryDirectory + "EnglishLS.sensemap";
-
-  public IMSParameters() {
-    super();
-    this.languageCode = "En";
-    this.windowSize = 3;
-    this.ngram = 2;
-  }
+  public static final String resourcesFolder = "src\\test\\resources\\";
+  public static final String trainingDataDirectory = resourcesFolder
+      + "supervised\\models\\";
 
   /**
+   * This constructor takes three parameters (window size, n-gram size and
+   * training data source). The default language used is <i>English</i>
    * 
    * @param windowSize
-   *          : the size of the window used for the extraction of the features
+   *          the size of the window used for the extraction of the features
    *          qualified of Surrounding Words
    * @param ngram
-   *          : the number words used for the extraction of features qualified
-   *          of Local Collocations
+   *          the number of words used for the extraction of features qualified
+   *          as Local Collocations
+   * @param source
+   *          the source of the training data
    */
-  public IMSParameters(int windowSize, int ngram) {
+  public IMSParameters(int windowSize, int ngram, Source source) {
     super();
     this.languageCode = "En";
     this.windowSize = windowSize;
     this.ngram = ngram;
+    this.source = source;
+    this.isCoarseSense = false;
+
+    File folder = new File(trainingDataDirectory);
+    if (!folder.exists())
+      folder.mkdirs();
+  }
+
+  public IMSParameters() {
+    this(3, 2, Source.SEMCOR);
+  }
+
+  public IMSParameters(Source source) {
+    this(3, 2, source);
+  }
+
+  public IMSParameters(int windowSize, int ngram) {
+    this(windowSize, ngram, Source.SEMCOR);
   }
 
   public String getLanguageCode() {
@@ -87,52 +112,12 @@ public class IMSParameters extends WSDPa
     this.ngram = ngram;
   }
 
-  public String getRawDataDirectory() {
-    return rawDataDirectory;
-  }
-
-  public void setRawDataDirectory(String rawDataDirectory) {
-    this.rawDataDirectory = rawDataDirectory;
-  }
-
-  public String getTrainingDataDirectory() {
-    return trainingDataDirectory;
-  }
-
-  public void setTrainingDataDirectory(String trainingDataDirectory) {
-    this.trainingDataDirectory = trainingDataDirectory;
-  }
-
-  public String getDictionaryDirectory() {
-    return dictionaryDirectory;
-  }
-
-  public void setDictionaryDirectory(String dictionaryDirectory) {
-    this.dictionaryDirectory = dictionaryDirectory;
-  }
-
-  public String getDict() {
-    return dict;
-  }
-
-  public void setDict(String dict) {
-    this.dict = dict;
-  }
-
-  public String getMap() {
-    return map;
-  }
-
-  public void setMap(String map) {
-    this.map = map;
-  }
-
-  public String getResourcesFolder() {
-    return resourcesFolder;
+  public Source getSource() {
+    return source;
   }
 
-  public void setResourcesFolder(String resourcesFolder) {
-    this.resourcesFolder = resourcesFolder;
+  public void setSource(Source source) {
+    this.source = source;
   }
 
   void init() {
@@ -149,7 +134,7 @@ public class IMSParameters extends WSDPa
   @Override
   public boolean isValid() {
     // TODO Auto-generated method stub
-    return false;
+    return true;
   }
 
 }

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1693045&r1=1693044&r2=1693045&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java Tue Jul 28 09:16:25 2015
@@ -21,6 +21,8 @@ package opennlp.tools.disambiguator.ims;
 
 import java.util.ArrayList;
 
+import net.sf.extjwnl.data.POS;
+import opennlp.tools.disambiguator.Constants;
 import opennlp.tools.disambiguator.PreProcessor;
 import opennlp.tools.disambiguator.WordToDisambiguate;
 
@@ -41,8 +43,8 @@ public class WTDIMS extends WordToDisamb
     super(sentence, word);
   }
 
-  public WTDIMS(String xmlWord, ArrayList<String> xmlAnswers,
-      String xmlSentence, String xmlrawWord) {
+  public WTDIMS(String xmlWord, ArrayList<String> senseIDs, String xmlSentence,
+      String xmlrawWord) {
     super();
 
     // this.word = xmlWord;
@@ -57,10 +59,15 @@ public class WTDIMS extends WordToDisamb
       }
     }
 
-    this.senseIDs = xmlAnswers;
+    this.senseIDs = senseIDs;
 
   }
 
+  public WTDIMS(WordToDisambiguate wtd) {
+    super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
+    this.senseIDs = wtd.getSenseIDs();
+  }
+
   public String[] getPosOfSurroundingWords() {
     return posOfSurroundingWords;
   }
@@ -93,4 +100,25 @@ public class WTDIMS extends WordToDisamb
     this.features = features;
   }
 
+  public String getWordTag() {
+
+    String wordBaseForm = PreProcessor.lemmatize(this.getWord(),
+        this.getPosTag());
+
+    String ref = "";
+
+    if ((Constants.getPOS(this.getPosTag()) != null)) {
+      if (Constants.getPOS(this.getPosTag()).equals(POS.VERB)) {
+        ref = wordBaseForm + ".v";
+      } else if (Constants.getPOS(this.getPosTag()).equals(POS.NOUN)) {
+        ref = wordBaseForm + ".n";
+      } else if (Constants.getPOS(this.getPosTag()).equals(POS.ADJECTIVE)) {
+        ref = wordBaseForm + ".a";
+      } else if (Constants.getPOS(this.getPosTag()).equals(POS.ADVERB)) {
+        ref = wordBaseForm + ".r";
+      }
+    }
+
+    return ref;
+  }
 }

Added: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1693045&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java (added)
+++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java Tue Jul 28 09:16:25 2015
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.ims.IMSParameters;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+import org.junit.Test;
+
+public class IMSEvaluatorTest {
+
+  static SensevalReader seReader = new SensevalReader();
+
+  @Test
+  public static void main(String[] args) {
+    Constants.print("Evaluation Started");
+
+    IMS ims = new IMS();
+    IMSParameters imsParams = new IMSParameters();
+    ims.setParams(imsParams);
+
+    ArrayList<String> words = seReader.getSensevalWords();
+
+    for (String word : words) {
+      WSDEvaluator evaluator = new WSDEvaluator(ims);
+
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+
+        ArrayList<WTDIMS> instances = getTestData(word);
+
+        if (instances != null) {
+          Constants.print("------------------" + word + "------------------");
+          for (WordToDisambiguate instance : instances) {
+            // Constants.print("sense IDs : " + instance.senseIDs);
+            evaluator.evaluateSample(instance);
+          }
+          Constants.print(evaluator.toString());
+        } else {
+          Constants.print("null instances");
+        }
+      }
+
+    }
+
+  }
+
+  /**
+   * For a specific word, return the corresponding Semeval3 instances in the
+   * form of {@link WTDIMS}
+   * 
+   * @param wordTag
+   *          the word of which the instances are to be collected. wordTag has
+   *          to be in the format "word.POS" (e.g., "activate.v", "smart.a",
+   *          etc.)
+   * @return list of {@link WTDIMS} instances of the wordTag
+   */
+  protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+
+    ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+    for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+      WTDIMS wtdims = new WTDIMS(wtd);
+      instances.add(wtdims);
+    }
+
+    return instances;
+  }
+
+}

Propchange: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain