You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2015/07/16 23:52:05 UTC

svn commit: r1691456 - in /opennlp/sandbox/opennlp-wsd/src: main/java/opennlp/tools/disambiguator/ main/java/opennlp/tools/disambiguator/lesk/ test/java/opennlp/tools/disambiguator/

Author: joern
Date: Thu Jul 16 21:52:04 2015
New Revision: 1691456

URL: http://svn.apache.org/r1691456
Log:
OPENNLP-790 Removed unused variables. Changed the output format to: [Source SenseKey Score].
Each WSDisambiguator is assumed to have at least [Source SenseKey] as output for each disambiguation.
In the case of Lesk and other unsupervised approaches with scores, the score can be provided as extra output.
For now only the highest scoring disambiguated sense is considered in evaluation.

added explanation about accuracy

Modified:
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
    opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1691456&r1=1691455&r2=1691456&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java Thu Jul 16 21:52:04 2015
@@ -133,19 +133,16 @@ public class Constants {
     if (results != null) {
 
       if (disambiguator instanceof Lesk) {
-        POS pos;
-        long offset;
-        double score;
         String[] parts;
 
         for (String result : results) {
-          parts = result.split("@");
-          pos = POS.getPOSForKey(parts[0]);
-          offset = Long.parseLong(parts[1]);
-          score = Double.parseDouble(parts[3]);
+          parts = result.split(" ");
           try {
-            Constants.print("score : " + score + " for : "
-                + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
+            Constants.print("score : "
+                + parts[2]
+                + " for : "
+                + Loader.getDictionary().getWordBySenseKey(parts[1])
+                    .getSynset().getGloss());
           } catch (JWNLException e) {
             e.printStackTrace();
           }

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java?rev=1691456&r1=1691455&r2=1691456&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java Thu Jul 16 21:52:04 2015
@@ -17,8 +17,6 @@
 
 package opennlp.tools.disambiguator;
 
-import net.sf.extjwnl.data.POS;
-import opennlp.tools.disambiguator.lesk.Lesk;
 import opennlp.tools.util.eval.Evaluator;
 import opennlp.tools.util.eval.Mean;
 
@@ -64,52 +62,61 @@ public class WSDEvaluator extends Evalua
     String predictedSense = disambiguator.disambiguate(reference.sentence,
         reference.getWordIndex())[0];
 
-    // TODO review this pattern
-    String[] parts = predictedSense.split("@");
-    POS pos = POS.getPOSForKey(parts[0]);
-    long offset = Long.parseLong(parts[1]);
-    String senseKey = parts[2];
-    double score = Double.parseDouble(parts[3]);
+    String senseKey = predictedSense.split(" ")[1];
 
     // if we have multiple senses mapped to one sense
     if (disambiguator.getParams().isCoarseSense()) {
-
       // if we find the sense in one of the coarse senses
       int found = -1;
       for (int i = 0; i < referenceSenses.length; i++) {
         if (referenceSenses[i].equals(senseKey)) {
-          // Constants.print("++++++++++++++++++++++++ YES");
           accuracy.add(1);
           found = i;
           break;
         }
       }
       if (found < 0) {
-        // Constants.print("NO : "+referenceSenses[0]+"+++" + senseKey);
         accuracy.add(0);
       }
-
     } // else we have fine grained senses (only one mapped sense)
     else {
       if (referenceSenses[0].equals(senseKey)) {
-        // Constants.print("++++++++++++++++++++++++ YES");
         accuracy.add(1);
       } else {
-        // Constants.print("NO : "+referenceSenses[0]+"+++" + senseKey);
         accuracy.add(0);
       }
     }
+
     return new WordToDisambiguate(reference.getSentence(),
         reference.getWordIndex());
   }
 
+  /**
+   * Retrieves the WSD accuracy.
+   *
+   * This is defined as: WSD accuracy = correctly disambiguated / total words
+   *
+   * @return the WSD accuracy
+   */
   public double getAccuracy() {
     return accuracy.mean();
   }
 
+  /**
+   * Retrieves the total number of words considered in the evaluation.
+   *
+   * @return the word count
+   */
   public long getWordCount() {
     return accuracy.count();
   }
 
+  /**
+   * Represents this object as a human-readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    return "Accuracy: " + (accuracy.mean() * 100) + "%"
+        + "\tNumber of Samples: " + accuracy.count();
+  }
 }
-

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1691456&r1=1691455&r2=1691456&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java Thu Jul 16 21:52:04 2015
@@ -26,14 +26,21 @@ import opennlp.tools.util.Span;
  * A word sense disambiguator that determines which sense of a word is meant in
  * a particular context. It is a classification task, where the classes are the
  * different senses of the ambiguous word. Disambiguation can be achieved in
- * either supervised or un-supervised approaches. For the moment this component
- * relies on WordNet to retrieve sense definitions. It returns an array of
- * WordNet sense IDs ordered by their disambiguation score. The sense with
- * highest score is the most likely sense of the word.
+ * either supervised or un-supervised approaches. A disambiguator returns an
+ * array of sense IDs ordered by their disambiguation score as well as their
+ * source. The first sense ID is the most probable sense in the set context. The
+ * context is a sentence or a chunk of text where the target word exists.
  * 
+ * <b>How it works:</b> Just supply the context as an array of tokens and the
+ * index of the target word to the disambiguate method.
+ * 
+ * Otherwise for multiple words, you can set a word span instead of simply one
+ * index. For the moment the source of sense definitions is WordNet.
  * Please see {@link Lesk} for an un-supervised approach. Please see {@link IMS}
  * for a supervised approach.
  * 
+ * Examples on how to use each approach are provided in the test section.
+ * 
  * @see Lesk
  * @see IMS
  */

Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1691456&r1=1691455&r2=1691456&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java Thu Jul 16 21:52:04 2015
@@ -123,7 +123,6 @@ public class Lesk implements WSDisambigu
       for (WordPOS senseWordPOS : wordSense.getNode().getSenseRelevantWords()) {
         ArrayList stems = (ArrayList) PreProcessor.Stem(senseWordPOS);
         for (WordPOS sentenceWordPOS : relvWords) {
-          // TODO change to lemma check
           if (sentenceWordPOS.isStemEquivalent(senseWordPOS)) {
             count = count + 1;
           }
@@ -201,9 +200,6 @@ public class Lesk implements WSDisambigu
       wordSense.setScore(count);
 
     }
-
-    Collections.sort(scoredSenses);
-
     return scoredSenses;
   }
 
@@ -893,8 +889,8 @@ public class Lesk implements WSDisambigu
    * @param tokenizedContext
    * @param ambiguousTokenIndex
    * @return array of sense indexes from WordNet ordered by their score. The
-   *         result format is <b>POS</b>@<b>SenseID</b>@<b>Sense Score</b> If
-   *         the input token is non relevant a null is returned.
+   *         result format is <b>Source</b> <b>SenseKey</b> <b>Score</b>. If the
+   *         input token is not relevant, null is returned.
    */
   @Override
   public String[] disambiguate(String[] tokenizedContext,
@@ -974,11 +970,14 @@ public class Lesk implements WSDisambigu
           this.params.fathom_hypernyms, this.params.fathom_hyponyms,
           this.params.fathom_meronyms, this.params.fathom_holonyms);
       break;
+    default:
+      wsenses = extendedExponentialContextual(wtd,
+          LeskParameters.DFLT_WIN_SIZE, LeskParameters.DFLT_DEPTH,
+          LeskParameters.DFLT_IEXP, LeskParameters.DFLT_DEXP, true, true, true,
+          true, true);
+      break;
     }
 
-    wsenses = extendedExponentialContextual(wtd, LeskParameters.DFLT_WIN_SIZE,
-        LeskParameters.DFLT_DEPTH, LeskParameters.DFLT_IEXP,
-        LeskParameters.DFLT_DEXP, true, true, true, true, true);
     Collections.sort(wsenses);
 
     List<Word> synsetWords;
@@ -996,14 +995,8 @@ public class Lesk implements WSDisambigu
           break;
         }
       }
-      senses[i] = Constants.getPOS(wsenses.get(i).getWTDLesk().getPosTag())
-          .getKey()
-          + "@"
-          + Long.toString(wsenses.get(i).getNode().getSynsetID())
-          + "@"
-          + senseKey + "@" + wsenses.get(i).getScore();
+      senses[i] = "WordNet" + " " + senseKey + " " + wsenses.get(i).getScore();
 
-      Collections.sort(wsenses);
     }
     return senses;
   }
@@ -1015,8 +1008,8 @@ public class Lesk implements WSDisambigu
    * @param inputText
    * @param inputWordSpans
    * @return array of array of sense indexes from WordNet ordered by their
-   *         score. The result format is <b>POS</b>@<b>SenseID</b>@<b>Sense
-   *         Score</b> If the input token is non relevant a null is returned.
+   *         score. The result format is <b>Source</b> <b>SenseID</b>. If the
+   *         input token is not relevant, null is returned.
    */
   @Override
   public String[][] disambiguate(String[] tokenizedContext,

Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java?rev=1691456&r1=1691455&r2=1691456&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java Thu Jul 16 21:52:04 2015
@@ -18,6 +18,7 @@
  */
 
 package opennlp.tools.disambiguator;
+
 import java.io.File;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -48,7 +49,7 @@ public class LeskEvaluatorTest {
 
     Lesk lesk = new Lesk();
     LeskParameters leskParams = new LeskParameters();
-    leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_BASIC);
+    leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
     lesk.setParams(leskParams);
 
     if (testFolder.isDirectory()) {
@@ -66,13 +67,11 @@ public class LeskEvaluatorTest {
             if (instances != null) {
               Constants.print("------------------" + file.getName()
                   + "------------------");
-              Constants.print("there are " + instances.size() + " instances");
               for (WordToDisambiguate instance : instances) {
                 // Constants.print("sense IDs : " + instance.senseIDs);
                 evaluator.evaluateSample(instance);
               }
-              Constants.print("the accuracy " + evaluator.getAccuracy() * 100
-                  + "%");
+              Constants.print(evaluator.toString());
             } else {
               Constants.print("null instances");
             }