You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2016/02/18 22:02:35 UTC

svn commit: r1731145 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: lemmatizer/ util/

Author: ragerri
Date: Thu Feb 18 21:02:34 2016
New Revision: 1731145

URL: http://svn.apache.org/viewvc?rev=1731145&view=rev
Log:
OPENNLP-760 adding factory and string utils to induce lemma classes

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java?rev=1731145&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java Thu Feb 18 21:02:34 2016
@@ -0,0 +1,49 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+
+/**
+ * Reads lemmatizer training and test data and groups it into one
+ * {@link LemmaSample} per sentence.
+ * <p>
+ * Every non-empty line describes one token as
+ * <code>word&lt;TAB&gt;postag&lt;TAB&gt;lemma</code>; an empty line marks the end of a
+ * sentence. The lemma itself is not stored in the sample: it is replaced by the
+ * shortest edit script returned by
+ * {@link StringUtil#getShortestEditScript(String, String)} for the word and lemma.
+ * @version 2016-02-16
+ */
+public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> {
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param samples the stream of input lines to read from
+   */
+  public LemmaSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads the lines of the next sentence and converts them into a sample.
+   * <p>
+   * Lines that do not consist of exactly three tab separated fields are reported
+   * on <code>System.err</code> and skipped. Empty lines that are not preceded by
+   * at least one valid token line (leading or repeated empty lines, or a group
+   * made of corrupt lines only) are skipped as well instead of being reported as
+   * the end of the stream.
+   *
+   * @return the next sample, or <code>null</code> once the underlying stream is
+   *         exhausted and no further token was read
+   * @throws IOException if reading from the underlying stream fails
+   */
+  @Override
+  public LemmaSample read() throws IOException {
+
+    List<String> toks = new ArrayList<String>();
+    List<String> tags = new ArrayList<String>();
+    List<String> preds = new ArrayList<String>();
+
+    String line;
+    while ((line = samples.read()) != null) {
+      if (line.equals("")) {
+        if (toks.isEmpty()) {
+          // Nothing collected yet: returning here would yield null, which callers
+          // interpret as end of stream, so keep looking for the next sentence.
+          continue;
+        }
+        // an empty line after at least one token closes the sentence
+        break;
+      }
+      String[] parts = line.split("\t");
+      if (parts.length != 3) {
+        System.err.println("Skipping corrupt line: " + line);
+      }
+      else {
+        toks.add(parts[0]);
+        tags.add(parts[1]);
+        // the prediction target is the edit script word -> lemma, not the lemma
+        preds.add(StringUtil.getShortestEditScript(parts[0], parts[2]));
+      }
+    }
+
+    if (toks.isEmpty()) {
+      // only reached when the underlying stream is exhausted
+      return null;
+    }
+    return new LemmaSample(toks.toArray(new String[toks.size()]),
+        tags.toArray(new String[tags.size()]),
+        preds.toArray(new String[preds.size()]));
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java?rev=1731145&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java Thu Feb 18 21:02:34 2016
@@ -0,0 +1,18 @@
+package opennlp.tools.lemmatizer;
+
+/**
+ * Contract implemented by every lemmatizer.
+ */
+public interface Lemmatizer {
+
+  /**
+   * Produces the lemma tags for a sequence of words and their pos tags.
+   *
+   * @param toks the tokens of the sequence
+   * @param tags the pos tags of the sequence
+   *
+   * @return an array holding the lemma class of each token in the sequence.
+   */
+  String[] lemmatize(String[] toks, String[] tags);
+
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java?rev=1731145&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java Thu Feb 18 21:02:34 2016
@@ -0,0 +1,12 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * Marker interface for monitors that observe the evaluation of
+ * {@link LemmaSample}s. It declares no methods of its own; everything is
+ * inherited from {@link EvaluationMonitor}.
+ * @version 2016-02-18
+ *
+ */
+public interface LemmatizerEvaluationMonitor extends EvaluationMonitor<LemmaSample> {
+
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java?rev=1731145&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java Thu Feb 18 21:02:34 2016
@@ -0,0 +1,88 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LemmatizerEvaluator} measures the performance of
+ * the given {@link Lemmatizer} with the provided reference
+ * {@link LemmaSample}s.
+ */
+public class LemmatizerEvaluator extends Evaluator<LemmaSample> {
+
+  private final Lemmatizer lemmatizer;
+
+  private final Mean wordAccuracy = new Mean();
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param aLemmatizer a lemmatizer
+   * @param listeners an array of evaluation listeners
+   */
+  public LemmatizerEvaluator(Lemmatizer aLemmatizer, LemmatizerEvaluationMonitor ... listeners) {
+    super(listeners);
+    this.lemmatizer = aLemmatizer;
+  }
+
+  /**
+   * Evaluates the given reference {@link LemmaSample} object.
+   *
+   * This is done by lemmatizing the tokens and tags of the reference
+   * {@link LemmaSample} with the {@link Lemmatizer}. Every predicted
+   * lemma class is then compared with the reference one to update the
+   * word accuracy score.
+   *
+   * @param reference the reference {@link LemmaSample}.
+   *
+   * @return the predicted {@link LemmaSample}.
+   *
+   * @throws IllegalStateException if the lemmatizer returns fewer
+   * predictions than the reference contains lemmas
+   */
+  @Override
+  protected LemmaSample processSample(LemmaSample reference) {
+
+    String[] predictedLemmas = lemmatizer.lemmatize(reference.getTokens(), reference.getTags());
+    String[] referenceLemmas = reference.getLemmas();
+
+    // Check before scoring: failing half way through the loop below would leave
+    // the accuracy mean updated with only a part of this sample.
+    if (predictedLemmas.length < referenceLemmas.length) {
+      throw new IllegalStateException("The lemmatizer returned " + predictedLemmas.length
+          + " predictions for a sample with " + referenceLemmas.length + " reference lemmas.");
+    }
+
+    for (int i = 0; i < referenceLemmas.length; i++) {
+      wordAccuracy.add(referenceLemmas[i].equals(predictedLemmas[i]) ? 1 : 0);
+    }
+    return new LemmaSample(reference.getTokens(), reference.getTags(), predictedLemmas);
+  }
+
+  /**
+   * Retrieves the word accuracy.
+   *
+   * This is defined as:
+   * word accuracy = correctly predicted lemma classes / total words
+   *
+   * @return the word accuracy
+   */
+  public double getWordAccuracy() {
+    return wordAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the total number of words considered
+   * in the evaluation.
+   *
+   * @return the word count
+   */
+  public long getWordCount() {
+    return wordAccuracy.count();
+  }
+
+  /**
+   * Represents this object as a human readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    return "Accuracy:" + wordAccuracy.mean() +
+        " Number of Samples: " + wordAccuracy.count();
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java?rev=1731145&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java Thu Feb 18 21:02:34 2016
@@ -0,0 +1,48 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+/**
+ * Factory that provides the resources of the lemmatizer: the sequence
+ * validator and the context generator. Subclasses can be loaded by name
+ * through {@link #create(String)}.
+ */
+public class LemmatizerFactory extends BaseToolFactory {
+
+  /**
+   * Creates a {@link LemmatizerFactory} that provides the default implementation
+   * of the resources.
+   */
+  public LemmatizerFactory() {
+  }
+
+  /**
+   * Creates a factory instance.
+   *
+   * @param subclassName the class name of the factory extension to instantiate,
+   *     or <code>null</code> to obtain the default {@link LemmatizerFactory}
+   *
+   * @return the factory
+   *
+   * @throws InvalidFormatException if the extension cannot be instantiated; the
+   *     original failure is attached as the cause
+   */
+  public static LemmatizerFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LemmatizerFactory();
+    }
+    try {
+      return ExtensionLoader.instantiateExtension(
+          LemmatizerFactory.class, subclassName);
+    } catch (Exception e) {
+      // The failure is handed to the caller together with its cause, so it is
+      // not additionally written to System.err here.
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // no additional artifacts
+  }
+
+  /**
+   * @return a new {@link DefaultLemmatizerSequenceValidator}
+   */
+  public SequenceValidator<String> getSequenceValidator() {
+    return new DefaultLemmatizerSequenceValidator();
+  }
+
+  /**
+   * @return a new {@link DefaultLemmatizerContextGenerator}
+   */
+  public LemmatizerContextGenerator getContextGenerator() {
+    return new DefaultLemmatizerContextGenerator();
+  }
+}

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java?rev=1731145&r1=1731144&r2=1731145&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java Thu Feb 18 21:02:34 2016
@@ -113,4 +113,143 @@ public class StringUtil {
   public static boolean isEmpty(CharSequence theString) {
 	return theString.length() == 0;
   }
+  
+  /**
+   * Get mininum of three values.
+   * @param a number a
+   * @param b number b
+   * @param c number c
+   * @return the minimum
+   */
+  private static int minimum(int a, int b, int c) {
+      int minValue;
+      minValue = a;
+      if (b < minValue) {
+        minValue = b;
+      }
+      if (c < minValue) {
+        minValue = c;
+      }
+      return minValue;
+  }
+  
+  /**
+   * Computes the Levenshtein distance of two strings in a matrix.
+   * Based on pseudo-code provided here:
+   * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
+   * which in turn is based on the paper Wagner, Robert A.; Fischer, Michael J. (1974),
+   * "The String-to-String Correction Problem", Journal of the ACM 21 (1): 168-173
+   * @param wordForm the form
+   * @param lemma the lemma
+   * @return the distance
+   */
+  public static int[][] levenshteinDistance(String wordForm, String lemma) {
+
+    int wordLength = wordForm.length();
+    int lemmaLength = lemma.length();
+    int cost;
+    int[][] distance = new int[wordLength + 1][lemmaLength + 1];
+    
+    if (wordLength == 0) {
+      return distance;
+    }
+    if (lemmaLength == 0) {
+      return distance;
+    }
+    //fill in the rows of column 0
+    for (int i = 0; i <= wordLength; i++) {
+      distance[i][0] = i;
+    }
+    //fill in the columns of row 0
+    for (int j = 0; j <= lemmaLength; j++) {
+      distance[0][j] = j;
+    }
+    //fill in the rest of the matrix calculating the minimum distance
+    for (int i = 1; i <= wordLength; i++) {
+      int s_i = wordForm.charAt(i - 1);
+      for (int j = 1; j <= lemmaLength; j++) {
+        if (s_i == lemma.charAt(j - 1)) {
+          cost = 0;
+        } else {
+          cost = 1;
+        }
+        //obtain minimum distance from calculating deletion, insertion, substitution
+        distance[i][j] = minimum(distance[i - 1][j] + 1, distance[i][j - 1] + 1, distance[i - 1][j - 1] + cost);
+      }
+    }
+    return distance;
+  }
+  
+  /**
+   * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
+   * This is based on Chrupala's PhD thesis (2008).
+   * <p>
+   * The distance matrix is walked backwards from its last cell until a cell with
+   * distance 0 is reached. Every edit found on the way is appended to
+   * <code>permutations</code>:
+   * <ul>
+   * <li><code>R</code> + index + word character + lemma character for a replacement,</li>
+   * <li><code>I</code> + index + lemma character for an insertion,</li>
+   * <li><code>D</code> + index + word character for a deletion,</li>
+   * </ul>
+   * where index is a position in <code>wordForm</code>. Matching characters
+   * produce no output.
+   *
+   * @param wordForm the token
+   * @param lemma the target lemma
+   * @param distance the levenshtein distance matrix of wordForm and lemma
+   * @param permutations the buffer the edit script is appended to
+   * @throws IllegalArgumentException if the matrix is inconsistent, i.e. a cell
+   *     with a non-zero distance has no neighbour the walk can continue with
+   */
+  public static void computeShortestEditScript(String wordForm, String lemma, int[][] distance, StringBuffer permutations) {
+
+    int wordFormLength = distance.length - 1;
+    int lemmaLength = distance[0].length - 1;
+
+    // The order of the rules is significant: edits which lower the distance are
+    // preferred (replace, insert, delete) before moves which keep it unchanged.
+    while (distance[wordFormLength][lemmaLength] != 0) {
+      int current = distance[wordFormLength][lemmaLength];
+
+      if ((lemmaLength > 0 && wordFormLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1] < current)) {
+        permutations.append('R').append(Integer.toString(wordFormLength - 1)).append(wordForm.charAt(wordFormLength - 1)).append(lemma.charAt(lemmaLength - 1));
+        lemmaLength--;
+        wordFormLength--;
+      }
+      else if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] < current)) {
+        permutations.append('I').append(Integer.toString(wordFormLength)).append(lemma.charAt(lemmaLength - 1));
+        lemmaLength--;
+      }
+      else if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] < current)) {
+        permutations.append('D').append(Integer.toString(wordFormLength - 1)).append(wordForm.charAt(wordFormLength - 1));
+        wordFormLength--;
+      }
+      else if ((wordFormLength > 0 && lemmaLength > 0) && (distance[wordFormLength - 1][lemmaLength - 1] == current)) {
+        // matching characters, nothing to record
+        wordFormLength--;
+        lemmaLength--;
+      }
+      else if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] == current)) {
+        wordFormLength--;
+      }
+      else if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] == current)) {
+        lemmaLength--;
+      }
+      else {
+        // No rule applies, so neither index can change any more; looping on
+        // would never terminate.
+        throw new IllegalArgumentException("Inconsistent distance matrix at ["
+            + wordFormLength + "][" + lemmaLength + "]");
+      }
+    }
+  }
+
+/**
+ * Builds the shortest edit script that leads from a word to its lemma.
+ * Both strings are lower cased and reversed before they are compared, which
+ * means that the positions in the script refer to the reversed word. Strings
+ * that are equal after lower casing yield the script "O".
+ * @param wordForm the word
+ * @param lemma the lemma
+ * @return the shortest edit script
+ */
+public static String getShortestEditScript(String wordForm, String lemma) {
+  String backwardsWord = new StringBuilder(wordForm.toLowerCase()).reverse().toString();
+  String backwardsLemma = new StringBuilder(lemma.toLowerCase()).reverse().toString();
+
+  if (backwardsWord.equals(backwardsLemma)) {
+    // no edit is needed
+    return "O";
+  }
+
+  StringBuffer script = new StringBuffer();
+  int[][] matrix = StringUtil.levenshteinDistance(backwardsWord, backwardsLemma);
+  StringUtil.computeShortestEditScript(backwardsWord, backwardsLemma, matrix, script);
+  return script.toString();
+}
+
 }



Re: svn commit: r1731145 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: lemmatizer/ util/

Posted by Joern Kottmann <ko...@gmail.com>.
Hello Rodrigo,

you are adding a couple of java files in this commit, and I think more
in other commits for the lemmatizer.

All new Java files must have the Apache License (AL) header. Could you
please add the header to the files where it is missing?

Thanks,
Jörn 


On Thu, 2016-02-18 at 21:02 +0000, ragerri@apache.org wrote:
> Author: ragerri
> Date: Thu Feb 18 21:02:34 2016
> New Revision: 1731145
> 
> URL: http://svn.apache.org/viewvc?rev=1731145&view=rev
> Log:
> OPENNLP-760 adding factory and string utils to induce lemma classes
> 
> Added:
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMoni
> tor.java
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
> Modified:
>     opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/util/StringUtil.java
> 
> Added: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/lemmatizer/LemmaSampleStream.java?rev=1731145&vi
> ew=auto
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
> (added)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
> Thu Feb 18 21:02:34 2016
> @@ -0,0 +1,49 @@
> +package opennlp.tools.lemmatizer;
> +
> +import java.io.IOException;
> +import java.util.ArrayList;
> +import java.util.List;
> +
> +import opennlp.tools.util.FilterObjectStream;
> +import opennlp.tools.util.ObjectStream;
> +import opennlp.tools.util.StringUtil;
> +
> +
> +/**
> + * Reads data for training and testing. The format consists of:
> + * word\tabpostag\tablemma.
> + * @version 2016-02-16
> + */
> +public class LemmaSampleStream extends FilterObjectStream<String,
> LemmaSample> {
> +
> +  public LemmaSampleStream(ObjectStream<String> samples) {
> +    super(samples);
> +  }
> +
> +  public LemmaSample read() throws IOException {
> +
> +    List<String> toks = new ArrayList<String>();
> +    List<String> tags = new ArrayList<String>();
> +    List<String> preds = new ArrayList<String>();
> +
> +    for (String line = samples.read(); line != null &&
> !line.equals(""); line = samples.read()) {
> +      String[] parts = line.split("\t");
> +      if (parts.length != 3) {
> +        System.err.println("Skipping corrupt line: " + line);
> +      }
> +      else {
> +        toks.add(parts[0]);
> +        tags.add(parts[1]);
> +        String ses = StringUtil.getShortestEditScript(parts[0],
> parts[2]);
> +        preds.add(ses);
> +      }
> +    }
> +    if (toks.size() > 0) {
> +      LemmaSample lemmaSample = new LemmaSample(toks.toArray(new
> String[toks.size()]), tags.toArray(new String[tags.size()]),
> preds.toArray(new String[preds.size()]));
> +      return lemmaSample;
> +    }
> +    else {
> +      return null;
> +    }
> +  }
> +}
> 
> Added: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/lemmatizer/Lemmatizer.java?rev=1731145&view=auto
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java (added)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java Thu Feb
> 18 21:02:34 2016
> @@ -0,0 +1,18 @@
> +package opennlp.tools.lemmatizer;
> +
> +/**
> + * The interface for lemmatizers.
> + */
> +public interface Lemmatizer {
> +
> +  /**
> +   * Generates lemma tags for the word and postag returning the
> result in an array.
> +   *
> +   * @param toks an array of the tokens
> +   * @param tags an array of the pos tags
> +   *
> +   * @return an array of lemma classes for each token in the
> sequence.
> +   */
> +  public String[] lemmatize(String[] toks, String tags[]);
> +
> +}
> 
> Added: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMoni
> tor.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java?rev=
> 1731145&view=auto
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMoni
> tor.java (added)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMoni
> tor.java Thu Feb 18 21:02:34 2016
> @@ -0,0 +1,12 @@
> +package opennlp.tools.lemmatizer;
> +
> +import opennlp.tools.util.eval.EvaluationMonitor;
> +
> +/**
> + * Interface for the lemmatizer evaluator.
> + * @version 2016-02-18
> + *
> + */
> +public interface LemmatizerEvaluationMonitor extends
> EvaluationMonitor<LemmaSample> {
> +
> +}
> 
> Added: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java?rev=1731145&
> view=auto
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
> (added)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
> Thu Feb 18 21:02:34 2016
> @@ -0,0 +1,88 @@
> +package opennlp.tools.lemmatizer;
> +
> +import opennlp.tools.util.eval.Evaluator;
> +import opennlp.tools.util.eval.Mean;
> +
> +/**
> + * The {@link LemmatizerEvaluator} measures the performance of
> + * the given {@link Lemmatizer} with the provided reference
> + * {@link LemmaSample}s.
> + */
> +public class LemmatizerEvaluator extends Evaluator<LemmaSample> {
> +
> +  private Lemmatizer lemmatizer;
> +
> +  private Mean wordAccuracy = new Mean();
> +
> +  /**
> +   * Initializes the current instance.
> +   *
> +   * @param aLemmatizer a lemmatizer
> +   * @param listeners an array of evaluation listeners
> +   */
> +  public LemmatizerEvaluator(Lemmatizer aLemmatizer,
> LemmatizerEvaluationMonitor ... listeners) {
> +    super(listeners);
> +    this.lemmatizer = aLemmatizer;
> +  }
> +
> +  /**
> +   * Evaluates the given reference {@link LemmaSample} object.
> +   *
> +   * This is done by tagging the sentence from the reference
> +   * {@link LemmaSample} with the {@link Lemmatizer}. The
> +   * tags are then used to update the word accuracy score.
> +   *
> +   * @param reference the reference {@link LemmaSample}.
> +   *
> +   * @return the predicted {@link LemmaSample}.
> +   */
> +  @Override
> +  protected LemmaSample processSample(LemmaSample reference) {
> +
> +    String[] predictedLemmas =
> lemmatizer.lemmatize(reference.getTokens(), reference.getTags());
> +    String[] referenceLemmas = reference.getLemmas();
> +    
> +    for (int i = 0; i < referenceLemmas.length; i++) {
> +      //System.err.println("-> Reference: " + referenceLemmas[i]);
> +      //System.err.println("-> Predicted: " + predictedLemmas[i]);
> +      if (referenceLemmas[i].equals(predictedLemmas[i])) {
> +        wordAccuracy.add(1);
> +      }
> +      else {
> +        wordAccuracy.add(0);
> +      }
> +    }
> +    return new LemmaSample(reference.getTokens(),
> reference.getTags(), predictedLemmas);
> +  }
> +
> +  /**
> +   * Retrieves the word accuracy.
> +   *
> +   * This is defined as:
> +   * word accuracy = correctly detected tags / total words
> +   *
> +   * @return the word accuracy
> +   */
> +  public double getWordAccuracy() {
> +    return wordAccuracy.mean();
> +  }
> +
> +  /**
> +   * Retrieves the total number of words considered
> +   * in the evaluation.
> +   *
> +   * @return the word count
> +   */
> +  public long getWordCount() {
> +    return wordAccuracy.count();
> +  }
> +
> +  /**
> +   * Represents this objects as human readable {@link String}.
> +   */
> +  @Override
> +  public String toString() {
> +    return "Accuracy:" + wordAccuracy.mean() +
> +        " Number of Samples: " + wordAccuracy.count();
> +  }
> +}
> 
> Added: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/lemmatizer/LemmatizerFactory.java?rev=1731145&vi
> ew=auto
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
> (added)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
> Thu Feb 18 21:02:34 2016
> @@ -0,0 +1,48 @@
> +package opennlp.tools.lemmatizer;
> +
> +import opennlp.tools.util.BaseToolFactory;
> +import opennlp.tools.util.InvalidFormatException;
> +import opennlp.tools.util.SequenceValidator;
> +import opennlp.tools.util.ext.ExtensionLoader;
> +
> +public class LemmatizerFactory extends BaseToolFactory {
> +
> +  /**
> +   * Creates a {@link LemmatizerFactory} that provides the default
> implementation
> +   * of the resources.
> +   */
> +  public LemmatizerFactory() {
> +  }
> +
> +  public static LemmatizerFactory create(String subclassName)
> +      throws InvalidFormatException {
> +    if (subclassName == null) {
> +      // will create the default factory
> +      return new LemmatizerFactory();
> +    }
> +    try {
> +      LemmatizerFactory theFactory =
> ExtensionLoader.instantiateExtension(
> +          LemmatizerFactory.class, subclassName);
> +      return theFactory;
> +    } catch (Exception e) {
> +      String msg = "Could not instantiate the " + subclassName
> +          + ". The initialization throw an exception.";
> +      System.err.println(msg);
> +      e.printStackTrace();
> +      throw new InvalidFormatException(msg, e);
> +    }
> +  }
> +
> +  @Override
> +  public void validateArtifactMap() throws InvalidFormatException {
> +    // no additional artifacts
> +  }
> +
> +  public SequenceValidator<String> getSequenceValidator() {
> +    return new DefaultLemmatizerSequenceValidator();
> +  }
> +
> +  public LemmatizerContextGenerator getContextGenerator() {
> +    return new DefaultLemmatizerContextGenerator();
> +  }
> +}
> 
> Modified: opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/util/StringUtil.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/mai
> n/java/opennlp/tools/util/StringUtil.java?rev=1731145&r1=1731144&r2=1
> 731145&view=diff
> =====================================================================
> =========
> --- opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/util/StringUtil.java (original)
> +++ opennlp/trunk/opennlp-
> tools/src/main/java/opennlp/tools/util/StringUtil.java Thu Feb 18
> 21:02:34 2016
> @@ -113,4 +113,143 @@ public class StringUtil {
>    public static boolean isEmpty(CharSequence theString) {
>  	return theString.length() == 0;
>    }
> +  
> +  /**
> +   * Get mininum of three values.
> +   * @param a number a
> +   * @param b number b
> +   * @param c number c
> +   * @return the minimum
> +   */
> +  private static int minimum(int a, int b, int c) {
> +      int minValue;
> +      minValue = a;
> +      if (b < minValue) {
> +        minValue = b;
> +      }
> +      if (c < minValue) {
> +        minValue = c;
> +      }
> +      return minValue;
> +  }
> +  
> +  /**
> +   * Computes the Levenshtein distance of two strings in a matrix.
> +   * Based on pseudo-code provided here:
> +   * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Le
> venshtein_distance
> +   * which in turn is based on the paper Wagner, Robert A.; Fischer,
> Michael J. (1974),
> +   * "The String-to-String Correction Problem", Journal of the ACM
> 21 (1): 168-173
> +   * @param wordForm the form
> +   * @param lemma the lemma
> +   * @return the distance
> +   */
> +  public static int[][] levenshteinDistance(String wordForm, String
> lemma) {
> +
> +    int wordLength = wordForm.length();
> +    int lemmaLength = lemma.length();
> +    int cost;
> +    int[][] distance = new int[wordLength + 1][lemmaLength + 1];
> +    
> +    if (wordLength == 0) {
> +      return distance;
> +    }
> +    if (lemmaLength == 0) {
> +      return distance;
> +    }
> +    //fill in the rows of column 0
> +    for (int i = 0; i <= wordLength; i++) {
> +      distance[i][0] = i;
> +    }
> +    //fill in the columns of row 0
> +    for (int j = 0; j <= lemmaLength; j++) {
> +      distance[0][j] = j;
> +    }
> +    //fill in the rest of the matrix calculating the minimum
> distance
> +    for (int i = 1; i <= wordLength; i++) {
> +      int s_i = wordForm.charAt(i - 1);
> +      for (int j = 1; j <= lemmaLength; j++) {
> +        if (s_i == lemma.charAt(j - 1)) {
> +          cost = 0;
> +        } else {
> +          cost = 1;
> +        }
> +        //obtain minimum distance from calculating deletion,
> insertion, substitution
> +        distance[i][j] = minimum(distance[i - 1][j] + 1,
> distance[i][j - 1] + 1, distance[i - 1][j - 1] + cost);
> +      }
> +    }
> +    return distance;
> +  }
> +  
> +  /**
> +   * Computes the Shortest Edit Script (SES) to convert a word into
> its lemma.
> +   * This is based on Chrupala's PhD thesis (2008).
> + * @param wordForm the token
> + * @param lemma the target lemma
> + * @param distance the levenshtein distance
> + * @param permutations the number of permutations
> + */
> +public static void computeShortestEditScript(String wordForm, String
> lemma, int[][] distance, StringBuffer permutations) {
> +    
> +    int n = distance.length;
> +    int m = distance[0].length;
> +    
> +    int wordFormLength = n - 1;
> +    int lemmaLength = m - 1;
> +    while(true) {
> +        
> +        if (distance[wordFormLength][lemmaLength] == 0) {
> +          break;
> +        }
> +        if ((lemmaLength > 0 && wordFormLength > 0) &&
> (distance[wordFormLength - 1][lemmaLength - 1] <
> distance[wordFormLength][lemmaLength])) {
> +            permutations.append('R').append(Integer.toString(wordFor
> mLength - 1)).append(wordForm.charAt(wordFormLength -
> 1)).append(lemma.charAt(lemmaLength - 1));
> +            lemmaLength--;
> +            wordFormLength--;
> +            continue;
> +        }
> +        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength
> - 1] < distance[wordFormLength][lemmaLength])) {
> +            permutations.append('I').append(Integer.toString(wordFor
> mLength)).append(lemma.charAt(lemmaLength - 1));
> +            lemmaLength--;
> +            continue;
> +        }
> +        if (wordFormLength > 0 && (distance[wordFormLength -
> 1][lemmaLength] < distance[wordFormLength][lemmaLength])) {
> +            permutations.append('D').append(Integer.toString(wordFor
> mLength - 1)).append(wordForm.charAt(wordFormLength - 1));
> +            wordFormLength--;
> +            continue;
> +        }
> +        if ((wordFormLength > 0 && lemmaLength > 0) &&
> (distance[wordFormLength - 1][lemmaLength - 1] ==
> distance[wordFormLength][lemmaLength])) {
> +            wordFormLength--; lemmaLength--;
> +            continue ;
> +        }
> +        if (wordFormLength > 0 && (distance[wordFormLength -
> 1][lemmaLength] == distance[wordFormLength][lemmaLength])) {
> +            wordFormLength--;
> +            continue;
> +        }
> +        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength
> - 1] == distance[wordFormLength][lemmaLength])) {
> +            lemmaLength--;
> +            continue;
> +        }   
> +    }
> +}
> +
> +/**
> + * Get the SES required to go from a word to a lemma.
> + * @param wordForm the word
> + * @param lemma the lemma
> + * @return the shortest edit script
> + */
> +public static String getShortestEditScript(String wordForm, String
> lemma) {
> +  String reversedWF = new
> StringBuffer(wordForm.toLowerCase()).reverse().toString();
> +  String reversedLemma = new
> StringBuffer(lemma.toLowerCase()).reverse().toString();
> +  StringBuffer permutations = new StringBuffer();
> +  String ses;
> +  if (!reversedWF.equals(reversedLemma)) {
> +    int[][]levenDistance =
> StringUtil.levenshteinDistance(reversedWF, reversedLemma);
> +    StringUtil.computeShortestEditScript(reversedWF, reversedLemma,
> levenDistance, permutations);
> +    ses = permutations.toString();
> +  } else {
> +    ses = "O";
> +  }
> +  return ses;
> +}
> +
>  }
> 
>