You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2016/02/18 16:06:59 UTC

svn commit: r1731084 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/

Author: ragerri
Date: Thu Feb 18 15:06:59 2016
New Revision: 1731084

URL: http://svn.apache.org/viewvc?rev=1731084&view=rev
Log:
OPENNLP-760 first commit of statistical lemmatizer: features and sample

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerContextGenerator.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerSequenceValidator.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSample.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerContextGenerator.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/package-info.java

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerContextGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerContextGenerator.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerContextGenerator.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerContextGenerator.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,98 @@
+package opennlp.tools.lemmatizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Simple feature generator for learning statistical lemmatizers.
+ * <p>
+ * Features based on Grzegorz Chrupała. 2008. Towards a Machine-Learning
+ * Architecture for Lexical Functional Grammar Parsing. PhD dissertation,
+ * Dublin City University.
+ *
+ * @version 2016-02-15
+ */
+public class DefaultLemmatizerContextGenerator implements LemmatizerContextGenerator {
+
+  /** Number of prefix features generated per token (prefixes of 1 to 5 characters). */
+  private static final int PREFIX_LENGTH = 5;
+  /** Number of suffix features generated per token (suffixes of 1 to 7 characters). */
+  private static final int SUFFIX_LENGTH = 7;
+
+  /** Matches any ASCII upper-case letter. */
+  private static final Pattern hasCap = Pattern.compile("[A-Z]");
+  /** Matches any ASCII digit. */
+  private static final Pattern hasNum = Pattern.compile("[0-9]");
+
+  public DefaultLemmatizerContextGenerator() {
+  }
+
+  /**
+   * Returns the leading substrings of a word form.
+   * Entry {@code i} holds the first {@code i + 1} characters of {@code lex},
+   * or the whole word when it is shorter than that.
+   *
+   * @param lex the word form
+   * @return an array of {@code PREFIX_LENGTH} prefixes, none of them {@code null}
+   */
+  protected static String[] getPrefixes(String lex) {
+    String[] prefs = new String[PREFIX_LENGTH];
+    // Start at 0: starting at 1 left slot 0 null, which surfaced as a "pre=null" feature.
+    for (int li = 0; li < PREFIX_LENGTH; li++) {
+      prefs[li] = lex.substring(0, Math.min(li + 1, lex.length()));
+    }
+    return prefs;
+  }
+
+  /**
+   * Returns the trailing substrings of a word form.
+   * Entry {@code i} holds the last {@code i + 1} characters of {@code lex},
+   * or the whole word when it is shorter than that.
+   *
+   * @param lex the word form
+   * @return an array of {@code SUFFIX_LENGTH} suffixes, none of them {@code null}
+   */
+  protected static String[] getSuffixes(String lex) {
+    String[] suffs = new String[SUFFIX_LENGTH];
+    // Start at 0: starting at 1 left slot 0 null, which surfaced as a "suf=null" feature.
+    for (int li = 0; li < SUFFIX_LENGTH; li++) {
+      suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0));
+    }
+    return suffs;
+  }
+
+  /**
+   * Generic entry point that delegates to
+   * {@link #getContext(int, String[], String[], String[])}.
+   *
+   * @param index the index of the token for which the context is generated
+   * @param sequence the tokens of the sentence
+   * @param priorDecisions the outcomes decided so far for this sequence
+   * @param additionalContext its first element is cast to {@code String[]} and
+   *     used as the POS tags of the sentence
+   * @return the features for the token at {@code index}
+   */
+  public String[] getContext(int index, String[] sequence, String[] priorDecisions, Object[] additionalContext) {
+    return getContext(index, sequence, (String[]) additionalContext[0], priorDecisions);
+  }
+
+  /**
+   * Builds the features for one token: the word form, its POS tag, the previous
+   * decision (alone and combined with tag and word), the suffixes and prefixes of
+   * the word, and flags for a hyphen ("h"), an upper-case letter ("c") and a
+   * digit ("d").
+   *
+   * @param index the index of the token for which the context is generated
+   * @param toks the tokens of the sentence
+   * @param tags the POS tags of the tokens
+   * @param preds the previous decisions; only {@code preds[index - 1]} is read,
+   *     and only when {@code index} is at least 1
+   * @return the features for the token at {@code index}
+   */
+  @Override
+  public String[] getContext(int index, String[] toks, String[] tags, String[] preds) {
+    String lex = toks[index];
+
+    // Previous prediction; "bos" marks the beginning of the sentence.
+    String p_1;
+    if (index < 1) {
+      p_1 = "p_1=bos";
+    }
+    else {
+      p_1 = "p_1=" + preds[index - 1];
+    }
+
+    // Word
+    String w0 = "w0=" + lex;
+    // Tag
+    String t0 = "t0=" + tags[index];
+
+    List<String> features = new ArrayList<String>();
+
+    features.add(w0);
+    features.add(t0);
+    features.add(p_1);
+    features.add(p_1 + t0);
+    features.add(p_1 + w0);
+
+    // do some basic suffix analysis
+    for (String suffix : getSuffixes(lex)) {
+      features.add("suf=" + suffix);
+    }
+
+    for (String prefix : getPrefixes(lex)) {
+      features.add("pre=" + prefix);
+    }
+
+    // see if the word has any special characters
+    if (lex.indexOf('-') != -1) {
+      features.add("h");
+    }
+
+    if (hasCap.matcher(lex).find()) {
+      features.add("c");
+    }
+
+    if (hasNum.matcher(lex).find()) {
+      features.add("d");
+    }
+
+    return features.toArray(new String[features.size()]);
+  }
+}
+

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerSequenceValidator.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerSequenceValidator.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerSequenceValidator.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DefaultLemmatizerSequenceValidator.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,12 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.SequenceValidator;
+
+/**
+ * Default sequence validator for the lemmatizer.
+ * It currently accepts every outcome.
+ */
+public class DefaultLemmatizerSequenceValidator implements SequenceValidator<String> {
+
+  /**
+   * Always reports the sequence as valid; none of the arguments is inspected.
+   *
+   * @return {@code true} in every case
+   */
+  //TODO complete this
+  @Override
+  public boolean validSequence(int i, String[] sequence, String[] s, String outcome) {
+    return true;
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSample.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSample.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSample.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSample.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,89 @@
+package opennlp.tools.lemmatizer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Represents an lemmatized sentence.
+ */
+public class LemmaSample {
+
+  private List<String> tokens;
+
+  private List<String> tags;
+  
+  private final List<String> lemmas;
+
+ /**
+ * Represents one lemma sample.
+ * @param tokens the token
+ * @param tags the postags
+ * @param lemmas the lemmas
+ */
+public LemmaSample(String[] tokens, String[] tags, String[] lemmas) {
+
+    validateArguments(tokens.length, tags.length, lemmas.length);
+
+    this.tokens = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(tokens)));
+    this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(tags)));
+    this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(lemmas)));
+  }
+  
+  public LemmaSample(List<String> tokens, List<String> tags, List<String> lemmas) {
+
+    validateArguments(tokens.size(), tags.size(), lemmas.size());
+
+    this.tokens = Collections.unmodifiableList(new ArrayList<String>((tokens)));
+    this.tags = Collections.unmodifiableList(new ArrayList<String>((tags)));
+    this.lemmas = Collections.unmodifiableList(new ArrayList<String>((lemmas)));
+  }
+
+  public String[] getTokens() {
+    return tokens.toArray(new String[tokens.size()]);
+  }
+
+  public String[] getTags() {
+    return tags.toArray(new String[tags.size()]);
+  }
+  
+  public String[] getLemmas() {
+    return lemmas.toArray(new String[lemmas.size()]);
+  }
+
+  private void validateArguments(int tokensSize, int tagsSize, int lemmasSize) throws IllegalArgumentException {
+    if (tokensSize != tagsSize || tagsSize != lemmasSize) {
+      throw new IllegalArgumentException(
+          "All arrays must have the same length: " +
+              "sentenceSize: " + tokensSize +
+              ", tagsSize: " + tagsSize +
+              ", predsSize: " + lemmasSize + "!");
+    }
+  }
+
+  @Override
+  public String toString() {
+
+        StringBuilder lemmaString = new StringBuilder();
+
+        for (int ci = 0; ci < lemmas.size(); ci++) {
+        lemmaString.append(tokens.get(ci)).append(" ").append(tags.get(ci)).append(" ").append(lemmas.get(ci)).append("\n");
+        }
+        return lemmaString.toString();
+      }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    } else if (obj instanceof LemmaSample) {
+      LemmaSample a = (LemmaSample) obj;
+      return Arrays.equals(getTokens(), a.getTokens())
+          && Arrays.equals(getTags(), a.getTags())
+          && Arrays.equals(getLemmas(), a.getLemmas());
+    } else {
+      return false;
+    }
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,46 @@
+package opennlp.tools.lemmatizer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Event stream that converts {@link LemmaSample}s into the events used for
+ * training a probabilistic lemmatizer.
+ */
+public class LemmaSampleEventStream extends AbstractEventStream<LemmaSample> {
+
+  private LemmatizerContextGenerator contextGenerator;
+
+  /**
+   * Creates a new event stream on top of a sample stream.
+   *
+   * @param d the stream the samples are read from
+   * @param cg the context generator used to build the features of each event
+   */
+  public LemmaSampleEventStream(ObjectStream<LemmaSample> d, LemmatizerContextGenerator cg) {
+    super(d);
+    this.contextGenerator = cg;
+  }
+
+  /**
+   * Creates one event per token of the sample; the outcome of each event is the
+   * lemma of the token and its context comes from the context generator.
+   *
+   * @param sample the sample to convert, may be {@code null}
+   * @return an iterator over the events, empty when {@code sample} is {@code null}
+   */
+  @Override
+  protected Iterator<Event> createEvents(LemmaSample sample) {
+
+    if (sample == null) {
+      return Collections.emptyListIterator();
+    }
+
+    String[] tokens = sample.getTokens();
+    String[] posTags = sample.getTags();
+    String[] lemmas = sample.getLemmas();
+
+    List<Event> events = new ArrayList<Event>();
+    for (int i = 0; i < tokens.length; i++) {
+      String[] context = contextGenerator.getContext(i, tokens, posTags, lemmas);
+      events.add(new Event(lemmas[i], context));
+    }
+    return events.iterator();
+  }
+}
+

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,60 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.Sequence;
+import opennlp.tools.ml.model.SequenceStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Sequence stream that converts each {@link LemmaSample} of an underlying
+ * stream into a {@link Sequence} holding one {@link Event} per token.
+ */
+public class LemmaSampleSequenceStream implements SequenceStream {
+
+  private final ObjectStream<LemmaSample> samples;
+  private final LemmatizerContextGenerator contextGenerator;
+
+  /**
+   * @param samples the stream the samples are read from
+   * @param contextGenerator the context generator used to build the features of each event
+   */
+  public LemmaSampleSequenceStream(ObjectStream<LemmaSample> samples,
+      LemmatizerContextGenerator contextGenerator) {
+    this.samples = samples;
+    this.contextGenerator = contextGenerator;
+  }
+
+  /**
+   * Reads the next sample and converts it into a sequence of events.
+   *
+   * @return the next sequence, or {@code null} when the underlying stream
+   *     returns {@code null}
+   * @throws IOException if reading from the underlying stream fails
+   */
+  @Override
+  public Sequence read() throws IOException {
+    LemmaSample sample = samples.read();
+
+    if (sample != null) {
+      String[] sentence = sample.getTokens();
+      String[] tags = sample.getTags();
+      String[] preds = sample.getLemmas();
+      Event[] events = new Event[sentence.length];
+
+      for (int i = 0; i < sentence.length; i++) {
+        // The gold lemmas are passed as the previous decisions.
+        String[] context = contextGenerator.getContext(i, sentence, tags, preds);
+
+        // The outcome to learn is the lemma, not the POS tag; this matches
+        // the events produced by LemmaSampleEventStream.
+        events[i] = new Event(preds[i], context);
+      }
+      return new Sequence<LemmaSample>(events, sample);
+    }
+
+    return null;
+  }
+
+  /**
+   * Not implemented yet.
+   *
+   * @return always {@code null}
+   */
+  @Override
+  public Event[] updateContext(Sequence sequence, AbstractModel model) {
+    // TODO: Should be implemented for Perceptron sequence learning ...
+    return null;
+  }
+
+  /**
+   * Resets the underlying sample stream.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+  }
+
+  /**
+   * Closes the underlying sample stream.
+   */
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerContextGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerContextGenerator.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerContextGenerator.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerContextGenerator.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,20 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.BeamSearchContextGenerator;
+
+/**
+ * Context generator interface for the probabilistic lemmatizer.
+ */
+public interface LemmatizerContextGenerator extends BeamSearchContextGenerator<String> {
+
+  /**
+   * Returns the contexts for lemmatizing the token at the specified index.
+   *
+   * @param i the index of the token in {@code toks} for which the context is constructed
+   * @param toks the tokens of the sentence
+   * @param tags the POS tags of the tokens
+   * @param preds the previous decisions made for this sequence; only indices
+   *     less than {@code i} will be examined
+   * @return an array of predictive contexts on which a model bases its decisions
+   */
+  String[] getContext(int i, String[] toks, String[] tags, String[] preds);
+}
+

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/package-info.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/package-info.java?rev=1731084&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/package-info.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/package-info.java Thu Feb 18 15:06:59 2016
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Package related to the lemmatizer tool.
+ */
+package opennlp.tools.lemmatizer;
\ No newline at end of file