You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2017/03/03 08:19:29 UTC
opennlp git commit: OPENNLP-904 Harmonize lemmatizer API and function
to get multiple lemmas
Repository: opennlp
Updated Branches:
refs/heads/master ebf108170 -> 8a3b3b537
OPENNLP-904 Harmonize lemmatizer API and function to get multiple lemmas
OPENNLP-904 add minor correction after PR comment
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/8a3b3b53
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/8a3b3b53
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/8a3b3b53
Branch: refs/heads/master
Commit: 8a3b3b537a30b14c4ffb5eb32ffa41d5027bddad
Parents: ebf1081
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Fri Feb 3 16:00:38 2017 +0100
Committer: Rodrigo Agerri <ra...@apache.org>
Committed: Fri Mar 3 09:16:20 2017 +0100
----------------------------------------------------------------------
.../cmdline/lemmatizer/LemmatizerMETool.java | 4 +-
.../tools/lemmatizer/DictionaryLemmatizer.java | 70 ++++++++++++++------
.../lemmatizer/LemmaSampleEventStream.java | 2 +-
.../tools/lemmatizer/LemmaSampleStream.java | 4 +-
.../opennlp/tools/lemmatizer/Lemmatizer.java | 16 ++++-
.../opennlp/tools/lemmatizer/LemmatizerME.java | 64 ++++++++++++++++--
.../tools/lemmatizer/DummyLemmatizer.java | 7 ++
.../tools/lemmatizer/LemmatizerMETest.java | 3 +-
8 files changed, 136 insertions(+), 34 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
index e4e47b5..90ba95d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
@@ -72,10 +72,8 @@ public class LemmatizerMETool extends BasicCmdLineTool {
continue;
}
- String[] preds = lemmatizer.lemmatize(posSample.getSentence(),
+ String[] lemmas = lemmatizer.lemmatize(posSample.getSentence(),
posSample.getTags());
- String[] lemmas = lemmatizer.decodeLemmas(posSample.getSentence(),
- preds);
System.out.println(new LemmaSample(posSample.getSentence(),
posSample.getTags(), lemmas).toString());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index b1b04a1..9f0b0b0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -37,7 +37,7 @@ public class DictionaryLemmatizer implements Lemmatizer {
/**
* The hashmap containing the dictionary.
*/
- private final Map<List<String>, String> dictMap;
+ private final Map<List<String>, List<String>> dictMap;
/**
* Construct a hashmap from the input tab separated dictionary.
@@ -47,26 +47,24 @@ public class DictionaryLemmatizer implements Lemmatizer {
* @param dictionary
* the input dictionary via inputstream
*/
- public DictionaryLemmatizer(final InputStream dictionary) {
+ public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
this.dictMap = new HashMap<>();
- final BufferedReader breader = new BufferedReader(new InputStreamReader(dictionary));
+ final BufferedReader breader = new BufferedReader(
+ new InputStreamReader(dictionary));
String line;
- try {
- while ((line = breader.readLine()) != null) {
- final String[] elems = line.split("\t");
- this.dictMap.put(Arrays.asList(elems[0], elems[1]), elems[2]);
- }
- } catch (final IOException e) {
- e.printStackTrace();
+ while ((line = breader.readLine()) != null) {
+ final String[] elems = line.split("\t");
+ this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(elems[2]));
}
}
+
/**
* Get the Map containing the dictionary.
*
* @return dictMap the Map
*/
- public Map<List<String>, String> getDictMap() {
+ public Map<List<String>, List<String>> getDictMap() {
return this.dictMap;
}
@@ -85,31 +83,65 @@ public class DictionaryLemmatizer implements Lemmatizer {
return keys;
}
+
public String[] lemmatize(final String[] tokens, final String[] postags) {
List<String> lemmas = new ArrayList<>();
for (int i = 0; i < tokens.length; i++) {
- lemmas.add(this.apply(tokens[i], postags[i]));
+ lemmas.add(this.lemmatize(tokens[i], postags[i]));
}
return lemmas.toArray(new String[lemmas.size()]);
}
+ public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) {
+ List<List<String>> allLemmas = new ArrayList<>();
+ for (int i = 0; i < tokens.size(); i++) {
+ allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i)));
+ }
+ return allLemmas;
+ }
+
/**
* Lookup lemma in a dictionary. Outputs "O" if not found.
- * @param word the token
- * @param postag the postag
+ *
+ * @param word
+ * the token
+ * @param postag
+ * the postag
* @return the lemma
*/
- public String apply(final String word, final String postag) {
+ private String lemmatize(final String word, final String postag) {
String lemma;
final List<String> keys = this.getDictKeys(word, postag);
// lookup lemma as value of the map
- final String keyValue = this.dictMap.get(keys);
- if (keyValue != null) {
- lemma = keyValue;
+ final List<String> keyValues = this.dictMap.get(keys);
+ if (!keyValues.isEmpty()) {
+ lemma = keyValues.get(0);
} else {
lemma = "O";
}
return lemma;
}
-}
+ /**
+ * Look up every lemma for a word and pos tag pair in a dictionary. Outputs
+ * "O" if not found.
+ *
+ * @param word
+ * the token
+ * @param postag
+ * the postag
+ * @return every lemma
+ */
+ private List<String> getAllLemmas(final String word, final String postag) {
+ List<String> lemmasList = new ArrayList<>();
+ final List<String> keys = this.getDictKeys(word, postag);
+ // lookup lemma as value of the map
+ final List<String> keyValues = this.dictMap.get(keys);
+ if (!keyValues.isEmpty()) {
+ lemmasList.addAll(keyValues);
+ } else {
+ lemmasList.add("O");
+ }
+ return lemmasList;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
index fc1a558..a8d71e8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
@@ -49,7 +49,7 @@ public class LemmaSampleEventStream extends AbstractEventStream<LemmaSample> {
List<Event> events = new ArrayList<>();
String[] toksArray = sample.getTokens();
String[] tagsArray = sample.getTags();
- String[] lemmasArray = sample.getLemmas();
+ String[] lemmasArray = LemmatizerME.encodeLemmas(toksArray,sample.getLemmas());
for (int ei = 0, el = sample.getTokens().length; ei < el; ei++) {
events.add(new Event(lemmasArray[ei],
contextGenerator.getContext(ei,toksArray,tagsArray,lemmasArray)));
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
index 0a133c3..9c661a5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
@@ -23,7 +23,6 @@ import java.util.List;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.StringUtil;
/**
@@ -51,8 +50,7 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> {
else {
toks.add(parts[0]);
tags.add(parts[1]);
- String ses = StringUtil.getShortestEditScript(parts[0], parts[2]);
- preds.add(ses);
+ preds.add(parts[2]);
}
}
if (toks.size() > 0) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
index f21f9e3..933eec1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
@@ -17,19 +17,31 @@
package opennlp.tools.lemmatizer;
+import java.util.List;
+
/**
* The interface for lemmatizers.
*/
public interface Lemmatizer {
/**
- * Generates lemma tags for the word and postag returning the result in an array.
+ * Generates lemmas for the word and postag returning the result in an array.
*
* @param toks an array of the tokens
* @param tags an array of the pos tags
*
- * @return an array of lemma classes for each token in the sequence.
+ * @return an array of possible lemmas for each token in the sequence.
*/
String[] lemmatize(String[] toks, String[] tags);
+ /**
+ * Generates lemmas for the word and postag returning the result in a list
+ * of every possible lemma for each token and postag.
+ *
+ * @param toks a list of the tokens
+ * @param tags a list of the pos tags
+ * @return a list of every possible lemma for each token in the sequence.
+ */
+ List<List<String>> lemmatize(List<String> toks, List<String> tags);
+
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index 4855fda..2b8122f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -47,6 +48,7 @@ import opennlp.tools.util.TrainingParameters;
*/
public class LemmatizerME implements Lemmatizer {
+ public static final int LEMMA_NUMBER = 29;
public static final int DEFAULT_BEAM_SIZE = 3;
protected int beamSize;
private Sequence bestSequence;
@@ -86,9 +88,52 @@ public class LemmatizerME implements Lemmatizer {
}
public String[] lemmatize(String[] toks, String[] tags) {
+ String[] ses = predictSES(toks, tags);
+ String[] lemmas = decodeLemmas(toks, ses);
+ return lemmas;
+ }
+
+ @Override public List<List<String>> lemmatize(List<String> toks,
+ List<String> tags) {
+ String[] tokens = toks.toArray(new String[toks.size()]);
+ String[] posTags = tags.toArray(new String[tags.size()]);
+ String[][] allLemmas = predictLemmas(LEMMA_NUMBER, tokens, posTags);
+ List<List<String>> predictedLemmas = new ArrayList<>();
+ for (int i = 0; i < allLemmas.length; i++) {
+ predictedLemmas.add(Arrays.asList(allLemmas[i]));
+ }
+ return predictedLemmas;
+ }
+
+ /**
+ * Predict Short Edit Script (automatically induced lemma class).
+ * @param toks the array of tokens
+ * @param tags the array of pos tags
+ * @return an array containing the lemma classes
+ */
+ public String[] predictSES(String[] toks, String[] tags) {
bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator);
- List<String> c = bestSequence.getOutcomes();
- return c.toArray(new String[c.size()]);
+ List<String> ses = bestSequence.getOutcomes();
+ return ses.toArray(new String[ses.size()]);
+ }
+
+ /**
+ * Predict all possible lemmas (using a default upper bound).
+ * @param numLemmas the default number of lemmas
+ * @param toks the tokens
+ * @param tags the postags
+ * @return a two-dimensional array containing all possible lemmas for each token and postag pair
+ */
+ public String[][] predictLemmas(int numLemmas, String[] toks, String[] tags) {
+ Sequence[] bestSequences = model.bestSequences(numLemmas, toks, new Object[] {tags},
+ contextGenerator, sequenceValidator);
+ String[][] allLemmas = new String[bestSequences.length][];
+ for (int i = 0; i < allLemmas.length; i++) {
+ List<String> ses = bestSequences[i].getOutcomes();
+ String[] sesArray = ses.toArray(new String[ses.size()]);
+ allLemmas[i] = decodeLemmas(toks,sesArray);
+ }
+ return allLemmas;
}
/**
@@ -97,11 +142,10 @@ public class LemmatizerME implements Lemmatizer {
* @param preds the predicted lemma classes
* @return the array of decoded lemmas
*/
- public String[] decodeLemmas(String[] toks, String[] preds) {
+ public static String[] decodeLemmas(String[] toks, String[] preds) {
List<String> lemmas = new ArrayList<>();
for (int i = 0; i < toks.length; i++) {
String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]);
- //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " + preds[i] + " " + lemma);
if (lemma.length() == 0) {
lemma = "_";
}
@@ -110,6 +154,18 @@ public class LemmatizerME implements Lemmatizer {
return lemmas.toArray(new String[lemmas.size()]);
}
+ public static String[] encodeLemmas(String[] toks, String[] lemmas) {
+ List<String> sesList = new ArrayList<>();
+ for (int i = 0; i < toks.length; i++) {
+ String ses = StringUtil.getShortestEditScript(toks[i], lemmas[i]);
+ if (ses.length() == 0) {
+ ses = "_";
+ }
+ sesList.add(ses);
+ }
+ return sesList.toArray(new String[sesList.size()]);
+ }
+
public Sequence[] topKSequences(String[] sentence, String[] tags) {
return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
new Object[] { tags }, contextGenerator, sequenceValidator);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
index 489ba38..dcfc883 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
@@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer;
import java.io.IOException;
import java.util.Arrays;
+import java.util.List;
/**
* This dummy lemmatizer implementation simulates a LemmatizerME. The file has
@@ -56,4 +57,10 @@ public class DummyLemmatizer implements Lemmatizer {
}
}
+ @Override
+ public List<List<String>> lemmatize(List<String> toks,
+ List<String> tags) {
+ return null;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8a3b3b53/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index 76b4cd5..97dcc3c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -82,8 +82,7 @@ public class LemmatizerMETest {
@Test
public void testLemmasAsArray() throws Exception {
- String[] preds = lemmatizer.lemmatize(tokens, postags);
- String[] lemmas = lemmatizer.decodeLemmas(tokens, preds);
+ String[] lemmas = lemmatizer.lemmatize(tokens, postags);
Assert.assertArrayEquals(expect, lemmas);
}