You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2017/02/07 08:49:32 UTC
[1/2] opennlp git commit: OPENNLP-904 add function to LemmatizerME to
get every lemma for a token and pos tag combination
Repository: opennlp
Updated Branches:
refs/heads/904 f65d4d3ef -> 0e7c49aeb
OPENNLP-904 add function to LemmatizerME to get every lemma for a token and pos tag combination
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/53cd0ddf
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/53cd0ddf
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/53cd0ddf
Branch: refs/heads/904
Commit: 53cd0ddf5e3d38247ecce6585a3fc53bea74463a
Parents: f65d4d3
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Mon Feb 6 23:42:29 2017 +0100
Committer: Rodrigo Agerri <ra...@apache.org>
Committed: Mon Feb 6 23:42:29 2017 +0100
----------------------------------------------------------------------
.../cmdline/lemmatizer/LemmatizerMETool.java | 4 +--
.../tools/lemmatizer/LemmaSampleStream.java | 1 -
.../opennlp/tools/lemmatizer/LemmatizerME.java | 35 ++++++++++++++++----
3 files changed, 30 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
index 13f28b2..9390376 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java
@@ -72,10 +72,8 @@ public class LemmatizerMETool extends BasicCmdLineTool {
continue;
}
- String[] preds = lemmatizer.lemmatize(posSample.getSentence(),
+ String[] lemmas = lemmatizer.lemmatize(posSample.getSentence(),
posSample.getTags());
- String[] lemmas = lemmatizer.decodeLemmas(posSample.getSentence(),
- preds);
System.out.println(new LemmaSample(posSample.getSentence(),
posSample.getTags(), lemmas).toString());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
index 0704026..9c661a5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
@@ -50,7 +50,6 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> {
else {
toks.add(parts[0]);
tags.add(parts[1]);
- //String ses = StringUtil.getShortestEditScript(parts[0], parts[2]);
preds.add(parts[2]);
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index bb6a0b6..4ee924b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -18,10 +18,7 @@
package opennlp.tools.lemmatizer;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import opennlp.tools.ml.BeamSearch;
import opennlp.tools.ml.EventModelSequenceTrainer;
@@ -47,6 +44,7 @@ import opennlp.tools.util.TrainingParameters;
*/
public class LemmatizerME implements Lemmatizer {
+ public static final int LEMMA_NUMBER = 29;
public static final int DEFAULT_BEAM_SIZE = 3;
protected int beamSize;
private Sequence bestSequence;
@@ -93,7 +91,14 @@ public class LemmatizerME implements Lemmatizer {
@Override public List<List<String>> lemmatize(List<String> toks,
List<String> tags) {
- return null;
+ String[] tokens = toks.toArray(new String[toks.size()]);
+ String[] posTags = tags.toArray(new String[tags.size()]);
+ String[][] allLemmas = predictLemmas(LEMMA_NUMBER, tokens, posTags);
+ List<List<String>> predictedLemmas = new ArrayList<>();
+ for (int i = 0; i < allLemmas.length; i++) {
+ predictedLemmas.add(Arrays.asList(allLemmas[i]));
+ }
+ return predictedLemmas;
}
/**
@@ -109,6 +114,25 @@ public class LemmatizerME implements Lemmatizer {
}
/**
+ * Predict all possible lemmas (using a default upper bound).
+ * @param numLemmas the default number of lemmas
+ * @param toks the tokens
+ * @param tags the postags
+ * @return a double array containing all possible lemmas for each token and postag pair
+ */
+ public String[][] predictLemmas(int numLemmas, String[] toks, String[] tags) {
+ Sequence[] bestSequences = model.bestSequences(numLemmas, toks, new Object[] {tags},
+ contextGenerator, sequenceValidator);
+ String[][] allLemmas = new String[bestSequences.length][];
+ for (int i = 0; i < allLemmas.length; i++) {
+ List<String> ses = bestSequences[i].getOutcomes();
+ String[] sesArray = ses.toArray(new String[ses.size()]);
+ allLemmas[i] = decodeLemmas(toks,sesArray);
+ }
+ return allLemmas;
+ }
+
+ /**
* Decodes the lemma from the word and the induced lemma class.
* @param toks the array of tokens
* @param preds the predicted lemma classes
@@ -118,7 +142,6 @@ public class LemmatizerME implements Lemmatizer {
List<String> lemmas = new ArrayList<>();
for (int i = 0; i < toks.length; i++) {
String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]);
- //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " + preds[i] + " " + lemma);
if (lemma.length() == 0) {
lemma = "_";
}
[2/2] opennlp git commit: OPENNLP-904 improve import minor change
Posted by ra...@apache.org.
OPENNLP-904 improve import minor change
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/0e7c49ae
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/0e7c49ae
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/0e7c49ae
Branch: refs/heads/904
Commit: 0e7c49aeb89e718d5e439a9efedf969f84826ee8
Parents: 53cd0dd
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Tue Feb 7 09:49:16 2017 +0100
Committer: Rodrigo Agerri <ra...@apache.org>
Committed: Tue Feb 7 09:49:16 2017 +0100
----------------------------------------------------------------------
.../src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/0e7c49ae/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index 4ee924b..34bfa87 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -18,7 +18,11 @@
package opennlp.tools.lemmatizer;
import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
import opennlp.tools.ml.BeamSearch;
import opennlp.tools.ml.EventModelSequenceTrainer;