You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by to...@apache.org on 2018/01/09 14:32:28 UTC
[opennlp] branch master updated: OPENNLP-1180 - LM API switches to
String[] (#304)
This is an automated email from the ASF dual-hosted git repository.
tommaso pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new e24f0e7 OPENNLP-1180 - LM API switches to String[] (#304)
e24f0e7 is described below
commit e24f0e78dbeef99bc09a34ee0663e085cb00b8a1
Author: Tommaso Teofili <to...@gmail.com>
AuthorDate: Tue Jan 9 15:32:25 2018 +0100
OPENNLP-1180 - LM API switches to String[] (#304)
---
.../languagemodel/NGramLanguageModelTool.java | 13 +--
.../opennlp/tools/languagemodel/LanguageModel.java | 24 +++++-
.../tools/languagemodel/NGramLanguageModel.java | 52 +++++++++++-
.../main/java/opennlp/tools/ngram/NGramUtils.java | 26 +++++-
.../languagemodel/LanguageModelEvaluationTest.java | 16 ++--
.../languagemodel/LanguageModelTestUtils.java | 17 ++--
.../languagemodel/NgramLanguageModelTest.java | 99 +++++++++++-----------
7 files changed, 169 insertions(+), 78 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java
index 1c599c5..e5d88c0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java
@@ -19,6 +19,7 @@ package opennlp.tools.cmdline.languagemodel;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.util.Arrays;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CLI;
@@ -28,7 +29,6 @@ import opennlp.tools.cmdline.SystemInputStreamFactory;
import opennlp.tools.languagemodel.NGramLanguageModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.StringList;
/**
* Command line tool for {@link opennlp.tools.languagemodel.NGramLanguageModel}.
@@ -60,19 +60,20 @@ public class NGramLanguageModelTool extends BasicCmdLineTool {
String line;
while ((line = lineStream.read()) != null) {
double probability;
- StringList predicted;
+ String[] predicted;
+ // TODO : use a Tokenizer here
String[] tokens = line.split(" ");
- StringList sample = new StringList(tokens);
try {
- probability = nGramLanguageModel.calculateProbability(sample);
- predicted = nGramLanguageModel.predictNextTokens(sample);
+ probability = nGramLanguageModel.calculateProbability(tokens);
+ predicted = nGramLanguageModel.predictNextTokens(tokens);
} catch (Exception e) {
System.err.println("Error:" + e.getLocalizedMessage());
System.err.println(line);
continue;
}
- System.out.println(sample + " -> prob:" + probability + ", next:" + predicted);
+ System.out.println(Arrays.toString(tokens) + " -> prob:" + probability + ", " +
+ "next:" + Arrays.toString(predicted));
perfMon.incrementCounter();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
index 98dde4e..8366925 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
@@ -26,19 +26,39 @@ import opennlp.tools.util.StringList;
public interface LanguageModel {
/**
- * Calculate the probability of a series of tokens (e.g. a sentence), given a vocabulary
+ * Calculate the probability of a series of tokens (e.g. a sentence), given a vocabulary.
*
* @param tokens the text tokens to calculate the probability for
* @return the probability of the given text tokens in the vocabulary
+ * @deprecated use {@link #calculateProbability(String...)}
*/
+ @Deprecated
double calculateProbability(StringList tokens);
/**
- * Predict the most probable output sequence of tokens, given an input sequence of tokens
+ * Calculate the probability of a series of tokens (e.g. a sentence), given a vocabulary.
+ *
+ * @param tokens the text tokens to calculate the probability for
+ * @return the probability of the given text tokens in the vocabulary
+ */
+ double calculateProbability(String... tokens);
+
+ /**
+ * Predict the most probable output sequence of tokens, given an input sequence of tokens.
*
* @param tokens a sequence of tokens
* @return the most probable subsequent token sequence
+ * @deprecated use {@link #predictNextTokens(String...)}
*/
+ @Deprecated
StringList predictNextTokens(StringList tokens);
+ /**
+ * Predict the most probable output sequence of tokens, given an input sequence of tokens.
+ *
+ * @param tokens a sequence of tokens
+ * @return the most probable subsequent token sequence
+ */
+ String[] predictNextTokens(String... tokens);
+
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
index 501c1bc..e9d25d5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
@@ -52,15 +52,37 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
this.n = n;
}
+ public void add(String... tokens) {
+ add(new StringList(tokens), 1, n);
+ }
+
@Override
- public double calculateProbability(StringList sample) {
+ public double calculateProbability(StringList tokens) {
double probability = 0d;
if (size() > 0) {
- for (StringList ngram : NGramUtils.getNGrams(sample, n)) {
+ for (StringList ngram : NGramUtils.getNGrams(tokens, n)) {
double score = stupidBackoff(ngram);
probability += Math.log(score);
if (Double.isNaN(probability)) {
probability = 0d;
+ break;
+ }
+ }
+ probability = Math.exp(probability);
+ }
+ return probability;
+ }
+
+ @Override
+ public double calculateProbability(String... tokens) {
+ double probability = 0d;
+ if (size() > 0) {
+ for (String[] ngram : NGramUtils.getNGrams(tokens, n)) {
+ double score = stupidBackoff(new StringList(ngram));
+ probability += Math.log(score);
+ if (Double.isNaN(probability)) {
+ probability = 0d;
+ break;
}
}
probability = Math.exp(probability);
@@ -92,6 +114,32 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
return token;
}
+ @Override
+ public String[] predictNextTokens(String... tokens) {
+ double maxProb = Double.NEGATIVE_INFINITY;
+ String[] token = null;
+
+ for (StringList ngram : this) {
+ String[] sequence = new String[ngram.size() + tokens.length];
+ for (int i = 0; i < tokens.length; i++) {
+ sequence[i] = tokens[i];
+ }
+ for (int i = 0; i < ngram.size(); i++) {
+ sequence[i + tokens.length] = ngram.getToken(i);
+ }
+ double v = calculateProbability(sequence);
+ if (v > maxProb) {
+ maxProb = v;
+ token = new String[ngram.size()];
+ for (int i = 0; i < ngram.size(); i++) {
+ token[i] = ngram.getToken(i);
+ }
+ }
+ }
+
+ return token;
+ }
+
private double stupidBackoff(StringList ngram) {
int count = getCount(ngram);
StringList nMinusOneToken = NGramUtils.getNMinusOneTokenFirst(ngram);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java
index e41291f..dd3e19b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java
@@ -229,7 +229,7 @@ public class NGramUtils {
}
/**
- * get the ngrams of dimension n of a certain input sequence of tokens
+ * Get the ngrams of dimension n of a certain input sequence of tokens.
*
* @param sequence a sequence of tokens
* @param size the size of the resulting ngrams
@@ -249,6 +249,30 @@ public class NGramUtils {
ngrams.add(new StringList(ngram));
}
}
+ return ngrams;
+ }
+
+ /**
+ * Get the ngrams of dimension n of a certain input sequence of tokens.
+ *
+ * @param sequence a sequence of tokens
+ * @param size the size of the resulting ngrams
+ * @return all the possible ngrams of the given size derivable from the input sequence
+ */
+ public static Collection<String[]> getNGrams(String[] sequence, int size) {
+ Collection<String[]> ngrams = new LinkedList<>();
+ if (size == -1 || size >= sequence.length) {
+ ngrams.add(sequence);
+ } else {
+ for (int i = 0; i < sequence.length - size + 1; i++) {
+ String[] ngram = new String[size];
+ ngram[0] = sequence[i];
+ for (int j = 1; j < size; j++) {
+ ngram[j] = sequence[i + j];
+ }
+ ngrams.add(ngram);
+ }
+ }
return ngrams;
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
index d4e8e37..eea0eb6 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
@@ -32,29 +32,29 @@ public class LanguageModelEvaluationTest {
@Test
public void testPerplexityComparison() throws Exception {
- Collection<StringList> trainingVocabulary =
+ Collection<String[]> trainingVocabulary =
LanguageModelTestUtils.generateRandomVocabulary(1100000);
- Collection<StringList> testVocabulary =
+ Collection<String[]> testVocabulary =
LanguageModelTestUtils.generateRandomVocabulary(100);
NGramLanguageModel unigramLM = new NGramLanguageModel(1);
- for (StringList sentence : trainingVocabulary) {
- unigramLM.add(sentence, 1, 1);
+ for (String[] sentence : trainingVocabulary) {
+ unigramLM.add(new StringList(sentence), 1, 1);
}
double unigramPerplexity =
LanguageModelTestUtils.getPerplexity(unigramLM, testVocabulary, 1);
NGramLanguageModel bigramLM = new NGramLanguageModel(2);
- for (StringList sentence : trainingVocabulary) {
- bigramLM.add(sentence, 1, 2);
+ for (String[] sentence : trainingVocabulary) {
+ bigramLM.add(new StringList(sentence), 1, 2);
}
double bigramPerplexity =
LanguageModelTestUtils.getPerplexity(bigramLM, testVocabulary, 2);
Assert.assertTrue(unigramPerplexity >= bigramPerplexity);
NGramLanguageModel trigramLM = new NGramLanguageModel(3);
- for (StringList sentence : trainingVocabulary) {
- trigramLM.add(sentence, 1, 3);
+ for (String[] sentence : trainingVocabulary) {
+ trigramLM.add(new StringList(sentence), 1, 3);
}
double trigramPerplexity =
LanguageModelTestUtils.getPerplexity(trigramLM, testVocabulary, 3);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
index 81725ae..56edb9e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
@@ -26,7 +26,6 @@ import java.util.Random;
import org.junit.Ignore;
import opennlp.tools.ngram.NGramUtils;
-import opennlp.tools.util.StringList;
/**
* Utility class for language models tests
@@ -39,16 +38,16 @@ public class LanguageModelTestUtils {
private static final char[] chars = new char[]{'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'};
- public static Collection<StringList> generateRandomVocabulary(int size) {
- Collection<StringList> vocabulary = new LinkedList<>();
+ public static Collection<String[]> generateRandomVocabulary(int size) {
+ Collection<String[]> vocabulary = new LinkedList<>();
for (int i = 0; i < size; i++) {
- StringList sentence = generateRandomSentence();
+ String[] sentence = generateRandomSentence();
vocabulary.add(sentence);
}
return vocabulary;
}
- public static StringList generateRandomSentence() {
+ public static String[] generateRandomSentence() {
int dimension = r.nextInt(10) + 1;
String[] sentence = new String[dimension];
for (int j = 0; j < dimension; j++) {
@@ -56,15 +55,15 @@ public class LanguageModelTestUtils {
char c = chars[i];
sentence[j] = c + "-" + c + "-" + c;
}
- return new StringList(sentence);
+ return sentence;
}
- public static double getPerplexity(LanguageModel lm, Collection<StringList> testSet, int ngramSize)
+ public static double getPerplexity(LanguageModel lm, Collection<String[]> testSet, int ngramSize)
throws ArithmeticException {
BigDecimal perplexity = new BigDecimal(1d);
- for (StringList sentence : testSet) {
- for (StringList ngram : NGramUtils.getNGrams(sentence, ngramSize)) {
+ for (String[] sentence : testSet) {
+ for (String[] ngram : NGramUtils.getNGrams(sentence, ngramSize)) {
double ngramProbability = lm.calculateProbability(ngram);
perplexity = perplexity.multiply(new BigDecimal(1d).divide(
new BigDecimal(ngramProbability), CONTEXT));
diff --git a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
index 2ac1f5e..2091d3f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
@@ -26,7 +26,6 @@ import org.junit.Assert;
import org.junit.Test;
import opennlp.tools.ngram.NGramGenerator;
-import opennlp.tools.util.StringList;
/**
* Tests for {@link opennlp.tools.languagemodel.NGramLanguageModel}
@@ -34,19 +33,19 @@ import opennlp.tools.util.StringList;
public class NgramLanguageModelTest {
@Test
- public void testEmptyVocabularyProbability() throws Exception {
+ public void testEmptyVocabularyProbability() {
NGramLanguageModel model = new NGramLanguageModel();
Assert.assertEquals("probability with an empty vocabulary is always 0",
- 0d, model.calculateProbability(new StringList("")), 0d);
+ 0d, model.calculateProbability(""), 0d);
Assert.assertEquals("probability with an empty vocabulary is always 0",
- 0d, model.calculateProbability(new StringList("1", "2", "3")), 0d);
+ 0d, model.calculateProbability("1", "2", "3"), 0d);
}
@Test
- public void testRandomVocabularyAndSentence() throws Exception {
+ public void testRandomVocabularyAndSentence() {
NGramLanguageModel model = new NGramLanguageModel();
- for (StringList sentence : LanguageModelTestUtils.generateRandomVocabulary(10)) {
- model.add(sentence, 1, 3);
+ for (String[] sentence : LanguageModelTestUtils.generateRandomVocabulary(10)) {
+ model.add(sentence);
}
double probability = model.calculateProbability(LanguageModelTestUtils.generateRandomSentence());
Assert.assertTrue("a probability measure should be between 0 and 1 [was "
@@ -54,82 +53,82 @@ public class NgramLanguageModelTest {
}
@Test
- public void testNgramModel() throws Exception {
+ public void testNgramModel() {
NGramLanguageModel model = new NGramLanguageModel(4);
- model.add(new StringList("I", "saw", "the", "fox"), 1, 4);
- model.add(new StringList("the", "red", "house"), 1, 4);
- model.add(new StringList("I", "saw", "something", "nice"), 1, 2);
- double probability = model.calculateProbability(new StringList("I", "saw", "the", "red", "house"));
+ model.add("I", "saw", "the", "fox");
+ model.add("the", "red", "house");
+ model.add("I", "saw", "something", "nice");
+ double probability = model.calculateProbability("I", "saw", "the", "red", "house");
Assert.assertTrue("a probability measure should be between 0 and 1 [was "
+ probability + "]", probability >= 0 && probability <= 1);
- StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+ String[] tokens = model.predictNextTokens("I", "saw");
Assert.assertNotNull(tokens);
- Assert.assertEquals(new StringList("the", "fox"), tokens);
+ Assert.assertArrayEquals(new String[] {"the", "fox"}, tokens);
}
@Test
- public void testBigramProbabilityNoSmoothing() throws Exception {
+ public void testBigramProbability() {
NGramLanguageModel model = new NGramLanguageModel(2);
- model.add(new StringList("<s>", "I", "am", "Sam", "</s>"), 1, 2);
- model.add(new StringList("<s>", "Sam", "I", "am", "</s>"), 1, 2);
- model.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"), 1, 2);
- double probability = model.calculateProbability(new StringList("<s>", "I"));
+ model.add("<s>", "I", "am", "Sam", "</s>");
+ model.add("<s>", "Sam", "I", "am", "</s>");
+ model.add("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>");
+ double probability = model.calculateProbability("<s>", "I");
Assert.assertEquals(0.666d, probability, 0.001);
- probability = model.calculateProbability(new StringList("Sam", "</s>"));
+ probability = model.calculateProbability("Sam", "</s>");
Assert.assertEquals(0.5d, probability, 0.001);
- probability = model.calculateProbability(new StringList("<s>", "Sam"));
+ probability = model.calculateProbability("<s>", "Sam");
Assert.assertEquals(0.333d, probability, 0.001);
- probability = model.calculateProbability(new StringList("am", "Sam"));
+ probability = model.calculateProbability("am", "Sam");
Assert.assertEquals(0.5d, probability, 0.001);
- probability = model.calculateProbability(new StringList("I", "am"));
+ probability = model.calculateProbability("I", "am");
Assert.assertEquals(0.666d, probability, 0.001);
- probability = model.calculateProbability(new StringList("I", "do"));
+ probability = model.calculateProbability("I", "do");
Assert.assertEquals(0.333d, probability, 0.001);
- probability = model.calculateProbability(new StringList("I", "am", "Sam"));
+ probability = model.calculateProbability("I", "am", "Sam");
Assert.assertEquals(0.333d, probability, 0.001);
}
@Test
- public void testTrigram() throws Exception {
+ public void testTrigram() {
NGramLanguageModel model = new NGramLanguageModel(3);
- model.add(new StringList("I", "see", "the", "fox"), 1, 3);
- model.add(new StringList("the", "red", "house"), 1, 3);
- model.add(new StringList("I", "saw", "something", "nice"), 1, 3);
- double probability = model.calculateProbability(new StringList("I", "saw", "the", "red", "house"));
+ model.add("I", "see", "the", "fox");
+ model.add("the", "red", "house");
+ model.add("I", "saw", "something", "nice");
+ double probability = model.calculateProbability("I", "saw", "the", "red", "house");
Assert.assertTrue("a probability measure should be between 0 and 1 [was "
+ probability + "]", probability >= 0 && probability <= 1);
- StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+ String[] tokens = model.predictNextTokens("I", "saw");
Assert.assertNotNull(tokens);
- Assert.assertEquals(new StringList("something"), tokens);
+ Assert.assertArrayEquals(new String[] {"something"}, tokens);
}
@Test
- public void testBigram() throws Exception {
+ public void testBigram() {
NGramLanguageModel model = new NGramLanguageModel(2);
- model.add(new StringList("I", "see", "the", "fox"), 1, 2);
- model.add(new StringList("the", "red", "house"), 1, 2);
- model.add(new StringList("I", "saw", "something", "nice"), 1, 2);
- double probability = model.calculateProbability(new StringList("I", "saw", "the", "red", "house"));
+ model.add("I", "see", "the", "fox");
+ model.add("the", "red", "house");
+ model.add("I", "saw", "something", "nice");
+ double probability = model.calculateProbability("I", "saw", "the", "red", "house");
Assert.assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]",
probability >= 0 && probability <= 1);
- StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+ String[] tokens = model.predictNextTokens("I", "saw");
Assert.assertNotNull(tokens);
- Assert.assertEquals(new StringList("something"), tokens);
+ Assert.assertArrayEquals(new String[] {"something"}, tokens);
}
@Test
public void testSerializedNGramLanguageModel() throws Exception {
NGramLanguageModel languageModel = new NGramLanguageModel(getClass().getResourceAsStream(
"/opennlp/tools/ngram/ngram-model.xml"), 3);
- double probability = languageModel.calculateProbability(new StringList("The", "brown", "fox", "jumped"));
+ double probability = languageModel.calculateProbability("The", "brown", "fox", "jumped");
Assert.assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]",
probability >= 0 && probability <= 1);
- StringList tokens = languageModel.predictNextTokens(new StringList("the","brown","fox"));
+ String[] tokens = languageModel.predictNextTokens("the", "brown", "fox");
Assert.assertNotNull(tokens);
- Assert.assertEquals(new StringList("jumped"), tokens);
+ Assert.assertArrayEquals(new String[] {"jumped"}, tokens);
}
@Test
@@ -144,18 +143,18 @@ public class NgramLanguageModelTest {
for (String generatedString : generatedStrings) {
String[] tokens = generatedString.split(" ");
if (tokens.length > 0) {
- languageModel.add(new StringList(tokens), 1, ngramSize);
+ languageModel.add(tokens);
}
}
}
- StringList tokens = languageModel.predictNextTokens(new StringList("neural",
- "network", "language"));
+ String[] tokens = languageModel.predictNextTokens("neural",
+ "network", "language");
Assert.assertNotNull(tokens);
- Assert.assertEquals(new StringList("models"), tokens);
- double p1 = languageModel.calculateProbability(new StringList("neural", "network",
- "language", "models"));
- double p2 = languageModel.calculateProbability(new StringList("neural", "network",
- "language", "model"));
+ Assert.assertArrayEquals(new String[] {"models"}, tokens);
+ double p1 = languageModel.calculateProbability("neural", "network",
+ "language", "models");
+ double p2 = languageModel.calculateProbability("neural", "network",
+ "language", "model");
Assert.assertTrue(p1 > p2);
}
}
--
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].