You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/06/14 13:56:02 UTC
opennlp git commit: Changed the LD factory to allow customization
Repository: opennlp
Updated Branches:
refs/heads/LangDetect 260f52f06 -> 8d731904b
Changed the LD factory to allow customization
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/8d731904
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/8d731904
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/8d731904
Branch: refs/heads/LangDetect
Commit: 8d731904bdf3a6742fdea49e0070083e2cc8bf24
Parents: 260f52f
Author: William D C M SILVA <co...@apache.org>
Authored: Wed Jun 14 10:55:15 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Wed Jun 14 10:55:15 2017 -0300
----------------------------------------------------------------------
.../LanguageDetectorContextGenerator.java | 51 ++++++++++---------
.../langdetect/LanguageDetectorEventStream.java | 6 +--
.../langdetect/LanguageDetectorFactory.java | 14 ++++++
.../tools/langdetect/LanguageDetectorME.java | 4 +-
.../opennlp/tools/langdetect/DummyFactory.java | 53 ++++++++++++++++++++
.../LanguageDetectorContextGeneratorTest.java | 2 +-
.../langdetect/LanguageDetectorFactoryTest.java | 30 ++++++++---
.../langdetect/LanguageDetectorMETest.java | 1 +
8 files changed, 125 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
index d1f1f4b..a467521 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -24,48 +24,53 @@ import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
/**
- * Context generator for document categorizer
+ * A context generator for language detector.
*/
class LanguageDetectorContextGenerator {
- private final int minLength;
- private final int maxLength;
- private final CharSequenceNormalizer normalizer;
+ protected final int minLength;
+ protected final int maxLength;
+ protected final CharSequenceNormalizer normalizer;
- LanguageDetectorContextGenerator(int minLength, int maxLength) {
+ /**
+ * Creates a customizable {@link LanguageDetectorContextGenerator} that computes ngrams from text
+ * @param minLength min ngrams chars
+ * @param maxLength max ngrams chars
+ * @param normalizers zero or more normalizers to
+ * be applied to the text before extracting ngrams
+ */
+ public LanguageDetectorContextGenerator(int minLength, int maxLength,
+ CharSequenceNormalizer... normalizers) {
this.minLength = minLength;
this.maxLength = maxLength;
- this.normalizer = new AggregateCharSequenceNormalizer(
- EmojiCharSequenceNormalizer.getInstance(),
- UrlCharSequenceNormalizer.getInstance(),
- TwitterCharSequenceNormalizer.getInstance(),
- NumberCharSequenceNormalizer.getInstance(),
- ShrinkCharSequenceNormalizer.getInstance()
- );
+ this.normalizer = new AggregateCharSequenceNormalizer(normalizers);
}
/**
- * Initializes the current instance with min 1 length and max 3 length of ngrams.
+ * Generates the context for a document. The text is normalized with the configured normalizers first.
+ * Classes that extend {@link LanguageDetectorContextGenerator} should not override this method,
+ * but {@link #getContextNormalized(String)}.
+ * @param document document to extract context from
+ * @return the generated context
*/
- LanguageDetectorContextGenerator() {
- this(1, 3);
+ public final String[] getContext(String document) {
+ return getContextNormalized(this.normalizer.normalize(document).toString());
}
- public String[] getContext(String document) {
+ /**
+ * Extension point of the {@link LanguageDetectorContextGenerator}.
+ * @param document document to extract context from
+ * @return the generated context
+ */
+ protected String[] getContextNormalized(String document) {
Collection<String> context = new ArrayList<>();
NGramModel model = new NGramModel();
- String normalized = normalizer.normalize(document).toString();
- model.add(normalized, minLength, maxLength);
+ model.add(document, minLength, maxLength);
for (StringList tokenList : model) {
if (tokenList.size() > 0) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
index b556a4d..19e6d46 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -35,11 +35,11 @@ public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSam
*
* @param data {@link ObjectStream} of {@link LanguageSample}s
*/
- public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) {
+ public LanguageDetectorEventStream(ObjectStream<LanguageSample> data,
+ LanguageDetectorContextGenerator contextGenerator) {
super(data);
- mContextGenerator =
- new LanguageDetectorContextGenerator();
+ mContextGenerator = contextGenerator;
}
@Override
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
index 5cebbba..11357ec 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -20,10 +20,24 @@ package opennlp.tools.langdetect;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
public class LanguageDetectorFactory extends BaseToolFactory {
+ public LanguageDetectorContextGenerator getContextGenerator() {
+ return new LanguageDetectorContextGenerator(1, 3,
+ EmojiCharSequenceNormalizer.getInstance(),
+ UrlCharSequenceNormalizer.getInstance(),
+ TwitterCharSequenceNormalizer.getInstance(),
+ NumberCharSequenceNormalizer.getInstance(),
+ ShrinkCharSequenceNormalizer.getInstance());
+ }
+
public static LanguageDetectorFactory create(String subclassName)
throws InvalidFormatException {
if (subclassName == null) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
index 74a1cea..3af6afd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -45,7 +45,7 @@ public class LanguageDetectorME implements LanguageDetector {
*/
public LanguageDetectorME(LanguageDetectorModel model) {
this.model = model;
- this.mContextGenerator = new LanguageDetectorContextGenerator();
+ this.mContextGenerator = model.getFactory().getContextGenerator();
}
@Override
@@ -90,7 +90,7 @@ public class LanguageDetectorME implements LanguageDetector {
mlParams, manifestInfoEntries);
MaxentModel model = trainer.train(
- new LanguageDetectorEventStream(samples));
+ new LanguageDetectorEventStream(samples, factory.getContextGenerator()));
return new LanguageDetectorModel(model, manifestInfoEntries, factory);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
index cbe7d1a..f3c7dd8 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -18,6 +18,16 @@
package opennlp.tools.langdetect;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
public class DummyFactory extends LanguageDetectorFactory {
@@ -30,4 +40,47 @@ public class DummyFactory extends LanguageDetectorFactory {
super.init();
}
+ @Override
+ public LanguageDetectorContextGenerator getContextGenerator() {
+ return new DummyFactory.MyContectGenerator(1, 5,
+ new DummyFactory.UpperCaseNormalizer());
+ }
+
+ public class UpperCaseNormalizer implements CharSequenceNormalizer {
+ @Override
+ public CharSequence normalize(CharSequence text) {
+ return text.toString().toUpperCase();
+ }
+ }
+
+ public class MyContectGenerator extends LanguageDetectorContextGenerator {
+
+ public MyContectGenerator(int min, int max, CharSequenceNormalizer ... normalizers) {
+ super(min, max, normalizers);
+ }
+
+ @Override
+ public String[] getContextNormalized(String document) {
+ String[] superContext = super.getContextNormalized(document);
+
+ List<String> context = new ArrayList(Arrays.asList(superContext));
+
+ SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ String[] words = tokenizer.tokenize(document);
+ NGramModel tokenNgramModel = new NGramModel();
+ if(words.length > 0) {
+ tokenNgramModel.add(new StringList(words), 1, 3);
+ Iterator tokenNgramIterator = tokenNgramModel.iterator();
+
+ while(tokenNgramIterator.hasNext()) {
+ StringList tokenList = (StringList)tokenNgramIterator.next();
+ if(tokenList.size() > 0) {
+ context.add("tg=" + tokenList.toString());
+ }
+ }
+ }
+
+ return context.toArray(new String[context.size()]);
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
index c800688..dc6ca26 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -30,7 +30,7 @@ public class LanguageDetectorContextGeneratorTest {
public void extractContext() throws Exception {
String doc = "abcde fghijk";
- LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator();
+ LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3);
Collection<String> features = Arrays.asList(cg.getContext(doc));
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
index 2a6c0ce..781326b 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -20,9 +20,12 @@ package opennlp.tools.langdetect;
import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
import org.junit.Assert;
-import org.junit.Before;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.ResourceAsStreamFactory;
@@ -32,10 +35,10 @@ import opennlp.tools.util.TrainingParameters;
public class LanguageDetectorFactoryTest {
- private LanguageDetectorModel model;
+ private static LanguageDetectorModel model;
- @Before
- public void train() throws Exception {
+ @BeforeClass
+ public static void train() throws Exception {
ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
@@ -47,8 +50,9 @@ public class LanguageDetectorFactoryTest {
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "0");
+ params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
- this.model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+ model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
}
@Test
@@ -63,13 +67,25 @@ public class LanguageDetectorFactoryTest {
@Test
public void testDummyFactory() throws Exception {
- byte[] serialized = LanguageDetectorMETest.serializeModel(
- LanguageDetectorMETest.trainModel(new DummyFactory()));
+ byte[] serialized = LanguageDetectorMETest.serializeModel(model);
LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+ }
+
+ @Test
+ public void testDummyFactoryContextGenerator() throws Exception {
+ LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator();
+ String[] context = cg.getContext(
+ "a dummy text phrase to test if the context generator works!!!!!!!!!!!!");
+
+ Set<String> set = new HashSet(Arrays.asList(context));
+
+ Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated !
+ Assert.assertTrue(set.contains("a dum"));
+ Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]"));
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
index 8caca1d..beb7589 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -98,6 +98,7 @@ public class LanguageDetectorMETest {
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "2");
+ params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
return LanguageDetectorME.train(sampleStream, params, factory);
}