You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:10:05 UTC
[21/21] opennlp git commit: OPENNLP-788: Add LanguageDetector tool
OPENNLP-788: Add LanguageDetector tool
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a9853284
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a9853284
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a9853284
Branch: refs/heads/LangDetect
Commit: a985328464c130bf516d19eace49a1b8e3095022
Parents: 15ac7bd
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 13:34:21 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue Jun 6 12:07:05 2017 +0200
----------------------------------------------------------------------
NOTICE | 7 +
.../main/java/opennlp/tools/cmdline/CLI.java | 12 +
.../cmdline/FineGrainedReportListener.java | 13 +-
.../tools/cmdline/StreamFactoryRegistry.java | 4 +
.../LanguageDetectorConverterTool.java | 28 ++
.../LanguageDetectorCrossValidatorTool.java | 123 ++++++++
...LanguageDetectorEvaluationErrorListener.java | 54 ++++
.../LanguageDetectorEvaluatorTool.java | 139 +++++++++
...nguageDetectorFineGrainedReportListener.java | 70 +++++
.../langdetect/LanguageDetectorModelLoader.java | 42 +++
.../langdetect/LanguageDetectorTool.java | 88 ++++++
.../langdetect/LanguageDetectorTrainerTool.java | 83 ++++++
.../cmdline/langdetect/TrainingParams.java | 40 +++
.../LanguageDetectorSampleStreamFactory.java | 66 +++++
.../formats/LeipzigDoccatSampleStream.java | 5 +-
.../LeipzigDocumentSampleStreamFactory.java | 3 +
.../leipzig/LeipzigLanguageSampleStream.java | 136 +++++++++
.../LeipzigLanguageSampleStreamFactory.java | 74 +++++
.../java/opennlp/tools/langdetect/Language.java | 73 +++++
.../tools/langdetect/LanguageDetector.java | 31 ++
.../LanguageDetectorContextGenerator.java | 80 +++++
.../LanguageDetectorCrossValidator.java | 107 +++++++
.../LanguageDetectorEvaluationMonitor.java | 28 ++
.../langdetect/LanguageDetectorEvaluator.java | 99 +++++++
.../langdetect/LanguageDetectorEventStream.java | 69 +++++
.../langdetect/LanguageDetectorFactory.java | 53 ++++
.../tools/langdetect/LanguageDetectorME.java | 97 ++++++
.../tools/langdetect/LanguageDetectorModel.java | 82 +++++
.../LanguageDetectorSampleStream.java | 55 ++++
.../tools/langdetect/LanguageSample.java | 68 +++++
.../AggregateCharSequenceNormalizer.java | 39 +++
.../util/normalizer/CharSequenceNormalizer.java | 23 ++
.../normalizer/EmojiCharSequenceNormalizer.java | 38 +++
.../NumberCharSequenceNormalizer.java | 36 +++
.../ShrinkCharSequenceNormalizer.java | 40 +++
.../TwitterCharSequenceNormalizer.java | 50 ++++
.../UnicodeCharSequenceNormalizer.java | 297 +++++++++++++++++++
.../normalizer/UrlCharSequenceNormalizer.java | 40 +++
.../normalizer/unicode_normalizer.properties | 154 ++++++++++
.../opennlp/tools/langdetect/DummyFactory.java | 33 +++
.../LanguageDetectorContextGeneratorTest.java | 43 +++
.../LanguageDetectorCrossValidatorTest.java | 64 ++++
.../LanguageDetectorEvaluatorTest.java | 68 +++++
.../langdetect/LanguageDetectorFactoryTest.java | 75 +++++
.../langdetect/LanguageDetectorMETest.java | 114 +++++++
.../tools/langdetect/LanguageSampleTest.java | 89 ++++++
.../opennlp/tools/langdetect/LanguageTest.java | 101 +++++++
.../EmojiCharSequenceNormalizerTest.java | 43 +++
.../NumberCharSequenceNormalizerTest.java | 32 ++
.../ShrinkCharSequenceNormalizerTest.java | 41 +++
.../TwitterCharSequenceNormalizerTest.java | 62 ++++
.../UnicodeCharSequenceNormalizerTest.java | 263 ++++++++++++++++
.../UrlCharSequenceNormalizerTest.java | 47 +++
53 files changed, 3618 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/NOTICE
----------------------------------------------------------------------
diff --git a/NOTICE b/NOTICE
index c0b8394..36d90e2 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,3 +10,10 @@ opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
http://snowball.tartarus.org/
+
+
+The Language Detector normalizer in
+opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer.java
+and its tests and resources were developed by Shuyo Nakatani.
+The full Language Detector package is available from
+https://github.com/shuyo/language-detection
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
index b575f71..c828e26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -37,6 +37,11 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
import opennlp.tools.cmdline.doccat.DoccatTool;
import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
import opennlp.tools.cmdline.entitylinker.EntityLinkerTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorConverterTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool;
import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool;
import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool;
import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool;
@@ -90,6 +95,13 @@ public final class CLI {
tools.add(new DoccatCrossValidatorTool());
tools.add(new DoccatConverterTool());
+ // Language Detector
+ tools.add(new LanguageDetectorTool());
+ tools.add(new LanguageDetectorTrainerTool());
+ tools.add(new LanguageDetectorConverterTool());
+ tools.add(new LanguageDetectorCrossValidatorTool());
+ tools.add(new LanguageDetectorEvaluatorTool());
+
// Dictionary Builder
tools.add(new DictionaryBuilderTool());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
index 714561a..75b84aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
@@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener {
}
}
- public void add(String[] text, String ref, String pred) {
- int length = text.length;
+ public void add(int length, String ref, String pred) {
+
averageSentenceLength.add(length);
if (minimalSentenceLength > length) {
@@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener {
updateTagFMeasure(refs, preds);
commit("", ref, pred);
+ }
+
+ public void add(String[] text, String ref, String pred) {
+ int length = text.length;
+ this.add(length, ref, pred);
+ }
+ public void add(CharSequence text, String ref, String pred) {
+ int length = text.length();
+ this.add(length, ref, pred);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 3d68945..48b8025 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
import opennlp.tools.formats.DocumentSampleStreamFactory;
import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
import opennlp.tools.formats.LemmatizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -56,6 +57,7 @@ import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -79,6 +81,7 @@ public final class StreamFactoryRegistry {
TokenSampleStreamFactory.registerFactory();
WordTagSampleStreamFactory.registerFactory();
LemmatizerSampleStreamFactory.registerFactory();
+ LanguageDetectorSampleStreamFactory.registerFactory();
NameToSentenceSampleStreamFactory.registerFactory();
NameToTokenSampleStreamFactory.registerFactory();
@@ -124,6 +127,7 @@ public final class StreamFactoryRegistry {
IrishSentenceBankSentenceStreamFactory.registerFactory();
IrishSentenceBankTokenSampleStreamFactory.registerFactory();
+ LeipzigLanguageSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
new file mode 100644
index 0000000..69d9db7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Command line converter tool for {@link LanguageSample} data.
+ * <p>
+ * The class only declares a constructor; everything else is inherited from
+ * {@link AbstractConverterTool}.
+ */
+public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> {
+
+ /**
+ * Creates the tool, passing {@link LanguageSample} as the sample type to the superclass.
+ */
+ public LanguageDetectorConverterTool() {
+ super(LanguageSample.class);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
new file mode 100644
index 0000000..bf68fbb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorCrossValidator;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Command line tool that runs a k-fold cross validation of the learnable
+ * Language Detector and prints the resulting accuracy and document count.
+ * <p>
+ * Misclassified samples can optionally be printed, and a fine-grained report
+ * can optionally be written to a file.
+ */
+public final class LanguageDetectorCrossValidatorTool extends
+    AbstractCrossValidatorTool<LanguageSample,
+        LanguageDetectorCrossValidatorTool.CVToolParams> {
+
+  /** Command line parameters accepted by this tool. */
+  interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorCrossValidatorTool() {
+    super(LanguageSample.class, CVToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Language Detector";
+  }
+
+  /**
+   * Runs the cross validation over the sample stream set up by the superclass,
+   * then prints the accuracy and the number of documents.
+   *
+   * @param format the format name, passed through to the superclass
+   * @param args the command line arguments, passed through to the superclass
+   * @throws TerminateToolException if the report file cannot be opened or the
+   *     training data cannot be read
+   */
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw createTerminationIOException(e);
+      }
+    }
+
+    // From here on the report stream may be open, so it is released in the
+    // finally block on the success path and on every failure path alike.
+    try {
+      LanguageDetectorEvaluationMonitor[] listenersArr = listeners
+          .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]);
+
+      LanguageDetectorCrossValidator validator;
+      try {
+        LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+        validator = new LanguageDetectorCrossValidator(mlParams,
+            factory, listenersArr);
+
+        validator.evaluate(sampleStream, params.getFolds());
+      } catch (IOException e) {
+        throw new TerminateToolException(-1,
+            "IO error while reading training data or indexing data: " + e.getMessage(), e);
+      } finally {
+        try {
+          sampleStream.close();
+        } catch (IOException ignored) {
+          // best effort: the samples were only read, nothing is lost if close fails
+        }
+      }
+
+      System.out.println("done");
+
+      if (reportListener != null) {
+        System.out.println("Writing fine-grained report to "
+            + params.getReportOutputFile().getAbsolutePath());
+        reportListener.writeReport();
+      }
+
+      System.out.println();
+
+      System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+          "Number of documents: " + validator.getDocumentCount());
+    } finally {
+      closeReportStream(reportOutputStream);
+    }
+  }
+
+  /**
+   * Closes the report stream if one was opened; a failure is reported on
+   * {@link System#err} because it may mean the report file is incomplete.
+   *
+   * @param reportOutputStream the stream to close, or {@code null} if no report was requested
+   */
+  private static void closeReportStream(OutputStream reportOutputStream) {
+    if (reportOutputStream == null) {
+      return;
+    }
+    try {
+      reportOutputStream.close();
+    } catch (IOException e) {
+      System.err.println("Failed to close the fine-grained report file: " + e.getMessage());
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
new file mode 100644
index 0000000..073ef31
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * An {@link EvaluationMonitor} for {@link LanguageSample}s that prints
+ * misclassified samples to an output stream, using the printing support
+ * inherited from {@link EvaluationErrorPrinter}.
+ */
+public class LanguageDetectorEvaluationErrorListener extends
+ EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor {
+
+ /**
+ * Creates a listener that will print to {@link System#err}.
+ */
+ public LanguageDetectorEvaluationErrorListener() {
+ super(System.err);
+ }
+
+ /**
+ * Creates a listener that will print to a given {@link OutputStream}.
+ *
+ * @param outputStream the stream handed to the superclass for printing
+ */
+ public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) {
+ super(outputStream);
+ }
+
+ /**
+ * Prints a misclassified sample by delegating to {@code printError}.
+ *
+ * @param reference the expected sample
+ * @param prediction the sample produced by the detector
+ */
+ @Override
+ public void missclassified(LanguageSample reference, LanguageSample prediction) {
+ printError(reference, prediction);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
new file mode 100644
index 0000000..fb929bf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractEvaluatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EvaluatorParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorEvaluator;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * Command line tool that evaluates a Language Detector model against
+ * reference samples and prints the evaluation result.
+ * <p>
+ * Misclassified samples can optionally be printed, and a fine-grained report
+ * can optionally be written to a file.
+ */
+public final class LanguageDetectorEvaluatorTool extends
+    AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> {
+
+  /** Command line parameters accepted by this tool. */
+  interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorEvaluatorTool() {
+    super(LanguageSample.class, EvalToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "Measures the performance of the Language Detector model with the reference data";
+  }
+
+  /**
+   * Loads the model, evaluates it on the sample stream set up by the
+   * superclass while measuring throughput, and prints the evaluator result.
+   *
+   * @param format the format name, passed through to the superclass
+   * @param args the command line arguments, passed through to the superclass
+   * @throws TerminateToolException if the report file cannot be created or the
+   *     test data cannot be read
+   */
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel());
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        // Pass the cause along so the original stack trace is not lost.
+        throw new TerminateToolException(-1,
+            "IO error while creating LanguageDetector fine-grained report file: "
+            + e.getMessage(), e);
+      }
+    }
+
+    // From here on the report stream may be open, so it is released in the
+    // finally block on the success path and on every failure path alike.
+    try {
+      LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+          new LanguageDetectorME(model),
+          listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]));
+
+      final PerformanceMonitor monitor = new PerformanceMonitor("doc");
+
+      ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() {
+
+        @Override
+        public LanguageSample read() throws IOException {
+          LanguageSample sample = sampleStream.read();
+          // Count only real samples; the null that marks the end of the
+          // stream is not a processed document.
+          if (sample != null) {
+            monitor.incrementCounter();
+          }
+          return sample;
+        }
+
+        @Override
+        public void reset() throws IOException {
+          sampleStream.reset();
+        }
+
+        @Override
+        public void close() throws IOException {
+          sampleStream.close();
+        }
+      };
+
+      monitor.startAndPrintThroughput();
+
+      try {
+        evaluator.evaluate(measuredSampleStream);
+      } catch (IOException e) {
+        System.err.println("failed");
+        throw new TerminateToolException(-1, "IO error while reading test data: "
+            + e.getMessage(), e);
+      } finally {
+        try {
+          measuredSampleStream.close();
+        } catch (IOException ignored) {
+          // best effort: the samples were only read, nothing is lost if close fails
+        }
+      }
+
+      monitor.stopAndPrintFinalResult();
+
+      System.out.println();
+
+      System.out.println(evaluator);
+
+      if (reportListener != null) {
+        System.out.println("Writing fine-grained report to "
+            + params.getReportOutputFile().getAbsolutePath());
+        reportListener.writeReport();
+      }
+    } finally {
+      closeReportStream(reportOutputStream);
+    }
+  }
+
+  /**
+   * Closes the report stream if one was opened; a failure is reported on
+   * {@link System#err} because it may mean the report file is incomplete.
+   *
+   * @param reportOutputStream the stream to close, or {@code null} if no report was requested
+   */
+  private static void closeReportStream(OutputStream reportOutputStream) {
+    if (reportOutputStream == null) {
+      return;
+    }
+    try {
+      reportOutputStream.close();
+    } catch (IOException e) {
+      System.err.println("Failed to close the fine-grained report file: " + e.getMessage());
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
new file mode 100644
index 0000000..70bf3eb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.FineGrainedReportListener;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Generates a detailed report for the Language Detector.
+ * <p>
+ * It is possible to use it from an API and access the statistics using the
+ * provided getters.
+ */
+public class LanguageDetectorFineGrainedReportListener
+ extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor {
+
+ /**
+ * Creates a listener that will print to {@link System#err}.
+ */
+ public LanguageDetectorFineGrainedReportListener() {
+ this(System.err);
+ }
+
+ /**
+ * Creates a listener that prints to a given {@link OutputStream}.
+ *
+ * @param outputStream the stream handed to the superclass for printing
+ */
+ public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) {
+ super(outputStream);
+ }
+
+ // methods inherited from EvaluationMonitor
+
+ /**
+ * Records a misclassified sample in the statistics.
+ *
+ * @param reference the expected sample
+ * @param prediction the sample produced by the detector
+ */
+ public void missclassified(LanguageSample reference, LanguageSample prediction) {
+ statsAdd(reference, prediction);
+ }
+
+ /**
+ * Records a correctly classified sample in the statistics.
+ *
+ * @param reference the expected sample
+ * @param prediction the sample produced by the detector
+ */
+ public void correctlyClassified(LanguageSample reference, LanguageSample prediction) {
+ statsAdd(reference, prediction);
+ }
+
+ /**
+ * Adds one evaluated sample to the statistics: the context of the reference
+ * sample, plus the language codes of the reference and of the prediction.
+ * Both outcomes are recorded the same way.
+ */
+ private void statsAdd(LanguageSample reference, LanguageSample prediction) {
+ getStats().add(reference.getContext(),
+ reference.getLanguage().getLang(), prediction.getLanguage().getLang());
+ }
+
+ /**
+ * Prints the report: the general statistics, then the tags error rank, then
+ * the general confusion table.
+ */
+ public void writeReport() {
+ printGeneralStatistics();
+ printTagsErrorRank();
+ printGeneralConfusionTable();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
new file mode 100644
index 0000000..c8700fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Loads a Language Detector Model for the command line tools.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> {
+
+ /**
+ * Creates the loader, passing the label "Language Detector" to the superclass.
+ */
+ public LanguageDetectorModelLoader() {
+ super("Language Detector");
+ }
+
+ /**
+ * Creates a {@link LanguageDetectorModel} from the given stream.
+ *
+ * @param modelIn the stream to read the model from
+ * @return the model constructed from {@code modelIn}
+ * @throws IOException if the {@link LanguageDetectorModel} constructor fails to read the stream
+ */
+ @Override
+ protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException {
+ return new LanguageDetectorModel(modelIn);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
new file mode 100644
index 0000000..6175fe3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Command line tool that reads documents from standard input, predicts the
+ * language of each one with a trained model, and prints the labelled sample
+ * to standard output.
+ */
+public class LanguageDetectorTool extends BasicCmdLineTool {
+
+  @Override
+  public String getShortDescription() {
+    return "learned language detector";
+  }
+
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+  }
+
+  /**
+   * Prints the usage when called without arguments; otherwise loads the model
+   * named by the first argument and labels every paragraph read from standard input.
+   *
+   * @param args the command line arguments; the first one is the model file
+   */
+  @Override
+  public void run(String[] args) {
+
+    // Without a model argument there is nothing to do except show the usage.
+    if (args.length == 0) {
+      System.out.println(getHelp());
+      return;
+    }
+
+    File modelFile = new File(args[0]);
+    LanguageDetectorModel model = new LanguageDetectorModelLoader().load(modelFile);
+    LanguageDetector detector = new LanguageDetectorME(model);
+
+    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
+    perfMon.start();
+
+    try {
+      // The stream is created inside the try block so that an IOException
+      // raised while opening it is handled like a read failure.
+      ObjectStream<String> documents = new ParagraphStream(new PlainTextByLineStream(
+          new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+
+      for (String text = documents.read(); text != null; text = documents.read()) {
+        Language predicted = detector.predictLanguage(text);
+        LanguageSample labelled = new LanguageSample(predicted, text);
+        System.out.println(labelled.toString());
+
+        perfMon.incrementCounter();
+      }
+    } catch (IOException e) {
+      CmdLineUtil.handleStdinIoError(e);
+    }
+
+    perfMon.stopAndPrintFinalResult();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
new file mode 100644
index 0000000..6735293
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Command line tool which trains a {@link LanguageDetectorModel} from
+ * {@link LanguageSample}s and writes the result to the requested model file.
+ */
+public class LanguageDetectorTrainerTool
+    extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> {
+
+  /**
+   * Parameters of the trainer tool: the training parameters plus the output model file.
+   */
+  interface TrainerToolParams extends TrainingParams {
+    @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.")
+    File getModel();
+
+    @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+    @ArgumentParser.OptionalParameter()
+    String getParams();
+  }
+
+  public LanguageDetectorTrainerTool() {
+    super(LanguageSample.class, TrainerToolParams.class);
+  }
+
+  @Override
+  public String getShortDescription() {
+    return "trainer for the learnable language detector";
+  }
+
+  /**
+   * Trains the model on the sample stream prepared by the super class and
+   * writes it to the model file given on the command line.
+   *
+   * @param format the name of the training data format
+   * @param args the command line arguments
+   */
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    // fall back to the default training parameters when no params file was loaded
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (null == mlParams) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    final File outputFile = params.getModel();
+    CmdLineUtil.checkOutputFile("language detector model", outputFile);
+
+    final LanguageDetectorModel trainedModel;
+    try {
+      trainedModel = LanguageDetectorME.train(sampleStream, mlParams,
+          LanguageDetectorFactory.create(params.getFactory()));
+    } catch (IOException ioe) {
+      throw createTerminationIOException(ioe);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException ignored) {
+        // sorry that this can fail
+      }
+    }
+
+    CmdLineUtil.writeModel("language detector", outputFile, trainedModel);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
new file mode 100644
index 0000000..2937c3d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * Command line parameters used for training a language detector.
+ *
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+interface TrainingParams {
+
+  /**
+   * @return the value of the optional training parameters file argument
+   */
+  @OptionalParameter
+  @ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+  String getParams();
+
+  /**
+   * @return the value of the optional factory name argument
+   */
+  @OptionalParameter
+  @ParameterDescription(valueName = "factoryName",
+      description = "A sub-class of LanguageDetectorFactory where to get implementation and resources.")
+  String getFactory();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
new file mode 100644
index 0000000..ef60063
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.langdetect.LanguageDetectorSampleStream;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory producing {@link LanguageDetectorSampleStream}s which read
+ * {@link LanguageSample}s line by line from a plain text data file.
+ */
+public class LanguageDetectorSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /**
+   * Registers this factory for {@link LanguageSample}s under the default format name.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        StreamFactoryRegistry.DEFAULT_FORMAT,
+        new LanguageDetectorSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Creates the sample stream for the data file named in the arguments.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   * @return a stream of {@link LanguageSample}s backed by the data file
+   */
+  public ObjectStream<LanguageSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+    } catch (IOException ex) {
+      // NOTE(review): this presumably terminates the tool by throwing; if it can
+      // return normally a null line stream is wrapped below - confirm.
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
* <p>
* The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
* by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDoccatSampleStream extends
FilterObjectStream<String, DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
/**
* <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDocumentSampleStreamFactory
extends AbstractSampleStreamFactory<DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..6c4d009
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+ private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+ private final String lang;
+ private int sentencesPerSample;
+ private int numberOfSamples;
+
+ private ObjectStream<String> lineStream;
+ private int sampleCount;
+
+ LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+ throws IOException {
+ this.lang = sentencesFile.getName().substring(0, 3);
+ this.sentencesPerSample = sentencesPerSample;
+ this.numberOfSamples = numberOfSamples;
+
+ lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+ StandardCharsets.UTF_8);
+ }
+
+ @Override
+ public LanguageSample read() throws IOException {
+
+ if (sampleCount < numberOfSamples) {
+ StringBuilder sampleString = new StringBuilder();
+
+ int count = 0;
+ String line;
+ while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+ int textStart = line.indexOf('\t') + 1;
+
+ // TODO: It should it be changed to contain an array of sample strings ?!
+ sampleString.append(line.substring(textStart) + " ");
+
+ count++;
+ }
+
+ if (sampleString.length() > 0) {
+ sampleCount++;
+ return new LanguageSample(new Language(lang), sampleString);
+ }
+ }
+ return null;
+ }
+ }
+
+ private final int sentencesPerSample;
+
+ private Map<String, Integer> langSampleCounts;
+ private File[] sentencesFiles;
+
+ private Iterator<File> sentencesFilesIt;
+ private ObjectStream<LanguageSample> sampleStream;
+
+ public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+ final int samplesPerLanguage) throws IOException {
+ this.sentencesPerSample = sentencesPerSample;
+ // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+ sentencesFiles = leipzigFolder.listFiles();
+ Arrays.sort(sentencesFiles);
+
+ Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+ .map(file -> file.getName().substring(0, 3))
+ .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+ langSampleCounts = langCounts.entrySet().stream()
+ .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+ reset();
+ }
+
+ public LanguageSample read() throws IOException {
+ LanguageSample sample;
+ if (sampleStream != null && (sample = sampleStream.read()) != null) {
+ return sample;
+ }
+ else {
+ if (sentencesFilesIt.hasNext()) {
+ File sentencesFile = sentencesFilesIt.next();
+ System.out.println(sentencesFile);
+ String lang = sentencesFile.getName().substring(0, 3);
+
+ sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+ sentencesPerSample, langSampleCounts.get(lang));
+
+ return read();
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+ sampleStream = null;
+ }
+
+ public static void main(String[] args) throws Exception {
+ new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+ 10, 100000);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..59a7551
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+ extends AbstractSampleStreamFactory<LanguageSample> {
+
+ interface Parameters extends EncodingParameter {
+ @ParameterDescription(valueName = "sentencesDir",
+ description = "dir with Leipig sentences to be used")
+ File getSentencesDir();
+
+ @ParameterDescription(valueName = "sentencesPerSample",
+ description = "number of sentences per sample")
+ String getSentencesPerSample();
+
+ @ParameterDescription(valueName = "samplesPerLanguage",
+ description = "number of samples per language")
+ String getSamplesPerLanguage();
+ }
+
+ protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(LanguageSample.class,
+ "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+ }
+
+ public ObjectStream<LanguageSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+ File sentencesFileDir = params.getSentencesDir();
+
+ try {
+ return new LeipzigLanguageSampleStream(sentencesFileDir,
+ Integer.parseInt(params.getSentencesPerSample()),
+ Integer.parseInt(params.getSamplesPerLanguage()));
+ } catch (IOException e) {
+ throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..f780759
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class for holding the document language and its confidence.
+ * <p>
+ * Two instances are equal when they hold the same language; the confidence
+ * is ignored by both {@link #equals(Object)} and {@link #hashCode()}.
+ */
+public class Language {
+  private final String lang;
+  private final double confidence;
+
+  /**
+   * Creates a language with a confidence of 0.
+   *
+   * @param lang the language, must not be {@code null}
+   */
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
+  /**
+   * @param lang the language, must not be {@code null}
+   * @param confidence the confidence assigned to the language
+   *
+   * @throws NullPointerException if {@code lang} is {@code null}
+   */
+  public Language(String lang, double confidence) {
+    this.lang = Objects.requireNonNull(lang, "lang must not be null");
+    this.confidence = confidence;
+  }
+
+  public String getLang() {
+    return lang;
+  }
+
+  public double getConfidence() {
+    return confidence;
+  }
+
+  /**
+   * @return the language followed by the confidence in parentheses, e.g. {@code eng (0.9)}
+   */
+  @Override
+  public String toString() {
+    return getLang() + " (" + this.confidence + ")";
+  }
+
+  @Override
+  public int hashCode() {
+    // Only the language takes part in equals(), so only the language may be
+    // hashed: equal objects have to produce equal hash codes.
+    return Objects.hash(getLang());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (!(obj instanceof Language)) {
+      return false;
+    }
+
+    Language other = (Language) obj;
+    return getLang().equals(other.getLang());
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..0004494
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * The interface for LanguageDetector which provides the {@link Language} according to the context.
+ */
+public interface LanguageDetector {
+
+ /**
+ * Predicts the candidate languages of the given content.
+ *
+ * @param content the text to analyze
+ * @return the predicted languages; NOTE(review): the ordering of the array
+ * (presumably by confidence) is defined by the implementation - confirm
+ */
+ Language[] predictLanguages(CharSequence content);
+
+ /**
+ * Predicts a single language for the given content.
+ *
+ * @param content the text to analyze
+ * @return the predicted language
+ */
+ Language predictLanguage(CharSequence content);
+
+ /**
+ * @return the languages supported by this detector; presumably the language
+ * codes known to the underlying model - verify against the implementation
+ */
+ String[] getSupportedLanguages();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..b28c601
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+/**
+ * Context generator for the language detector.
+ * <p>
+ * The context of a document consists of the lower-cased n-grams which the
+ * {@link NGramModel} produces for the normalized document text.
+ */
+class LanguageDetectorContextGenerator {
+
+  private final int minLength;
+  private final int maxLength;
+  private final CharSequenceNormalizer normalizer;
+
+  /**
+   * Initializes the current instance with the given n-gram length range.
+   *
+   * @param minLength the minimum n-gram length
+   * @param maxLength the maximum n-gram length
+   */
+  LanguageDetectorContextGenerator(int minLength, int maxLength) {
+    this.minLength = minLength;
+    this.maxLength = maxLength;
+
+    // the normalizers are applied to the document before the n-grams are extracted
+    this.normalizer = new AggregateCharSequenceNormalizer(
+        EmojiCharSequenceNormalizer.getInstance(),
+        UrlCharSequenceNormalizer.getInstance(),
+        TwitterCharSequenceNormalizer.getInstance(),
+        NumberCharSequenceNormalizer.getInstance(),
+        UnicodeCharSequenceNormalizer.getInstance(),
+        ShrinkCharSequenceNormalizer.getInstance()
+    );
+  }
+
+  /**
+   * Initializes the current instance with min 2 length and max 3 length of ngrams.
+   */
+  LanguageDetectorContextGenerator() {
+    this(2, 3);
+  }
+
+  /**
+   * Generates the context of the given document.
+   *
+   * @param document the text to generate the context for
+   * @return the lower-cased first token of every non-empty n-gram of the
+   *     normalized document
+   */
+  public String[] getContext(String document) {
+
+    Collection<String> context = new LinkedList<>();
+
+    NGramModel model = new NGramModel();
+    String normalized = normalizer.normalize(document).toString();
+    model.add(normalized, minLength, maxLength);
+
+    for (StringList tokenList : model) {
+      if (tokenList.size() > 0) {
+        context.add(StringUtil.toLowerCase(tokenList.getToken(0)));
+      }
+    }
+    return context.toArray(new String[context.size()]);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
new file mode 100644
index 0000000..ce1823a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Cross validator for language detector
+ */
+public class LanguageDetectorCrossValidator {
+
+  private final TrainingParameters params;
+
+  private final Mean documentAccuracy = new Mean();
+
+  private final LanguageDetectorEvaluationMonitor[] listeners;
+
+  private final LanguageDetectorFactory factory;
+
+
+  /**
+   * Creates a {@link LanguageDetectorCrossValidator} with the given
+   * training parameters, {@link LanguageDetectorFactory} and listeners.
+   *
+   * @param mlParams the parameters used to train the model of every fold
+   * @param factory the factory passed to the training of every fold
+   * @param listeners the monitors passed on to the evaluator of every fold
+   */
+  public LanguageDetectorCrossValidator(TrainingParameters mlParams,
+                                        LanguageDetectorFactory factory,
+                                        LanguageDetectorEvaluationMonitor ... listeners) {
+    this.params = mlParams;
+    this.listeners = listeners;
+    this.factory = factory;
+  }
+
+  /**
+   * Starts the evaluation.
+   *
+   * @param samples
+   *          the data to train and test
+   * @param nFolds
+   *          number of folds
+   *
+   * @throws IOException if reading the samples or training a model fails
+   */
+  public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
+      throws IOException {
+
+    CrossValidationPartitioner<LanguageSample> partitioner =
+        new CrossValidationPartitioner<>(samples, nFolds);
+
+    while (partitioner.hasNext()) {
+
+      CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream =
+          partitioner.next();
+
+      LanguageDetectorModel model = LanguageDetectorME.train(
+          trainingSampleStream, params, factory);
+
+      LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+          new LanguageDetectorME(model), listeners);
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      // add the accuracy of the fold together with its document count
+      documentAccuracy.add(evaluator.getAccuracy(),
+          evaluator.getDocumentCount());
+
+    }
+  }
+
+  /**
+   * Retrieves the accuracy for all iterations.
+   *
+   * @return the document accuracy
+   */
+  public double getDocumentAccuracy() {
+    return documentAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the number of documents which were validated over all iterations.
+   *
+   * @return the document count
+   */
+  public long getDocumentCount() {
+    return documentAccuracy.count();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
new file mode 100644
index 0000000..30f3313
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * {@link EvaluationMonitor} for Language Detector.
+ * <p>
+ * Fixes the sample type of the monitor to {@link LanguageSample};
+ * it declares no members of its own.
+ */
+public interface LanguageDetectorEvaluationMonitor extends
+ EvaluationMonitor<LanguageSample> {
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
new file mode 100644
index 0000000..bbf73c3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.doccat.DocumentCategorizer;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LanguageDetectorEvaluator} measures the performance of
+ * the given {@link LanguageDetector} with the provided reference
+ * {@link LanguageSample}s.
+ *
+ * @see LanguageDetector
+ * @see LanguageSample
+ */
+public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
+
+ private LanguageDetector languageDetector;
+
+ private Mean accuracy = new Mean();
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param langDetect the language detector instance
+ */
+ public LanguageDetectorEvaluator(LanguageDetector langDetect,
+ LanguageDetectorEvaluationMonitor ... listeners) {
+ super(listeners);
+ this.languageDetector = langDetect;
+ }
+
+ /**
+ * Evaluates the given reference {@link LanguageSample} object.
+ *
+ * This is done by categorizing the document from the provided
+ * {@link LanguageSample}. The detected language is then used
+ * to calculate and update the score.
+ *
+ * @param sample the reference {@link LanguageSample}.
+ */
+ public LanguageSample processSample(LanguageSample sample) {
+
+ CharSequence document = sample.getContext();
+
+ Language predicted = languageDetector.predictLanguage(document);
+
+
+
+ if (sample.getLanguage().getLang().equals(predicted.getLang())) {
+ accuracy.add(1);
+ }
+ else {
+ accuracy.add(0);
+ }
+
+ return new LanguageSample(predicted, sample.getContext());
+ }
+
+ /**
+ * Retrieves the accuracy of provided {@link DocumentCategorizer}.
+ *
+ * accuracy = correctly categorized documents / total documents
+ *
+ * @return the accuracy
+ */
+ public double getAccuracy() {
+ return accuracy.mean();
+ }
+
+ public long getDocumentCount() {
+ return accuracy.count();
+ }
+
+ /**
+ * Represents this objects as human readable {@link String}.
+ */
+ @Override
+ public String toString() {
+ return "Accuracy: " + accuracy.mean() + "\n" +
+ "Number of documents: " + accuracy.count();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..b556a4d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Collections;
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
+
+ private LanguageDetectorContextGenerator mContextGenerator;
+
+ /**
+ * Initializes the current instance via samples and feature generators.
+ *
+ * @param data {@link ObjectStream} of {@link LanguageSample}s
+ */
+ public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) {
+ super(data);
+
+ mContextGenerator =
+ new LanguageDetectorContextGenerator();
+ }
+
+ @Override
+ protected Iterator<Event> createEvents(final LanguageSample sample) {
+
+ return new Iterator<Event>() {
+
+ private boolean isVirgin = true;
+
+ public boolean hasNext() {
+ return isVirgin;
+ }
+
+ public Event next() {
+
+ isVirgin = false;
+
+ return new Event(sample.getLanguage().getLang(),
+ mContextGenerator.getContext(sample.getContext().toString()));
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..5cebbba
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+/**
+ * Tool factory of the language detector. The default implementation has
+ * nothing to initialize and nothing to validate.
+ */
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  /**
+   * Obtains a factory instance.
+   *
+   * @param subclassName the name of the factory class to load via the
+   *                     {@link ExtensionLoader}, or {@code null} to get the
+   *                     default {@link LanguageDetectorFactory}
+   *
+   * @return the factory; a loaded one had {@link #init()} called already
+   *
+   * @throws InvalidFormatException if loading or initializing the named
+   *                                factory fails for any reason
+   */
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName != null) {
+      try {
+        LanguageDetectorFactory customFactory = ExtensionLoader.instantiateExtension(
+            LanguageDetectorFactory.class, subclassName);
+        customFactory.init();
+        return customFactory;
+      } catch (Exception e) {
+        // every failure is reported as an invalid format, the cause is kept
+        throw new InvalidFormatException("Could not instantiate the " + subclassName
+            + ". The initialization throw an exception.", e);
+      }
+    }
+
+    // no subclass requested: hand out the default factory
+    return new LanguageDetectorFactory();
+  }
+
+  /**
+   * Initialization hook invoked by {@link #create(String)} on a loaded
+   * factory. Does nothing in this class.
+   */
+  public void init() {
+    // nothing to do
+  }
+
+  /**
+   * Accepts every artifact map; this factory has no artifacts to check.
+   */
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..74a1cea
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * {@link LanguageDetector} that scores languages with the {@link MaxentModel}
+ * stored in a {@link LanguageDetectorModel}.
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+  private final LanguageDetectorModel model;
+  private final LanguageDetectorContextGenerator contextGenerator;
+
+  /**
+   * Creates a detector for the given model. The context of a text is
+   * produced by a newly created {@link LanguageDetectorContextGenerator}.
+   *
+   * @param model the language detector model
+   */
+  public LanguageDetectorME(LanguageDetectorModel model) {
+    this.model = model;
+    this.contextGenerator = new LanguageDetectorContextGenerator();
+  }
+
+  /**
+   * Scores every outcome of the model for the given text.
+   *
+   * @param content the text to analyze
+   *
+   * @return one {@link Language} per model outcome, ordered from the
+   *         highest to the lowest confidence
+   */
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    MaxentModel maxent = model.getMaxentModel();
+    double[] scores = maxent.eval(contextGenerator.getContext(content.toString()));
+
+    Language[] ranked = new Language[scores.length];
+    int outcome = 0;
+    for (double score : scores) {
+      ranked[outcome] = new Language(maxent.getOutcome(outcome), score);
+      outcome++;
+    }
+
+    // descending by confidence: note the swapped arguments
+    Arrays.sort(ranked,
+        (left, right) -> Double.compare(right.getConfidence(), left.getConfidence()));
+    return ranked;
+  }
+
+  /**
+   * Returns the best scored language for the given text, which is the first
+   * element of {@link #predictLanguages(CharSequence)}.
+   *
+   * @param content the text to analyze
+   *
+   * @return the {@link Language} with the highest confidence
+   */
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    Language[] ranked = predictLanguages(content);
+    return ranked[0];
+  }
+
+  /**
+   * Lists the outcomes of the model.
+   *
+   * @return the outcome names in the order the model reports them
+   */
+  @Override
+  public String[] getSupportedLanguages() {
+    MaxentModel maxent = model.getMaxentModel();
+
+    String[] languages = new String[maxent.getNumOutcomes()];
+    for (int i = 0; i < languages.length; i++) {
+      languages[i] = maxent.getOutcome(i);
+    }
+    return languages;
+  }
+
+  /**
+   * Trains a new {@link LanguageDetectorModel}.
+   * <p>
+   * Note that the passed parameters object is modified: if it carries no
+   * value for {@link AbstractEventTrainer#DATA_INDEXER_PARAM} yet,
+   * {@link AbstractEventTrainer#DATA_INDEXER_ONE_PASS_VALUE} is put into it.
+   *
+   * @param samples the training samples
+   * @param mlParams the training parameters
+   * @param factory the factory to store in the resulting model
+   *
+   * @return the trained model
+   *
+   * @throws IOException declared for failures while reading the samples
+   *                     during training
+   */
+  public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
+                                            TrainingParameters mlParams,
+                                            LanguageDetectorFactory factory)
+      throws IOException {
+
+    mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
+        AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
+
+    Map<String, String> manifestInfoEntries = new HashMap<>();
+    EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
+    MaxentModel trainedModel = trainer.train(new LanguageDetectorEventStream(samples));
+
+    return new LanguageDetectorModel(trainedModel, manifestInfoEntries, factory);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * Model of the language detector. It keeps the trained {@link MaxentModel}
+ * as an artifact named {@code langdetect.model}.
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  /**
+   * Creates a model from a trained {@link MaxentModel} and validates the
+   * resulting artifact map.
+   *
+   * @param langdetectModel the trained model to store
+   * @param manifestInfoEntries additional entries for the model manifest
+   * @param factory the factory of the language detector
+   */
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    // NOTE(review): "und" is presumably the "undetermined" language code,
+    // used because the model is not bound to one language - confirm
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  /**
+   * Loads a model from a stream.
+   *
+   * @param in the stream to read the model from
+   *
+   * @throws IOException if the super class fails to load the model
+   */
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  /**
+   * Loads a model from a file.
+   *
+   * @param modelFile the file to read the model from
+   *
+   * @throws IOException if the super class fails to load the model
+   */
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  /**
+   * Loads a model from a URL.
+   *
+   * @param modelURL the location to read the model from
+   *
+   * @throws IOException if the super class fails to load the model
+   */
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  /**
+   * Runs the checks of the super class and then requires the
+   * {@code langdetect.model} artifact to be an {@link AbstractModel}.
+   *
+   * @throws InvalidFormatException if the artifact is missing or of another type
+   */
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    Object modelArtifact = artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+    if (modelArtifact instanceof AbstractModel) {
+      return;
+    }
+
+    throw new InvalidFormatException("Language detector model is incomplete!");
+  }
+
+  /**
+   * @return the tool factory of this model, cast to {@link LanguageDetectorFactory}
+   */
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  /**
+   * @return {@link LanguageDetectorFactory} as the default factory class
+   */
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  /**
+   * @return the artifact stored as {@code langdetect.model}
+   */
+  public MaxentModel getMaxentModel() {
+    Object modelArtifact = artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+    return (MaxentModel) modelArtifact;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..2a407f7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Parses lines of text into {@link LanguageSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line holds one sample.<br>
+ * The line starts with the language, followed by a tab and the text of the sample.<br>
+ * Sample line: language-string tab-char text line-break-char(s)<br>
+ * Lines without a tab, or with nothing in front of the first tab, are skipped.
+ */
+public class LanguageDetectorSampleStream
+    extends FilterObjectStream<String, LanguageSample> {
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param samples the stream providing the lines to parse
+   */
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads lines from the underlying stream until one can be parsed.
+   *
+   * @return the next {@link LanguageSample}, or {@code null} if the
+   *         underlying stream has no more lines
+   *
+   * @throws IOException if reading from the underlying stream fails
+   */
+  public LanguageSample read() throws IOException {
+    for (String line = samples.read(); line != null; line = samples.read()) {
+      int separator = line.indexOf('\t');
+      if (separator <= 0) {
+        // not a usable sample line, try the next one
+        continue;
+      }
+
+      String lang = line.substring(0, separator);
+      String context = line.substring(separator + 1);
+      return new LanguageSample(new Language(lang), context);
+    }
+
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
new file mode 100644
index 0000000..f454864
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ * <p>
+ * Two samples are equal if their languages are equal and their documents
+ * consist of the same characters.
+ */
+public class LanguageSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param language the language of the document
+   * @param context the text of the document
+   *
+   * @throws NullPointerException if one of the arguments is {@code null}
+   */
+  public LanguageSample(Language language, CharSequence context) {
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
+  }
+
+  /**
+   * @return the language of the document
+   */
+  public Language getLanguage() {
+    return language;
+  }
+
+  /**
+   * @return the text of the document
+   */
+  public CharSequence getContext() {
+    return context;
+  }
+
+  /**
+   * @return the language code, a tab and the text of the document
+   */
+  @Override
+  public String toString() {
+    return language.getLang() + '\t' + context;
+  }
+
+  @Override
+  public int hashCode() {
+    // hash the characters, in line with equals; for a String context this
+    // is the same value as hashing the context object itself
+    return Objects.hash(getContext().toString(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (!(obj instanceof LanguageSample)) {
+      return false;
+    }
+
+    LanguageSample other = (LanguageSample) obj;
+
+    // CharSequence defines no equals contract across implementations,
+    // hence the documents are compared by their characters
+    return getLanguage().equals(other.getLanguage())
+        && getContext().toString().equals(other.getContext().toString());
+  }
+}