You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/19 13:13:17 UTC
opennlp git commit: Add support to train on leipzig
Repository: opennlp
Updated Branches:
refs/heads/LangDetect a189d4ecc -> eb6fb32d1
Add support to train on leipzig
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/eb6fb32d
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/eb6fb32d
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/eb6fb32d
Branch: refs/heads/LangDetect
Commit: eb6fb32d1dbb86f3417ae67a1f45daa0da39fa3a
Parents: a189d4e
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu May 18 17:03:52 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri May 19 11:02:44 2017 +0200
----------------------------------------------------------------------
.../tools/cmdline/StreamFactoryRegistry.java | 3 +
.../formats/LeipzigDoccatSampleStream.java | 5 +-
.../LeipzigDocumentSampleStreamFactory.java | 3 +
.../leipzig/LeipzigLanguageSampleStream.java | 133 +++++++++++++++++++
.../LeipzigLanguageSampleStreamFactory.java | 73 ++++++++++
.../LanguageDetectorContextGenerator.java | 21 ++-
.../tools/langdetect/LanguageSample.java | 13 +-
7 files changed, 227 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index d1e8c89..b258ab2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -53,6 +53,7 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory;
import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory;
import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -117,6 +118,8 @@ public final class StreamFactoryRegistry {
ConlluPOSSampleStreamFactory.registerFactory();
ConlluLemmaSampleStreamFactory.registerFactory();
+
+ LeipzigLanguageSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
* <p>
* The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
* by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDoccatSampleStream extends
FilterObjectStream<String, DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
/**
* <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDocumentSampleStreamFactory
extends AbstractSampleStreamFactory<DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..582fb08
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Reads {@link LanguageSample}s from a folder of Leipzig sentences files.
+ * <p>
+ * The first three characters of each file name are used as the language code. The
+ * requested number of samples per language is divided (integer division) by the number
+ * of files sharing that code, and each file contributes at most that many samples.
+ * Files are read in sorted order, one after another.
+ */
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+  /**
+   * Joins consecutive lines of one sentences file into samples of a single language.
+   * It does not use any state of the enclosing stream, hence it is a static nested class.
+   */
+  private static class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+    private final String lang;
+    private final int sentencesPerSample;
+    private final int numberOfSamples;
+    private final ObjectStream<String> lineStream;
+
+    private int sampleCount;
+
+    LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample,
+        int numberOfSamples) throws IOException {
+      this.lang = lang;
+      this.sentencesPerSample = sentencesPerSample;
+      this.numberOfSamples = numberOfSamples;
+
+      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+          StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Returns the next sample, or {@code null} once {@code numberOfSamples} samples were
+     * produced or the file has no more lines. The last sample of a file may contain fewer
+     * than {@code sentencesPerSample} lines.
+     */
+    @Override
+    public LanguageSample read() throws IOException {
+      if (sampleCount >= numberOfSamples) {
+        return null;
+      }
+
+      // TODO: Should the sample contain an array of sentence strings instead of one joined string?
+      StringBuilder sampleString = new StringBuilder();
+      String line;
+      for (int count = 0; count < sentencesPerSample && (line = lineStream.read()) != null; count++) {
+        sampleString.append(line).append(' ');
+      }
+
+      if (sampleString.length() == 0) {
+        return null;
+      }
+
+      sampleCount++;
+      return new LanguageSample(new Language(lang), sampleString);
+    }
+
+    /** Closes the underlying line stream of the sentences file. */
+    @Override
+    public void close() throws IOException {
+      lineStream.close();
+    }
+  }
+
+  private final int sentencesPerSample;
+  private final Map<String, Integer> langSampleCounts;
+  private final File[] sentencesFiles;
+
+  private Iterator<File> sentencesFilesIt;
+  private ObjectStream<LanguageSample> sampleStream;
+
+  /**
+   * Initializes the stream and positions it before the first sentences file.
+   *
+   * @param leipzigFolder the folder containing the Leipzig sentences files
+   * @param sentencesPerSample the maximum number of lines joined into one sample
+   * @param samplesPerLanguage the number of samples per language, shared evenly by all
+   *     files of that language
+   * @throws IOException if the files in {@code leipzigFolder} cannot be listed
+   */
+  public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+      final int samplesPerLanguage) throws IOException {
+    this.sentencesPerSample = sentencesPerSample;
+    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    sentencesFiles = leipzigFolder.listFiles();
+    if (sentencesFiles == null) {
+      // listFiles() returns null if the path is not a directory or an I/O error occurs;
+      // fail with a clear message instead of a NullPointerException in Arrays.sort.
+      throw new IOException("Unable to list sentences files in: " + leipzigFolder);
+    }
+    Arrays.sort(sentencesFiles);
+
+    // Number of sentences files per language code.
+    Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+        .map(file -> file.getName().substring(0, 3))
+        .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+    // Per-file sample budget for each language code.
+    langSampleCounts = langCounts.entrySet().stream()
+        .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+    reset();
+  }
+
+  /**
+   * Returns the next sample. When the current sentences file yields no further sample it
+   * is closed and reading continues with the next file.
+   *
+   * @return the next sample or {@code null} when all files are exhausted
+   */
+  @Override
+  public LanguageSample read() throws IOException {
+    while (true) {
+      if (sampleStream != null) {
+        LanguageSample sample = sampleStream.read();
+        if (sample != null) {
+          return sample;
+        }
+        // Release the file handle before moving on to the next file.
+        closeCurrentFile();
+      }
+
+      if (!sentencesFilesIt.hasNext()) {
+        return null;
+      }
+
+      File sentencesFile = sentencesFilesIt.next();
+      String lang = sentencesFile.getName().substring(0, 3);
+      sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+          sentencesPerSample, langSampleCounts.get(lang));
+    }
+  }
+
+  /** Closes the currently open sentences file and restarts from the first file. */
+  @Override
+  public void reset() throws IOException {
+    closeCurrentFile();
+    sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+  }
+
+  /** Closes the currently open sentences file, if any. */
+  @Override
+  public void close() throws IOException {
+    closeCurrentFile();
+  }
+
+  private void closeCurrentFile() throws IOException {
+    if (sampleStream != null) {
+      sampleStream.close();
+      sampleStream = null;
+    }
+  }
+
+  // Ad-hoc entry point: only constructs the stream over a hard-coded local folder.
+  public static void main(String[] args) throws Exception {
+    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+        10, 100000);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..96b0378
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link LeipzigLanguageSampleStream}s from command line arguments and registers
+ * itself for {@link LanguageSample}s under the format name "leipzig".
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  // NOTE(review): the encoding parameter inherited from EncodingParameter is not read by
+  // create(); LeipzigLanguageSampleStream always decodes the sentences files as UTF-8.
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipzig sentences to be used")
+    File getSentencesDir();
+
+    @ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    int getSentencesPerSample();
+
+    @ParameterDescription(valueName = "samplesPerLanguage",
+        description = "number of samples per language")
+    int getSamplesPerLanguage();
+  }
+
+  protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory for {@link LanguageSample}s under the format name "leipzig". */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+  }
+
+  /**
+   * Parses the arguments into {@link Parameters} and opens a sample stream over the
+   * configured sentences directory.
+   *
+   * @param args the command line arguments of the format
+   * @return the stream of language samples
+   * @throws TerminateToolException if an I/O error occurs while opening the sample data
+   */
+  @Override
+  public ObjectStream<LanguageSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      return new LeipzigLanguageSampleStream(params.getSentencesDir(),
+          params.getSentencesPerSample(), params.getSamplesPerLanguage());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
index c63ba76..dcfe0e9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -25,12 +25,6 @@ import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
/**
* Context generator for document categorizer
@@ -46,19 +40,20 @@ class LanguageDetectorContextGenerator {
this.maxLength = maxLength;
this.normalizer = new AggregateCharSequenceNormalizer(
- EmojiCharSequenceNormalizer.getInstance(),
- UrlCharSequenceNormalizer.getInstance(),
- TwitterCharSequenceNormalizer.getInstance(),
- NumberCharSequenceNormalizer.getInstance(),
- UnicodeCharSequenceNormalizer.getInstance(),
- ShrinkCharSequenceNormalizer.getInstance());
+ // EmojiCharSequenceNormalizer.getInstance(),
+ //UrlCharSequenceNormalizer.getInstance(),
+ //TwitterCharSequenceNormalizer.getInstance(),
+ //NumberCharSequenceNormalizer.getInstance(),
+ //UnicodeCharSequenceNormalizer.getInstance(),
+ //ShrinkCharSequenceNormalizer.getInstance());
+ );
}
/**
* Initializes the current instance with min 2 length and max 5 length of ngrams.
*/
LanguageDetectorContextGenerator() {
- this(2, 5);
+ this(3, 3);
}
public String[] getContext(String document) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
index 6f2fda7..f454864 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -28,10 +28,8 @@ public class LanguageSample {
private final CharSequence context;
public LanguageSample(Language language, CharSequence context) {
- Objects.requireNonNull(context, "context must not be null");
- Objects.requireNonNull(language, "language must not be null");
- this.language = language;
- this.context = context;
+ this.language = Objects.requireNonNull(language, "language must not be null");
+ this.context = Objects.requireNonNull(context, "context must not be null");
}
public Language getLanguage() {
@@ -44,12 +42,7 @@ public class LanguageSample {
@Override
public String toString() {
-
- StringBuilder sampleString = new StringBuilder();
-
- sampleString.append(language.getLang()).append('\t').append(context);
-
- return sampleString.toString();
+ return language.getLang() + '\t' + context;
}
@Override