You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/19 13:13:17 UTC
opennlp git commit: Add support to train on leipzig

Repository: opennlp
Updated Branches:
  refs/heads/LangDetect a189d4ecc -> eb6fb32d1


Add support to train on leipzig


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/eb6fb32d
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/eb6fb32d
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/eb6fb32d

Branch: refs/heads/LangDetect
Commit: eb6fb32d1dbb86f3417ae67a1f45daa0da39fa3a
Parents: a189d4e
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu May 18 17:03:52 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri May 19 11:02:44 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |   3 +
 .../formats/LeipzigDoccatSampleStream.java      |   5 +-
 .../LeipzigDocumentSampleStreamFactory.java     |   3 +
 .../leipzig/LeipzigLanguageSampleStream.java    | 133 +++++++++++++++++++
 .../LeipzigLanguageSampleStreamFactory.java     |  73 ++++++++++
 .../LanguageDetectorContextGenerator.java       |  21 ++-
 .../tools/langdetect/LanguageSample.java        |  13 +-
 7 files changed, 227 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index d1e8c89..b258ab2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -53,6 +53,7 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -117,6 +118,8 @@ public final class StreamFactoryRegistry {
 
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
+
+    LeipzigLanguageSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
  * <p>
  * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
  * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDoccatSampleStream extends
     FilterObjectStream<String, DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
 
 /**
  * <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDocumentSampleStreamFactory
     extends AbstractSampleStreamFactory<DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..582fb08
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+  private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+    private final String lang;
+    private int sentencesPerSample;
+    private int numberOfSamples;
+
+    private ObjectStream<String> lineStream;
+    private int sampleCount;
+
+    LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+        throws IOException {
+      this.lang = sentencesFile.getName().substring(0, 3);
+      this.sentencesPerSample = sentencesPerSample;
+      this.numberOfSamples = numberOfSamples;
+
+      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+          StandardCharsets.UTF_8);
+    }
+
+    @Override
+    public LanguageSample read() throws IOException {
+      if (sampleCount < numberOfSamples) {
+        StringBuilder sampleString = new StringBuilder();
+
+        int count = 0;
+        String line;
+        while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+          // TODO: Should the sample be changed to contain an array of sample strings?
+          sampleString.append(line + " ");
+
+          count++;
+        }
+
+        if (sampleString.length() > 0) {
+          sampleCount++;
+          return new LanguageSample(new Language(lang), sampleString);
+        }
+      }
+      return null;
+    }
+  }
+
+  private final int sentencesPerSample;
+
+  private Map<String, Integer> langSampleCounts;
+  private File[] sentencesFiles;
+
+  private Iterator<File> sentencesFilesIt;
+  private ObjectStream<LanguageSample> sampleStream;
+
+  public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+                                     final int samplesPerLanguage) throws IOException {
+    this.sentencesPerSample = sentencesPerSample;
+    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    sentencesFiles = leipzigFolder.listFiles();
+    Arrays.sort(sentencesFiles);
+
+    Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+        .map(file -> file.getName().substring(0, 3))
+        .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+    langSampleCounts = langCounts.entrySet().stream()
+        .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+    reset();
+  }
+
+  public LanguageSample read() throws IOException {
+    LanguageSample sample;
+    if (sampleStream != null && (sample = sampleStream.read()) != null) {
+      return sample;
+    }
+    else {
+      if (sentencesFilesIt.hasNext()) {
+        File sentencesFile = sentencesFilesIt.next();
+        System.out.println(sentencesFile);
+        String lang = sentencesFile.getName().substring(0, 3);
+
+        sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+            sentencesPerSample, langSampleCounts.get(lang));
+
+        return read();
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+    sampleStream = null;
+  }
+
+  public static void main(String[] args) throws Exception {
+    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+        10, 100000);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..96b0378
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipzig sentences to be used")
+    File getSentencesDir();
+
+    @ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    int getSentencesPerSample();
+
+    @ParameterDescription(valueName = "samplesPerLanguage",
+        description = "number of samples per language")
+    int getSamplesPerLanguage();
+  }
+
+  protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File sentencesFileDir = params.getSentencesDir();
+
+    try {
+      return new LeipzigLanguageSampleStream(sentencesFileDir, params.getSentencesPerSample(),
+          params.getSamplesPerLanguage());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
index c63ba76..dcfe0e9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -25,12 +25,6 @@ import opennlp.tools.util.StringList;
 import opennlp.tools.util.StringUtil;
 import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
 import opennlp.tools.util.normalizer.CharSequenceNormalizer;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
 
 /**
  * Context generator for document categorizer
@@ -46,19 +40,20 @@ class LanguageDetectorContextGenerator {
     this.maxLength = maxLength;
 
     this.normalizer = new AggregateCharSequenceNormalizer(
-        EmojiCharSequenceNormalizer.getInstance(),
-        UrlCharSequenceNormalizer.getInstance(),
-        TwitterCharSequenceNormalizer.getInstance(),
-        NumberCharSequenceNormalizer.getInstance(),
-        UnicodeCharSequenceNormalizer.getInstance(),
-        ShrinkCharSequenceNormalizer.getInstance());
+        // EmojiCharSequenceNormalizer.getInstance(),
+        //UrlCharSequenceNormalizer.getInstance(),
+        //TwitterCharSequenceNormalizer.getInstance(),
+        //NumberCharSequenceNormalizer.getInstance(),
+        //UnicodeCharSequenceNormalizer.getInstance(),
+        //ShrinkCharSequenceNormalizer.getInstance());
+    );
   }
 
   /**
    * Initializes the current instance with min 2 length and max 5 length of ngrams.
    */
   LanguageDetectorContextGenerator() {
-    this(2, 5);
+    this(3, 3);
   }
 
   public String[] getContext(String document) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
index 6f2fda7..f454864 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -28,10 +28,8 @@ public class LanguageSample {
   private final CharSequence context;
 
   public LanguageSample(Language language, CharSequence context) {
-    Objects.requireNonNull(context, "context must not be null");
-    Objects.requireNonNull(language, "language must not be null");
-    this.language = language;
-    this.context = context;
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
   }
 
   public Language getLanguage() {
@@ -44,12 +42,7 @@ public class LanguageSample {
 
   @Override
   public String toString() {
-
-    StringBuilder sampleString = new StringBuilder();
-
-    sampleString.append(language.getLang()).append('\t').append(context);
-
-    return sampleString.toString();
+    return language.getLang() + '\t' +  context;
   }
 
   @Override