You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/08/16 09:27:26 UTC

[opennlp] branch master updated: OPENNLP-1119 Select sentences randomly and shuffle order of samples

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new ccf1b0a  OPENNLP-1119 Select sentences randomly and shuffle order of samples
ccf1b0a is described below

commit ccf1b0aa6159ad15c07cbc75ce6b9f17ad18f9f0
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Thu Jul 27 18:03:57 2017 +0200

    OPENNLP-1119 Select sentences randomly and shuffle order of samples
    
    The samples should be built from randomly picked lines taken
    from a sentences file. The samples in the stream should be shuffled.
---
 .../leipzig/LeipzigLanguageSampleStream.java       | 96 +++++++++++++++-------
 .../LeipzigLanguageSampleStreamFactory.java        |  4 +-
 .../tools/formats/leipzig/SampleShuffleStream.java | 61 ++++++++++++++
 3 files changed, 128 insertions(+), 33 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index 6c4d009..9374a20 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -20,10 +20,18 @@ package opennlp.tools.formats.leipzig;
 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
+import java.util.Random;
+import java.util.Set;
 import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import opennlp.tools.langdetect.Language;
 import opennlp.tools.langdetect.LanguageSample;
@@ -34,46 +42,72 @@ import opennlp.tools.util.PlainTextByLineStream;
 public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
 
   private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+
     private final String lang;
-    private int sentencesPerSample;
-    private int numberOfSamples;
 
-    private ObjectStream<String> lineStream;
-    private int sampleCount;
+    private Iterator<String> lineIterator;
 
     LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
         throws IOException {
-      this.lang = sentencesFile.getName().substring(0, 3);
-      this.sentencesPerSample = sentencesPerSample;
-      this.numberOfSamples = numberOfSamples;
 
-      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
-          StandardCharsets.UTF_8);
+      this.lang = lang;
+
+      // The file name contains the number of lines, but to make this more stable
+      // the file is scanned once for the count, even though this is slower
+      int totalLineCount = (int) Files.lines(sentencesFile.toPath()).count();
+
+      List<Integer> indexes = IntStream.range(0, totalLineCount)
+          .boxed().collect(Collectors.toList());
+
+      Collections.shuffle(indexes, random);
+
+      Set<Integer> selectedLines = new HashSet<>(
+          indexes.subList(0, sentencesPerSample * numberOfSamples));
+
+      List<String> sentences = new ArrayList<>();
+
+      try (ObjectStream<String> lineStream = new PlainTextByLineStream(
+          new MarkableFileInputStreamFactory(sentencesFile), StandardCharsets.UTF_8)) {
+
+        int lineIndex = 0;
+        String line;
+        while ((line = lineStream.read()) != null) {
+
+          int tabIndex = line.indexOf('\t');
+          if (tabIndex != -1) {
+            if (selectedLines.contains(lineIndex)) {
+              sentences.add(line);
+            }
+          }
+
+          lineIndex++;
+        }
+      }
+
+      Collections.shuffle(sentences, random);
+
+      lineIterator = sentences.iterator();
     }
 
     @Override
     public LanguageSample read() throws IOException {
+      StringBuilder sampleString = new StringBuilder();
 
-      if (sampleCount < numberOfSamples) {
-        StringBuilder sampleString = new StringBuilder();
-
-        int count = 0;
-        String line;
-        while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+      int count = 0;
+      while (count < sentencesPerSample && lineIterator.hasNext()) {
 
-          int textStart = line.indexOf('\t') + 1;
+        String line = lineIterator.next();
+        int textStart = line.indexOf('\t') + 1;
 
-          // TODO: It should it be changed to contain an array of sample strings ?!
-          sampleString.append(line.substring(textStart) + " ");
+        sampleString.append(line.substring(textStart) + " ");
 
-          count++;
-        }
+        count++;
+      }
 
-        if (sampleString.length() > 0) {
-          sampleCount++;
-          return new LanguageSample(new Language(lang), sampleString);
-        }
+      if (sampleString.length() > 0) {
+        return new LanguageSample(new Language(lang), sampleString);
       }
+
       return null;
     }
   }
@@ -86,10 +120,13 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
   private Iterator<File> sentencesFilesIt;
   private ObjectStream<LanguageSample> sampleStream;
 
+  private final Random random;
+
   public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
                                      final int samplesPerLanguage) throws IOException {
     this.sentencesPerSample = sentencesPerSample;
-    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    // TODO: Use a FileFilter to make this more reliable in case there are
+    //       files which should be ignored or are shorter than 3 chars for the lang detect substring
     sentencesFiles = leipzigFolder.listFiles();
     Arrays.sort(sentencesFiles);
 
@@ -100,6 +137,8 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
     langSampleCounts = langCounts.entrySet().stream()
         .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
 
+    random = new Random(23);
+
     reset();
   }
 
@@ -111,7 +150,7 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
     else {
       if (sentencesFilesIt.hasNext()) {
         File sentencesFile = sentencesFilesIt.next();
-        System.out.println(sentencesFile);
+
         String lang = sentencesFile.getName().substring(0, 3);
 
         sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
@@ -128,9 +167,4 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
     sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
     sampleStream = null;
   }
-
-  public static void main(String[] args) throws Exception {
-    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
-        10, 100000);
-  }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
index 59a7551..f7fbc08 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -64,9 +64,9 @@ public class LeipzigLanguageSampleStreamFactory
     File sentencesFileDir = params.getSentencesDir();
 
     try {
-      return new LeipzigLanguageSampleStream(sentencesFileDir,
+      return new SampleShuffleStream(new LeipzigLanguageSampleStream(sentencesFileDir,
           Integer.parseInt(params.getSentencesPerSample()),
-          Integer.parseInt(params.getSamplesPerLanguage()));
+          Integer.parseInt(params.getSamplesPerLanguage())));
     } catch (IOException e) {
       throw new TerminateToolException(-1, "IO error while opening sample data.", e);
     }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
new file mode 100644
index 0000000..be81c12
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import opennlp.tools.util.ObjectStream;
+
+class SampleShuffleStream<T> implements ObjectStream<T> { // replays buffered samples in shuffled order
+
+  private List<T> bufferedSamples = new ArrayList<>(); // every sample read from the wrapped stream
+
+  private Iterator<T> sampleIt; // read position in bufferedSamples, recreated by reset()
+
+  SampleShuffleStream(ObjectStream<T> samples) throws IOException { // drains samples up front
+
+    T sample;
+    while ((sample = samples.read()) != null) { // read() returning null ends the input
+      bufferedSamples.add(sample);
+    }
+
+    Collections.shuffle(bufferedSamples, new Random(23)); // fixed seed: reproducible for same input
+
+    reset(); // position the iterator at the first shuffled sample
+  }
+
+  @Override
+  public T read() throws IOException {
+
+    if (sampleIt.hasNext()) {
+      return sampleIt.next();
+    }
+
+    return null; // all buffered samples have been returned
+  }
+
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    sampleIt = bufferedSamples.iterator(); // restart over the same order, no reshuffle
+  }
+}

-- 
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].