You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/08/16 09:27:26 UTC
[opennlp] branch master updated: OPENNLP-1119 Select sentences
randomly and shuffle order of samples
This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new ccf1b0a OPENNLP-1119 Select sentences randomly and shuffle order of samples
ccf1b0a is described below
commit ccf1b0aa6159ad15c07cbc75ce6b9f17ad18f9f0
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Thu Jul 27 18:03:57 2017 +0200
OPENNLP-1119 Select sentences randomly and shuffle order of samples
The samples should be built from randomly picked lines taken
from a sentences file. The samples in the stream should be shuffled.
---
.../leipzig/LeipzigLanguageSampleStream.java | 96 +++++++++++++++-------
.../LeipzigLanguageSampleStreamFactory.java | 4 +-
.../tools/formats/leipzig/SampleShuffleStream.java | 61 ++++++++++++++
3 files changed, 128 insertions(+), 33 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index 6c4d009..9374a20 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -20,10 +20,18 @@ package opennlp.tools.formats.leipzig;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
+import java.util.Random;
+import java.util.Set;
import java.util.stream.Collectors;
+import java.util.stream.IntStream;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageSample;
@@ -34,46 +42,72 @@ import opennlp.tools.util.PlainTextByLineStream;
public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+ // Produces LanguageSample objects for one sentences file from randomly selected lines.
private final String lang;
- private int sentencesPerSample;
- private int numberOfSamples;
- private ObjectStream<String> lineStream;
- private int sampleCount;
+ private Iterator<String> lineIterator; // iterates over the selected and shuffled sentence lines
LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
throws IOException {
- this.lang = sentencesFile.getName().substring(0, 3);
- this.sentencesPerSample = sentencesPerSample;
- this.numberOfSamples = numberOfSamples;
- lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
- StandardCharsets.UTF_8);
+ this.lang = lang;
+
+ // The file name contains the number of lines, but to make this more stable
+ // the file is scanned once for the count even though this is slower
+ int totalLineCount = (int) Files.lines(sentencesFile.toPath()).count();
+ // NOTE(review): the Stream returned by Files.lines above is never closed; consider try-with-resources.
+ List<Integer> indexes = IntStream.range(0, totalLineCount)
+ .boxed().collect(Collectors.toList());
+ // Shuffle all line indexes using the Random held by the enclosing stream.
+ Collections.shuffle(indexes, random);
+ // The first sentencesPerSample * numberOfSamples shuffled indexes form the selection.
+ Set<Integer> selectedLines = new HashSet<>(
+ indexes.subList(0, sentencesPerSample * numberOfSamples));
+ // NOTE(review): subList throws if the file has fewer lines than requested - confirm inputs.
+ List<String> sentences = new ArrayList<>();
+
+ try (ObjectStream<String> lineStream = new PlainTextByLineStream(
+ new MarkableFileInputStreamFactory(sentencesFile), StandardCharsets.UTF_8)) {
+ // Walk the file once and keep the selected lines; lines without a tab are skipped.
+ int lineIndex = 0;
+ String line;
+ while ((line = lineStream.read()) != null) {
+
+ int tabIndex = line.indexOf('\t');
+ if (tabIndex != -1) {
+ if (selectedLines.contains(lineIndex)) {
+ sentences.add(line);
+ }
+ }
+ // The index advances for every line read, whether or not it was kept.
+ lineIndex++;
+ }
+ }
+ // Shuffle the kept lines so they are no longer in file order.
+ Collections.shuffle(sentences, random);
+
+ lineIterator = sentences.iterator();
}
@Override
public LanguageSample read() throws IOException {
+ StringBuilder sampleString = new StringBuilder();
- if (sampleCount < numberOfSamples) {
- StringBuilder sampleString = new StringBuilder();
-
- int count = 0;
- String line;
- while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+ int count = 0;
+ while (count < sentencesPerSample && lineIterator.hasNext()) {
- int textStart = line.indexOf('\t') + 1;
+ String line = lineIterator.next();
+ int textStart = line.indexOf('\t') + 1; // the sentence text starts after the first tab
- // TODO: It should it be changed to contain an array of sample strings ?!
- sampleString.append(line.substring(textStart) + " ");
+ sampleString.append(line.substring(textStart) + " "); // sentences are joined with a trailing space
- count++;
- }
+ count++;
+ }
- if (sampleString.length() > 0) {
- sampleCount++;
- return new LanguageSample(new Language(lang), sampleString);
- }
+ if (sampleString.length() > 0) {
+ return new LanguageSample(new Language(lang), sampleString);
}
+ // Nothing was appended (for example no lines are left), so null is returned.
return null;
}
}
@@ -86,10 +120,13 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
private Iterator<File> sentencesFilesIt;
private ObjectStream<LanguageSample> sampleStream;
+ private final Random random;
+
public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
final int samplesPerLanguage) throws IOException {
this.sentencesPerSample = sentencesPerSample;
- // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+ // TODO: Use a FileFilter to make this more reliable in case there are
+ // files which should be ignored or are shorter than 3 chars for the lang detect substring
sentencesFiles = leipzigFolder.listFiles();
Arrays.sort(sentencesFiles);
@@ -100,6 +137,8 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
langSampleCounts = langCounts.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+ random = new Random(23);
+
reset();
}
@@ -111,7 +150,7 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
else {
if (sentencesFilesIt.hasNext()) {
File sentencesFile = sentencesFilesIt.next();
- System.out.println(sentencesFile);
+
String lang = sentencesFile.getName().substring(0, 3);
sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
@@ -128,9 +167,4 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
sampleStream = null;
}
-
- public static void main(String[] args) throws Exception {
- new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
- 10, 100000);
- }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
index 59a7551..f7fbc08 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -64,9 +64,9 @@ public class LeipzigLanguageSampleStreamFactory
File sentencesFileDir = params.getSentencesDir();
try {
- return new LeipzigLanguageSampleStream(sentencesFileDir,
+ return new SampleShuffleStream(new LeipzigLanguageSampleStream(sentencesFileDir,
Integer.parseInt(params.getSentencesPerSample()),
- Integer.parseInt(params.getSamplesPerLanguage()));
+ Integer.parseInt(params.getSamplesPerLanguage())));
} catch (IOException e) {
throw new TerminateToolException(-1, "IO error while opening sample data.", e);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
new file mode 100644
index 0000000..be81c12
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import opennlp.tools.util.ObjectStream;
+
+class SampleShuffleStream<T> implements ObjectStream<T> { // buffers all samples of a stream and serves them shuffled
+ // Every sample read from the wrapped stream, kept in shuffled order.
+ private List<T> bufferedSamples = new ArrayList<>();
+ // Cursor over bufferedSamples; recreated by reset().
+ private Iterator<T> sampleIt;
+
+ SampleShuffleStream(ObjectStream<T> samples) throws IOException {
+ // Read the given stream until it returns null, holding every sample in memory.
+ T sample;
+ while ((sample = samples.read()) != null) {
+ bufferedSamples.add(sample);
+ }
+ // Shuffle once, using a Random created with the fixed seed 23.
+ Collections.shuffle(bufferedSamples, new Random(23));
+ // Position the iterator at the first buffered sample.
+ reset();
+ }
+
+ @Override
+ public T read() throws IOException {
+ // Return the next buffered sample, or null once all of them have been returned.
+ if (sampleIt.hasNext()) {
+ return sampleIt.next();
+ }
+
+ return null;
+ }
+
+ @Override
+ public void reset() throws IOException, UnsupportedOperationException {
+ sampleIt = bufferedSamples.iterator(); // restart from the first sample; the shuffled order is kept
+ }
+}
--
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].