You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/12/20 09:04:00 UTC
[opennlp] branch master updated: OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new a5cb9a4 OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
a5cb9a4 is described below
commit a5cb9a4b4a196bd2048cd21ed2fef175bc1181cf
Author: thygesen <th...@apache.org>
AuthorDate: Tue Dec 12 11:52:04 2017 +0100
OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
---
.../opennlp/tools/ml/model/TwoPassDataIndexer.java | 97 +++++++++++++++++++---
.../tools/ml/model/TwoPassDataIndexerTest.java | 28 +++++++
2 files changed, 113 insertions(+), 12 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
index 5e347e8..4121e36 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
@@ -17,13 +17,16 @@
package opennlp.tools.ml.model;
-import java.io.BufferedWriter;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
+import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -59,20 +62,28 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
File tmp = File.createTempFile("events", null);
tmp.deleteOnExit();
int numEvents;
- try (Writer osw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmp),
- StandardCharsets.UTF_8))) {
- numEvents = computeEventCounts(eventStream, osw, predicateIndex, cutoff);
+ BigInteger writeHash;
+ HashSumEventStream writeEventStream = new HashSumEventStream(eventStream); // do not close.
+ try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmp)))) {
+ numEvents = computeEventCounts(writeEventStream, dos, predicateIndex, cutoff);
}
+ writeHash = writeEventStream.calculateHashSum();
+
display("done. " + numEvents + " events\n");
display("\tIndexing... ");
List<ComparableEvent> eventsToCompare;
- try (FileEventStream fes = new FileEventStream(tmp)) {
- eventsToCompare = index(fes, predicateIndex);
+ BigInteger readHash = null;
+ try (HashSumEventStream readStream = new HashSumEventStream(new EventStream(tmp))) {
+ eventsToCompare = index(readStream, predicateIndex);
+ readHash = readStream.calculateHashSum();
}
-
tmp.delete();
+
+ if (readHash.compareTo(writeHash) != 0)
+ throw new IOException("Event hash for writing and reading events did not match.");
+
display("done.\n");
if (sort) {
@@ -91,12 +102,19 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
* occur at least <tt>cutoff</tt> times are added to the
* <tt>predicatesInOut</tt> map along with a unique integer index.
*
+ * Protocol:
+ * 1 - (utf string) - Event outcome
+ * 2 - (int) - Event context array length
+ * 3+ - (utf string) - Event context string
+ * 4 - (int) - Event values array length
+ * 5+ - (float) - Event value
+ *
* @param eventStream an <code>EventStream</code> value
* @param eventStore a writer to which the events are written to for later processing.
* @param predicatesInOut a <code>TObjectIntHashMap</code> value
* @param cutoff an <code>int</code> value
*/
- private int computeEventCounts(ObjectStream<Event> eventStream, Writer eventStore,
+ private int computeEventCounts(ObjectStream<Event> eventStream, DataOutputStream eventStore,
Map<String,Integer> predicatesInOut, int cutoff) throws IOException {
Map<String,Integer> counter = new HashMap<>();
int eventCount = 0;
@@ -104,9 +122,23 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
Event ev;
while ((ev = eventStream.read()) != null) {
eventCount++;
- eventStore.write(FileEventStream.toLine(ev));
+
+ eventStore.writeUTF(ev.getOutcome());
+
+ eventStore.writeInt(ev.getContext().length);
String[] ec = ev.getContext();
update(ec, counter);
+ for (String ctxString : ec)
+ eventStore.writeUTF(ctxString);
+
+ if (ev.getValues() == null) {
+ eventStore.writeInt(0);
+ }
+ else {
+ eventStore.writeInt(ev.getValues().length);
+ for (float value : ev.getValues())
+ eventStore.writeFloat(value);
+ }
}
String[] predicateSet = counter.entrySet().stream()
@@ -122,4 +154,45 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
return eventCount;
}
+
+ private static class EventStream implements ObjectStream<Event> {
+
+ private final DataInputStream inputStream;
+
+ public EventStream(File file) throws IOException {
+ inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+ }
+
+ @Override
+ public Event read() throws IOException {
+ if (inputStream.available() != 0) {
+ String outcome = inputStream.readUTF();
+ int contextLenght = inputStream.readInt();
+ String[] context = new String[contextLenght];
+ for (int i = 0; i < contextLenght; i++)
+ context[i] = inputStream.readUTF();
+ int valuesLength = inputStream.readInt();
+ float[] values = null;
+ if (valuesLength > 0) {
+ values = new float[valuesLength];
+ for (int i = 0; i < valuesLength; i++)
+ values[i] = inputStream.readFloat();
+ }
+ return new Event(outcome, context, values);
+ }
+ else {
+ return null;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException, UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void close() throws IOException {
+ inputStream.close();
+ }
+ }
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
index c246936..a8a1b22 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
@@ -23,8 +23,15 @@ import java.util.Collections;
import org.junit.Assert;
import org.junit.Test;
+import opennlp.tools.namefind.DefaultNameContextGenerator;
+import opennlp.tools.namefind.NameContextGenerator;
+import opennlp.tools.namefind.NameFinderEventStream;
+import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
public class TwoPassDataIndexerTest {
@@ -61,4 +68,25 @@ public class TwoPassDataIndexerTest {
Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
}
+
+ @Test
+ public void testIndexWithNewline() throws IOException {
+
+ String[] sentence = "He belongs to Apache \n Software Foundation .".split(" ");
+
+ NameContextGenerator CG = new DefaultNameContextGenerator(
+ (AdaptiveFeatureGenerator[]) null);
+
+ NameSample nameSample = new NameSample(sentence,
+ new Span[] { new Span(3, 7) }, false);
+
+ ObjectStream<Event> eventStream = new NameFinderEventStream(
+ ObjectStreamUtils.createObjectStream(nameSample), "org", CG, null);
+
+ DataIndexer indexer = new TwoPassDataIndexer();
+ indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+ indexer.index(eventStream);
+ Assert.assertEquals(5, indexer.getContexts().length);
+
+ }
}
--
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].