You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/12/20 09:04:00 UTC

[opennlp] branch master updated: OPENNLP-1166: TwoPassDataIndexer fails if features contain \n

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new a5cb9a4  OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
a5cb9a4 is described below

commit a5cb9a4b4a196bd2048cd21ed2fef175bc1181cf
Author: thygesen <th...@apache.org>
AuthorDate: Tue Dec 12 11:52:04 2017 +0100

    OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
---
 .../opennlp/tools/ml/model/TwoPassDataIndexer.java | 97 +++++++++++++++++++---
 .../tools/ml/model/TwoPassDataIndexerTest.java     | 28 +++++++
 2 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
index 5e347e8..4121e36 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
@@ -17,13 +17,16 @@
 
 package opennlp.tools.ml.model;
 
-import java.io.BufferedWriter;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
+import java.math.BigInteger;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -59,20 +62,28 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
     File tmp = File.createTempFile("events", null);
     tmp.deleteOnExit();
     int numEvents;
-    try (Writer osw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmp),
-        StandardCharsets.UTF_8))) {
-      numEvents = computeEventCounts(eventStream, osw, predicateIndex, cutoff);
+    BigInteger writeHash;
+    HashSumEventStream writeEventStream = new HashSumEventStream(eventStream);  // do not close.
+    try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmp)))) {
+      numEvents = computeEventCounts(writeEventStream, dos, predicateIndex, cutoff);
     }
+    writeHash = writeEventStream.calculateHashSum();
+
     display("done. " + numEvents + " events\n");
 
     display("\tIndexing...  ");
 
     List<ComparableEvent> eventsToCompare;
-    try (FileEventStream fes = new FileEventStream(tmp)) {
-      eventsToCompare = index(fes, predicateIndex);
+    BigInteger readHash = null;
+    try (HashSumEventStream readStream = new HashSumEventStream(new EventStream(tmp))) {
+      eventsToCompare = index(readStream, predicateIndex);
+      readHash = readStream.calculateHashSum();
     }
-
     tmp.delete();
+
+    if (readHash.compareTo(writeHash) != 0)
+      throw new IOException("Event hash for writing and reading events did not match.");
+
     display("done.\n");
 
     if (sort) {
@@ -91,12 +102,19 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
    * occur at least <tt>cutoff</tt> times are added to the
    * <tt>predicatesInOut</tt> map along with a unique integer index.
    *
+   * Protocol:
+   *  1 - (utf string) - Event outcome
+   *  2 - (int) - Event context array length
+   *  3+ - (utf string) - Event context string
+   *  4 - (int) - Event values array length
+   *  5+ - (float) - Event value
+   *
    * @param eventStream an <code>EventStream</code> value
    * @param eventStore a writer to which the events are written to for later processing.
    * @param predicatesInOut a <code>TObjectIntHashMap</code> value
    * @param cutoff an <code>int</code> value
    */
-  private int computeEventCounts(ObjectStream<Event> eventStream, Writer eventStore,
+  private int computeEventCounts(ObjectStream<Event> eventStream, DataOutputStream eventStore,
       Map<String,Integer> predicatesInOut, int cutoff) throws IOException {
     Map<String,Integer> counter = new HashMap<>();
     int eventCount = 0;
@@ -104,9 +122,23 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
     Event ev;
     while ((ev = eventStream.read()) != null) {
       eventCount++;
-      eventStore.write(FileEventStream.toLine(ev));
+
+      eventStore.writeUTF(ev.getOutcome());
+
+      eventStore.writeInt(ev.getContext().length);
       String[] ec = ev.getContext();
       update(ec, counter);
+      for (String ctxString : ec)
+        eventStore.writeUTF(ctxString);
+
+      if (ev.getValues() == null) {
+        eventStore.writeInt(0);
+      }
+      else {
+        eventStore.writeInt(ev.getValues().length);
+        for (float value : ev.getValues())
+          eventStore.writeFloat(value);
+      }
     }
 
     String[] predicateSet = counter.entrySet().stream()
@@ -122,4 +154,45 @@ public class TwoPassDataIndexer extends AbstractDataIndexer {
 
     return eventCount;
   }
+
+  private static class EventStream implements ObjectStream<Event> {
+
+    private final DataInputStream inputStream;
+
+    public EventStream(File file) throws IOException {
+      inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+    }
+
+    @Override
+    public Event read() throws IOException {
+      if (inputStream.available() != 0) {
+        String outcome = inputStream.readUTF();
+        int contextLenght = inputStream.readInt();
+        String[] context = new String[contextLenght];
+        for (int i = 0; i < contextLenght; i++)
+          context[i] = inputStream.readUTF();
+        int valuesLength = inputStream.readInt();
+        float[] values = null;
+        if (valuesLength > 0) {
+          values = new float[valuesLength];
+          for (int i = 0; i < valuesLength; i++)
+            values[i] = inputStream.readFloat();
+        }
+        return new Event(outcome, context, values);
+      }
+      else {
+        return null;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException, UnsupportedOperationException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void close() throws IOException {
+      inputStream.close();
+    }
+  }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
index c246936..a8a1b22 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
@@ -23,8 +23,15 @@ import java.util.Collections;
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.namefind.DefaultNameContextGenerator;
+import opennlp.tools.namefind.NameContextGenerator;
+import opennlp.tools.namefind.NameFinderEventStream;
+import opennlp.tools.namefind.NameSample;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
 
 public class TwoPassDataIndexerTest {
 
@@ -61,4 +68,25 @@ public class TwoPassDataIndexerTest {
     Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
     Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
   }
+
+  @Test
+  public void testIndexWithNewline() throws IOException {
+
+    String[] sentence = "He belongs to Apache \n Software Foundation .".split(" ");
+
+    NameContextGenerator CG = new DefaultNameContextGenerator(
+            (AdaptiveFeatureGenerator[]) null);
+
+    NameSample nameSample = new NameSample(sentence,
+            new Span[] { new Span(3, 7) }, false);
+
+    ObjectStream<Event> eventStream = new NameFinderEventStream(
+            ObjectStreamUtils.createObjectStream(nameSample), "org", CG, null);
+
+    DataIndexer indexer = new TwoPassDataIndexer();
+    indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+    indexer.index(eventStream);
+    Assert.assertEquals(5, indexer.getContexts().length);
+
+  }
 }

-- 
To stop receiving notification emails like this one, please contact
commits@opennlp.apache.org.