You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by dk...@apache.org on 2020/05/26 12:09:21 UTC

[avro] branch master updated: AVRO-2703: Use KMP Algorithm For Sync Marker Search (#782)

This is an automated email from the ASF dual-hosted git repository.

dkulp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f431fa  AVRO-2703: Use KMP Algorithm For Sync Marker Search (#782)
6f431fa is described below

commit 6f431fa1717b4557768cb274524874d064485dfc
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Tue May 26 08:09:14 2020 -0400

    AVRO-2703: Use KMP Algorithm For Sync Marker Search (#782)
    
    * AVRO-2703: Use KMP Algorithm For Sync Marker Search
    
    * Use different implementation to compute partial match table
    
    Co-authored-by: David Mollitor <dm...@apache.org>
---
 .../java/org/apache/avro/file/DataFileReader.java  | 76 +++++++++++++++-------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
index 3c343f2..b06105f 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
@@ -18,7 +18,6 @@
 package org.apache.avro.file;
 
 import java.io.IOException;
-import java.io.EOFException;
 import java.io.InputStream;
 import java.io.File;
 import java.util.Arrays;
@@ -38,6 +37,7 @@ import static org.apache.avro.file.DataFileConstants.MAGIC;
 public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
   private SeekableInputStream sin;
   private long blockStart;
+  private int[] partialMatchTable;
 
   /** Open a reader for a file. */
   public static <D> FileReader<D> openReader(File file, DatumReader<D> reader) throws IOException {
@@ -167,36 +167,64 @@ public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D
    * {@link #next()}.
    */
   @Override
-  public void sync(long position) throws IOException {
+  public void sync(final long position) throws IOException {
     seek(position);
     // work around an issue where 1.5.4 C stored sync in metadata
-    if ((position == 0) && (getMeta("avro.sync") != null)) {
+    if ((position == 0L) && (getMeta("avro.sync") != null)) {
       initialize(sin); // re-init to skip header
       return;
     }
-    try {
-      int i = 0, b;
-      InputStream in = vin.inputStream();
-      vin.readFixed(syncBuffer);
-      do {
-        int j = 0;
-        for (; j < SYNC_SIZE; j++) {
-          if (getHeader().sync[j] != syncBuffer[(i + j) % SYNC_SIZE])
-            break;
-        }
-        if (j == SYNC_SIZE) { // matched a complete sync
-          blockStart = position + i + SYNC_SIZE;
-          return;
-        }
-        b = in.read();
-        syncBuffer[i++ % SYNC_SIZE] = (byte) b;
-      } while (b != -1);
-    } catch (EOFException e) {
-      // fall through
+
+    if (this.partialMatchTable == null) {
+      this.partialMatchTable = computePartialMatchTable(getHeader().sync);
+    }
+
+    final byte[] sync = getHeader().sync;
+    final InputStream in = vin.inputStream();
+    final int[] pm = this.partialMatchTable;
+
+    // Search for the sequence of bytes in the stream using Knuth-Morris-Pratt
+    long i = 0L;
+    for (int b = in.read(), j = 0; b != -1; b = in.read(), i++) {
+      final byte cb = (byte) b;
+      while (j > 0 && sync[j] != cb) {
+        j = pm[j - 1];
+      }
+      if (sync[j] == cb) {
+        j++;
+      }
+      if (j == SYNC_SIZE) {
+        this.blockStart = position + i + 1L;
+        return;
+      }
     }
-    // if no match or EOF set start to the end position
+    // if no match set start to the end position
     blockStart = sin.tell();
-    // System.out.println("block start location after EOF: " + blockStart );
+  }
+
+  /**
+   * Compute that Knuth-Morris-Pratt partial match table.
+   *
+   * @param pattern The pattern being searched
+   * @return the pre-computed partial match table
+   *
+   * @see <a href= "https://github.com/williamfiset/Algorithms">William Fiset
+   *      Algorithms</a>
+   */
+  private int[] computePartialMatchTable(final byte[] pattern) {
+    final int[] pm = new int[pattern.length];
+    for (int i = 1, len = 0; i < pattern.length;) {
+      if (pattern[i] == pattern[len]) {
+        pm[i++] = ++len;
+      } else {
+        if (len > 0) {
+          len = pm[len - 1];
+        } else {
+          i++;
+        }
+      }
+    }
+    return pm;
   }
 
   @Override