You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by dk...@apache.org on 2020/05/26 12:09:21 UTC
[avro] branch master updated: AVRO-2703: Use KMP Algorithm For Sync
Marker Search (#782)
This is an automated email from the ASF dual-hosted git repository.
dkulp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/master by this push:
new 6f431fa AVRO-2703: Use KMP Algorithm For Sync Marker Search (#782)
6f431fa is described below
commit 6f431fa1717b4557768cb274524874d064485dfc
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Tue May 26 08:09:14 2020 -0400
AVRO-2703: Use KMP Algorithm For Sync Marker Search (#782)
* AVRO-2703: Use KMP Algorithm For Sync Marker Search
* Use different implementation to compute partial match table
Co-authored-by: David Mollitor <dm...@apache.org>
---
.../java/org/apache/avro/file/DataFileReader.java | 76 +++++++++++++++-------
1 file changed, 52 insertions(+), 24 deletions(-)
diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
index 3c343f2..b06105f 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileReader.java
@@ -18,7 +18,6 @@
package org.apache.avro.file;
import java.io.IOException;
-import java.io.EOFException;
import java.io.InputStream;
import java.io.File;
import java.util.Arrays;
@@ -38,6 +37,7 @@ import static org.apache.avro.file.DataFileConstants.MAGIC;
public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
private SeekableInputStream sin;
private long blockStart;
+ private int[] partialMatchTable;
/** Open a reader for a file. */
public static <D> FileReader<D> openReader(File file, DatumReader<D> reader) throws IOException {
@@ -167,36 +167,64 @@ public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D
* {@link #next()}.
*/
@Override
- public void sync(long position) throws IOException {
+ public void sync(final long position) throws IOException {
seek(position);
// work around an issue where 1.5.4 C stored sync in metadata
- if ((position == 0) && (getMeta("avro.sync") != null)) {
+ if ((position == 0L) && (getMeta("avro.sync") != null)) {
initialize(sin); // re-init to skip header
return;
}
- try {
- int i = 0, b;
- InputStream in = vin.inputStream();
- vin.readFixed(syncBuffer);
- do {
- int j = 0;
- for (; j < SYNC_SIZE; j++) {
- if (getHeader().sync[j] != syncBuffer[(i + j) % SYNC_SIZE])
- break;
- }
- if (j == SYNC_SIZE) { // matched a complete sync
- blockStart = position + i + SYNC_SIZE;
- return;
- }
- b = in.read();
- syncBuffer[i++ % SYNC_SIZE] = (byte) b;
- } while (b != -1);
- } catch (EOFException e) {
- // fall through
+
+ if (this.partialMatchTable == null) {
+ this.partialMatchTable = computePartialMatchTable(getHeader().sync);
+ }
+
+ final byte[] sync = getHeader().sync;
+ final InputStream in = vin.inputStream();
+ final int[] pm = this.partialMatchTable;
+
+ // Search for the sequence of bytes in the stream using Knuth-Morris-Pratt
+ long i = 0L;
+ for (int b = in.read(), j = 0; b != -1; b = in.read(), i++) {
+ final byte cb = (byte) b;
+ while (j > 0 && sync[j] != cb) {
+ j = pm[j - 1];
+ }
+ if (sync[j] == cb) {
+ j++;
+ }
+ if (j == SYNC_SIZE) {
+ this.blockStart = position + i + 1L;
+ return;
+ }
}
- // if no match or EOF set start to the end position
+ // if no match set start to the end position
blockStart = sin.tell();
- // System.out.println("block start location after EOF: " + blockStart );
+ }
+
+ /**
+ * Compute that Knuth-Morris-Pratt partial match table.
+ *
+ * @param pattern The pattern being searched
+ * @return the pre-computed partial match table
+ *
+ * @see <a href= "https://github.com/williamfiset/Algorithms">William Fiset
+ * Algorithms</a>
+ */
+ private int[] computePartialMatchTable(final byte[] pattern) {
+ final int[] pm = new int[pattern.length];
+ for (int i = 1, len = 0; i < pattern.length;) {
+ if (pattern[i] == pattern[len]) {
+ pm[i++] = ++len;
+ } else {
+ if (len > 0) {
+ len = pm[len - 1];
+ } else {
+ i++;
+ }
+ }
+ }
+ return pm;
}
@Override