You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by nk...@apache.org on 2018/07/30 08:18:28 UTC

[avro] branch master updated: AVRO-2195: Add Zstandard Codec (Benson Qiu via Nandor Kollar)

This is an automated email from the ASF dual-hosted git repository.

nkollar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/master by this push:
     new cf2f303  AVRO-2195: Add Zstandard Codec (Benson Qiu via Nandor Kollar)
cf2f303 is described below

commit cf2f30336efe0ecc3debc7bede86fde6d23f7c79
Author: Nandor Kollar <nk...@apache.org>
AuthorDate: Mon Jul 30 10:00:23 2018 +0200

    AVRO-2195: Add Zstandard Codec (Benson Qiu via Nandor Kollar)
---
 lang/java/avro/pom.xml                             |   4 +
 .../java/org/apache/avro/file/CodecFactory.java    |   9 +-
 .../org/apache/avro/file/DataFileConstants.java    |   1 +
 .../java/org/apache/avro/file/ZstandardCodec.java  | 105 +++++++++++++++++++++
 .../test/java/org/apache/avro/TestDataFile.java    |   1 +
 .../org/apache/avro/file/TestZstandardCodec.java   |  62 ++++++++++++
 .../avro/mapreduce/TestAvroKeyOutputFormat.java    |   8 ++
 lang/java/pom.xml                                  |   9 +-
 8 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/lang/java/avro/pom.xml b/lang/java/avro/pom.xml
index ab798ee..bf12ebb 100644
--- a/lang/java/avro/pom.xml
+++ b/lang/java/avro/pom.xml
@@ -203,6 +203,10 @@
       <artifactId>hamcrest-all</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.github.luben</groupId>
+      <artifactId>zstd-jni</artifactId>
+    </dependency>
   </dependencies>
 
 </project>
diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/CodecFactory.java b/lang/java/avro/src/main/java/org/apache/avro/file/CodecFactory.java
index 3df98e2..db51fc6 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/file/CodecFactory.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/file/CodecFactory.java
@@ -65,6 +65,11 @@ public abstract class CodecFactory {
     return new BZip2Codec.Option();
   }
 
  /**
   * Zstandard codec, using the library's default settings.
   * Registered in the default codec table under the name {@code "zstandard"}.
   */
  public static CodecFactory zstandardCodec() {
    return new ZstandardCodec.Option();
  }
+
   /** Creates internal Codec. */
   protected abstract Codec createInstance();
 
@@ -83,17 +88,19 @@ public abstract class CodecFactory {
     addCodec("snappy", snappyCodec());
     addCodec("bzip2", bzip2Codec());
     addCodec("xz", xzCodec(DEFAULT_XZ_LEVEL));
+    addCodec("zstandard", zstandardCodec());
   }
 
   /** Maps a codec name into a CodecFactory.
    *
-   * Currently there are five codecs registered by default:
+   * Currently there are six codecs registered by default:
    * <ul>
    *   <li>{@code null}</li>
    *   <li>{@code deflate}</li>
    *   <li>{@code snappy}</li>
    *   <li>{@code bzip2}</li>
    *   <li>{@code xz}</li>
+   *   <li>{@code zstandard}</li>
    * </ul>
    */
   public static CodecFactory fromString(String s) {
diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileConstants.java b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileConstants.java
index a81b0df..1f6dec2 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileConstants.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileConstants.java
@@ -39,5 +39,6 @@ public class DataFileConstants {
   public static final String SNAPPY_CODEC = "snappy";
   public static final String BZIP2_CODEC = "bzip2";
   public static final String XZ_CODEC = "xz";
+  public static final String ZSTANDARD_CODEC = "zstandard";
 
 }
diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/ZstandardCodec.java b/lang/java/avro/src/main/java/org/apache/avro/file/ZstandardCodec.java
new file mode 100644
index 0000000..4ec8433
--- /dev/null
+++ b/lang/java/avro/src/main/java/org/apache/avro/file/ZstandardCodec.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.avro.file;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+
+import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
+import org.apache.commons.compress.compressors.zstandard.ZstdCompressorOutputStream;
+import org.apache.commons.compress.utils.IOUtils;
+
+public class ZstandardCodec extends Codec {
+
+    static class Option extends CodecFactory {
+
+        @Override
+        protected Codec createInstance() {
+          return new ZstandardCodec();
+        }
+      }
+
+    private ByteArrayOutputStream outputBuffer;
+
+    @Override
+    public String getName() {
+        return DataFileConstants.ZSTANDARD_CODEC;
+    }
+
+    @Override
+    public ByteBuffer compress(ByteBuffer uncompressedData) throws IOException {
+        ByteArrayOutputStream baos = getOutputBuffer(uncompressedData.remaining());
+        OutputStream outputStream = new ZstdCompressorOutputStream(baos);
+        writeAndClose(uncompressedData, outputStream);
+        return ByteBuffer.wrap(baos.toByteArray());
+    }
+
+    @Override
+    public ByteBuffer decompress(ByteBuffer compressedData) throws IOException {
+        ByteArrayOutputStream baos = getOutputBuffer(compressedData.remaining());
+        InputStream bytesIn = new ByteArrayInputStream(
+          compressedData.array(),
+          compressedData.arrayOffset() + compressedData.position(),
+          compressedData.remaining());
+        InputStream ios = new ZstdCompressorInputStream(bytesIn);
+        try {
+          IOUtils.copy(ios, baos);
+        } finally {
+          ios.close();
+        }
+        return ByteBuffer.wrap(baos.toByteArray());
+    }
+
+    private void writeAndClose(ByteBuffer data, OutputStream to) throws IOException {
+        byte[] input = data.array();
+        int offset = data.arrayOffset() + data.position();
+        int length = data.remaining();
+        try {
+          to.write(input, offset, length);
+        } finally {
+          to.close();
+        }
+      }
+
+    // get and initialize the output buffer for use.
+    private ByteArrayOutputStream getOutputBuffer(int suggestedLength) {
+      if (outputBuffer == null) {
+        outputBuffer = new ByteArrayOutputStream(suggestedLength);
+      }
+      outputBuffer.reset();
+      return outputBuffer;
+    }
+
+    @Override
+    public int hashCode() {
+      return getName().hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+          if (obj == null || obj.getClass() != getClass())
+            return false;
+          return true;
+    }
+}
diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestDataFile.java b/lang/java/avro/src/test/java/org/apache/avro/TestDataFile.java
index cf41b0d..64e6730 100644
--- a/lang/java/avro/src/test/java/org/apache/avro/TestDataFile.java
+++ b/lang/java/avro/src/test/java/org/apache/avro/TestDataFile.java
@@ -70,6 +70,7 @@ public class TestDataFile {
     r.add(new Object[] { CodecFactory.xzCodec(0) });
     r.add(new Object[] { CodecFactory.xzCodec(1) });
     r.add(new Object[] { CodecFactory.xzCodec(6) });
+    r.add(new Object[] { CodecFactory.zstandardCodec() });
     return r;
   }
 
diff --git a/lang/java/avro/src/test/java/org/apache/avro/file/TestZstandardCodec.java b/lang/java/avro/src/test/java/org/apache/avro/file/TestZstandardCodec.java
new file mode 100644
index 0000000..62726f7
--- /dev/null
+++ b/lang/java/avro/src/test/java/org/apache/avro/file/TestZstandardCodec.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.avro.file;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+public class TestZstandardCodec {
+
+	@Test
+	public void testZstandardCompressionAndDecompression() throws IOException {
+		Codec codec = CodecFactory.zstandardCodec().createInstance();
+		assertTrue(codec instanceof ZstandardCodec);
+		assertTrue(codec.getName().equals("zstandard"));
+
+		// Generate some random input.
+	    final int inputByteSize = 64 * 1024;
+	    byte[] inputByteArray = new byte[inputByteSize];
+	    for (int i = 0; i < inputByteSize; i++) {
+	      inputByteArray[i] = (byte)(65 + i % 10);
+	    }
+
+	    ByteBuffer inputByteBuffer = ByteBuffer.allocate(inputByteSize * 2);
+	    inputByteBuffer.put(inputByteArray);
+
+	    ByteBuffer compressedBuffer = codec.compress(inputByteBuffer);
+	    //Make sure something returned
+	    assertTrue(compressedBuffer.array().length > 0);
+	    //Make sure the compressed output is smaller then the original
+	    assertTrue(compressedBuffer.array().length < inputByteArray.length);
+
+	    ByteBuffer decompressedBuffer = codec.decompress(compressedBuffer);
+
+	    //The original array should be the same length as the decompressed array
+	    assertTrue(decompressedBuffer.array().length == inputByteArray.length);
+
+	    //Every byte in the outputByteArray should equal every byte in the input array
+	    byte[] outputByteArray = decompressedBuffer.array();
+	    for (int i = 0; i < inputByteSize; i++) {
+	      inputByteArray[i] = outputByteArray[i];
+	    }
+	}
+}
diff --git a/lang/java/mapred/src/test/java/org/apache/avro/mapreduce/TestAvroKeyOutputFormat.java b/lang/java/mapred/src/test/java/org/apache/avro/mapreduce/TestAvroKeyOutputFormat.java
index 0248037..270a2d3 100644
--- a/lang/java/mapred/src/test/java/org/apache/avro/mapreduce/TestAvroKeyOutputFormat.java
+++ b/lang/java/mapred/src/test/java/org/apache/avro/mapreduce/TestAvroKeyOutputFormat.java
@@ -80,6 +80,14 @@ public class TestAvroKeyOutputFormat {
   }
 
   @Test
+  public void testWithZstandardCode() throws IOException {
+    Configuration conf = new Configuration();
+    conf.setBoolean("mapred.output.compress", true);
+    conf.set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.ZSTANDARD_CODEC);
+    testGetRecordWriter(conf, CodecFactory.zstandardCodec(), DataFileConstants.DEFAULT_SYNC_INTERVAL);
+  }
+
+  @Test
   public void testWithDeflateCodeWithHadoopConfig() throws IOException {
     Configuration conf = new Configuration();
     conf.setBoolean("mapred.output.compress", true);
diff --git a/lang/java/pom.xml b/lang/java/pom.xml
index f7cbe7d..0dc2d23 100644
--- a/lang/java/pom.xml
+++ b/lang/java/pom.xml
@@ -54,7 +54,7 @@
     <ant.version>1.10.0</ant.version>
     <commons-cli.version>1.3.1</commons-cli.version>
     <commons-codec.version>1.10</commons-codec.version>
-    <commons-compress.version>1.13</commons-compress.version>
+    <commons-compress.version>1.16</commons-compress.version>
     <commons-lang.version>2.6</commons-lang.version>
     <commons-logging.version>1.2</commons-logging.version>
     <tukaani.version>1.6</tukaani.version>
@@ -65,6 +65,7 @@
     <!-- This Guava version should match Hadoop's Guava version. See AVRO-1781. -->
     <guava.version>11.0.2</guava.version>
     <findbugs-annotations.version>1.3.9-1</findbugs-annotations.version>
+    <zstd-jni.version>1.3.4-10</zstd-jni.version>
 
     <!-- version properties for plugins -->
     <bundle-plugin-version>3.2.0</bundle-plugin-version>
@@ -476,6 +477,12 @@
         <artifactId>grpc-netty</artifactId>
         <version>${grpc.version}</version>
       </dependency>
+      <dependency>
+        <groupId>com.github.luben</groupId>
+        <artifactId>zstd-jni</artifactId>
+        <version>${zstd-jni.version}</version>
+        <scope>test</scope>
+      </dependency>
     </dependencies>
   </dependencyManagement>