You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2017/11/15 00:16:33 UTC

parquet-mr git commit: PARQUET-1143: Update to Parquet format 2.4.0.

Repository: parquet-mr
Updated Branches:
  refs/heads/master ba7b8ba69 -> 132b2a8c5


PARQUET-1143: Update to Parquet format 2.4.0.

This adds the new compression codecs (BROTLI, LZ4, and ZSTD) that are required by format 2.4.0.

Author: Ryan Blue <bl...@apache.org>

Closes #430 from rdblue/PARQUET-1143-format-2.4.0-updates and squashes the following commits:

0aca87812 [Ryan Blue] PARQUET-1143: Remove staging repository now that 2.4.0 is released.
89b01cb64 [Ryan Blue] PARQUET-1143: Make brotli-codec an optional dependency.
a2f57ba5b [Ryan Blue] PARQUET-1143: Drop hadoop-1 tests from Travis CI.
d0f81d7cd [Ryan Blue] PARQUET-1143: Use slf4j-simple and log4j in Thrift/Pig tests.
326b8ac74 [Ryan Blue] PARQUET-1143: Update Travis to use the default ubuntu image.
4ad46f94c [Ryan Blue] PARQUET-1143: Use slf4j-log4j12 in Pig tests.
785e84dff [Ryan Blue] PARQUET-1143: Fix Travis CI.
efa171fda [Ryan Blue] PARQUET-1143: Ban slf4j-log4j12 dependency.
bf61e84ab [Ryan Blue] PARQUET-1143: Update to Parquet format 2.4.0.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/132b2a8c
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/132b2a8c
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/132b2a8c

Branch: refs/heads/master
Commit: 132b2a8c553bdcfd445e88680beac6f225c50ac4
Parents: ba7b8ba
Author: Ryan Blue <bl...@apache.org>
Authored: Tue Nov 14 16:16:28 2017 -0800
Committer: Ryan Blue <bl...@apache.org>
Committed: Tue Nov 14 16:16:28 2017 -0800

----------------------------------------------------------------------
 .travis.yml                                     |  4 +--
 parquet-avro/pom.xml                            |  6 ++++
 parquet-benchmarks/pom.xml                      | 17 +++++++++--
 parquet-cascading/pom.xml                       |  6 ++++
 .../main/java/org/apache/parquet/cli/Util.java  |  6 ++++
 parquet-hadoop/pom.xml                          | 12 ++++++++
 .../hadoop/metadata/CompressionCodecName.java   |  5 +++-
 .../parquet/hadoop/TestDirectCodecFactory.java  | 14 +++++++--
 parquet-pig/src/test/resources/log4j.properties | 23 +++++++++++++++
 .../src/test/resources/log4j.properties         | 23 +++++++++++++++
 parquet-scrooge/pom.xml                         |  6 ++++
 parquet-thrift/pom.xml                          | 15 +++++++++-
 pom.xml                                         | 30 +++++++++++++++++++-
 13 files changed, 157 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index f4b56c6..55f6e9a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,12 +22,12 @@ before_install:
   - tar zxf thrift-0.7.0.tar.gz
   - cd thrift-0.7.0
   - chmod +x ./configure
-  - ./configure --disable-gen-erl --disable-gen-hs --without-php --without-nodejs --without-ruby --without-haskell --without-erlang
+  - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang --without-php
   - sudo make install
   - cd ..
 
 env:
-  - HADOOP_PROFILE=hadoop-1 TEST_CODECS=uncompressed
+  - HADOOP_PROFILE=default TEST_CODECS=uncompressed,brotli
   - HADOOP_PROFILE=default TEST_CODECS=gzip,snappy
 
 install: mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || mvn install --batch-mode -DskipTests=true -Dmaven.javadoc.skip=true -Dsource.skip=true > mvn_install.log || (cat mvn_install.log && false)

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-avro/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml
index ecb69bb..f2b4c9e 100644
--- a/parquet-avro/pom.xml
+++ b/parquet-avro/pom.xml
@@ -63,6 +63,12 @@
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-benchmarks/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml
index b01a967..df3d832 100644
--- a/parquet-benchmarks/pom.xml
+++ b/parquet-benchmarks/pom.xml
@@ -48,9 +48,15 @@
        <version>${project.version}</version>
     </dependency>
     <dependency>
-       <groupId>org.apache.hadoop</groupId>
-       <artifactId>hadoop-client</artifactId>
-       <version>${hadoop.version}</version>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
        <groupId>org.openjdk.jmh</groupId>
@@ -73,6 +79,11 @@
 
   <build>
     <plugins>
+      <!-- This module disables semver checks because it is not a public API.
+      <plugin>
+        <artifactId>maven-enforcer-plugin</artifactId>
+      </plugin>
+      -->
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-cascading/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-cascading/pom.xml b/parquet-cascading/pom.xml
index 07e8f68..c386817 100644
--- a/parquet-cascading/pom.xml
+++ b/parquet-cascading/pom.xml
@@ -66,6 +66,12 @@
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java
----------------------------------------------------------------------
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java
index 860a218..07a5364 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java
@@ -218,6 +218,12 @@ public class Util {
         return "G";
       case LZO:
         return "L";
+      case BROTLI:
+        return "B";
+      case LZ4:
+        return "4";
+      case ZSTD:
+        return "Z";
       default:
         return "?";
     }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-hadoop/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml
index 84ef43f..5df2002 100644
--- a/parquet-hadoop/pom.xml
+++ b/parquet-hadoop/pom.xml
@@ -51,6 +51,12 @@
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>
@@ -79,6 +85,12 @@
       <artifactId>commons-pool</artifactId>
       <version>1.6</version>
     </dependency>
+    <dependency>
+      <groupId>com.github.rdblue</groupId>
+      <artifactId>brotli-codec</artifactId>
+      <version>${brotli-codec.version}</version>
+      <optional>true</optional>
+    </dependency>
 
     <dependency>
       <groupId>com.google.guava</groupId>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/CompressionCodecName.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/CompressionCodecName.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/CompressionCodecName.java
index d03d280..153133e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/CompressionCodecName.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/CompressionCodecName.java
@@ -27,7 +27,10 @@ public enum CompressionCodecName {
   UNCOMPRESSED(null, CompressionCodec.UNCOMPRESSED, ""),
   SNAPPY("org.apache.parquet.hadoop.codec.SnappyCodec", CompressionCodec.SNAPPY, ".snappy"),
   GZIP("org.apache.hadoop.io.compress.GzipCodec", CompressionCodec.GZIP, ".gz"),
-  LZO("com.hadoop.compression.lzo.LzoCodec", CompressionCodec.LZO, ".lzo");
+  LZO("com.hadoop.compression.lzo.LzoCodec", CompressionCodec.LZO, ".lzo"),
+  BROTLI("org.apache.hadoop.io.compress.BrotliCodec", CompressionCodec.BROTLI, ".br"),
+  LZ4("org.apache.hadoop.io.compress.Lz4Codec", CompressionCodec.LZ4, ".lz4"),
+  ZSTD("org.apache.hadoop.io.compress.ZStandardCodec", CompressionCodec.ZSTD, ".zstd");
 
   public static CompressionCodecName fromConf(String name) {
      if (name == null) {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java
index caf2ed6..3dd17e9 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java
@@ -18,7 +18,9 @@
 package org.apache.parquet.hadoop;
 
 import java.nio.ByteBuffer;
+import java.util.HashSet;
 import java.util.Random;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.parquet.bytes.ByteBufferAllocator;
@@ -30,6 +32,11 @@ import org.junit.Test;
 import org.apache.parquet.bytes.BytesInput;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 
+import static org.apache.parquet.hadoop.metadata.CompressionCodecName.BROTLI;
+import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZ4;
+import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZO;
+import static org.apache.parquet.hadoop.metadata.CompressionCodecName.ZSTD;
+
 public class TestDirectCodecFactory {
 
   private static enum Decompression {
@@ -146,13 +153,16 @@ public class TestDirectCodecFactory {
   public void compressionCodecs() throws Exception {
     final int[] sizes = { 4 * 1024, 1 * 1024 * 1024 };
     final boolean[] comp = { true, false };
+    Set<CompressionCodecName> codecsToSkip = new HashSet<>();
+    codecsToSkip.add(LZO); // not distributed because it is GPL
+    codecsToSkip.add(LZ4); // not distributed in the default version of Hadoop
+    codecsToSkip.add(ZSTD); // not distributed in the default version of Hadoop
 
     for (final int size : sizes) {
       for (final boolean useOnHeapComp : comp) {
         for (final Decompression decomp : Decompression.values()) {
           for (final CompressionCodecName codec : CompressionCodecName.values()) {
-            if (codec == CompressionCodecName.LZO) {
-              // not installed as gpl.
+            if (codecsToSkip.contains(codec)) {
               continue;
             }
             test(size, codec, useOnHeapComp, decomp);

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-pig/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/parquet-pig/src/test/resources/log4j.properties b/parquet-pig/src/test/resources/log4j.properties
new file mode 100644
index 0000000..b60c8c6
--- /dev/null
+++ b/parquet-pig/src/test/resources/log4j.properties
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+log4j.rootCategory=INFO, console
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %t %c{1}: %m%n

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-protobuf/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/parquet-protobuf/src/test/resources/log4j.properties b/parquet-protobuf/src/test/resources/log4j.properties
new file mode 100644
index 0000000..b60c8c6
--- /dev/null
+++ b/parquet-protobuf/src/test/resources/log4j.properties
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+log4j.rootCategory=INFO, console
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %t %c{1}: %m%n

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-scrooge/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-scrooge/pom.xml b/parquet-scrooge/pom.xml
index 60867ba..0226efc 100644
--- a/parquet-scrooge/pom.xml
+++ b/parquet-scrooge/pom.xml
@@ -55,6 +55,12 @@
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/parquet-thrift/pom.xml
----------------------------------------------------------------------
diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml
index 20666bc..3e69e4e 100644
--- a/parquet-thrift/pom.xml
+++ b/parquet-thrift/pom.xml
@@ -52,6 +52,12 @@
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>com.twitter.elephantbird</groupId>
@@ -117,11 +123,18 @@
     </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
+      <artifactId>slf4j-simple</artifactId>
       <version>${slf4j.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
+      <!-- needed for Pig tests -->
+      <groupId>log4j</groupId>
+      <artifactId>log4j</artifactId>
+      <version>1.2.17</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-hadoop</artifactId>
       <version>${project.version}</version>

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/132b2a8c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 40977a5..44b0b62 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,6 +50,15 @@
     </mailingList>
   </mailingLists>
 
+  <repositories>
+    <repository>
+      <id>jitpack.io</id>
+      <url>https://jitpack.io</url>
+      <name>Jitpack.io repository</name>
+      <!-- needed for brotli-codec -->
+    </repository>
+  </repositories>
+
   <developers>
     <developer>
       <name>Julien Le Dem</name>
@@ -72,7 +81,7 @@
     <hadoop1.version>1.2.1</hadoop1.version>
     <cascading.version>2.7.1</cascading.version>
     <cascading3.version>3.1.2</cascading3.version>
-    <parquet.format.version>2.3.1</parquet.format.version>
+    <parquet.format.version>2.4.0</parquet.format.version>
     <previous.version>1.7.0</previous.version>
     <thrift.executable>thrift</thrift.executable>
     <scala.version>2.10.6</scala.version>
@@ -88,6 +97,7 @@
     <slf4j.version>1.7.22</slf4j.version>
     <avro.version>1.8.2</avro.version>
     <guava.version>20.0</guava.version>
+    <brotli-codec.version>0.1.1</brotli-codec.version>
     <mockito.version>1.10.19</mockito.version>
 
     <!-- parquet-cli dependencies -->
@@ -242,6 +252,8 @@
                      <exclude>org/apache/parquet/hadoop/CodecFactory**</exclude>
                      <exclude>shaded/**</exclude> <!-- shaded by parquet -->
                      <exclude>org/apache/parquet/it/unimi/dsi/fastutil/**</exclude> <!-- Another shaded dependency from parquet-column -->
+                     <exclude>org/apache/parquet/benchmarks/**</exclude>
+                     <exclude>org/openjdk/**</exclude>
                      <!-- temporary exclusions for false-positives -->
                      <exclude>org/apache/parquet/Version</exclude>
                      <exclude>org/apache/parquet/schema/**</exclude> <!-- methods moved to new superclass -->
@@ -256,6 +268,22 @@
                </rules>
              </configuration>
            </execution>
+           <execution>
+            <id>enforce-banned-dependencies</id>
+            <goals>
+              <goal>enforce</goal>
+            </goals>
+            <configuration>
+              <rules>
+                <bannedDependencies>
+                  <excludes>
+                    <exclude>org.slf4j:slf4j-log4j12</exclude>
+                  </excludes>
+                </bannedDependencies>
+              </rules>
+              <fail>true</fail>
+            </configuration>
+          </execution>
          </executions>
         </plugin>
         <plugin>