You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by su...@apache.org on 2021/09/14 18:17:38 UTC

[hadoop] branch trunk updated: HADOOP-17891. Exclude snappy-java and lz4-java from relocation in shaded hadoop client libraries (#3385)

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new b8f7c75  HADOOP-17891. Exclude snappy-java and lz4-java from relocation in shaded hadoop client libraries (#3385)
b8f7c75 is described below

commit b8f7c7527a7c33c204315a6ea615b4d3fd237744
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Tue Sep 14 11:17:18 2021 -0700

    HADOOP-17891. Exclude snappy-java and lz4-java from relocation in shaded hadoop client libraries (#3385)
---
 dev-support/bin/hadoop.sh                          |  11 +-
 hadoop-client-modules/hadoop-client-api/pom.xml    |  17 +++
 .../hadoop-client-check-invariants/pom.xml         |   2 +
 .../resources/ensure-jars-have-correct-contents.sh |   2 +
 .../hadoop-client-check-test-invariants/pom.xml    |   2 +
 .../hadoop-client-integration-tests/pom.xml        |   5 +
 .../apache/hadoop/example/ITUseHadoopCodecs.java   | 144 +++++++++++++++++++++
 .../hadoop-client-minicluster/pom.xml              |  14 ++
 .../hadoop-client-runtime/pom.xml                  |  14 ++
 9 files changed, 209 insertions(+), 2 deletions(-)

diff --git a/dev-support/bin/hadoop.sh b/dev-support/bin/hadoop.sh
index 28d3ad2..e055519 100755
--- a/dev-support/bin/hadoop.sh
+++ b/dev-support/bin/hadoop.sh
@@ -513,7 +513,7 @@ function shadedclient_initialize
   maven_add_install shadedclient
 }
 
-## @description build client facing shaded artifacts and test them
+## @description build client facing shaded and non-shaded artifacts and test them
 ## @audience private
 ## @stability evolving
 ## @param repostatus
@@ -546,13 +546,20 @@ function shadedclient_rebuild
     return 0
   fi
 
-  big_console_header "Checking client artifacts on ${repostatus}"
+  big_console_header "Checking client artifacts on ${repostatus} with shaded clients"
 
   echo_and_redirect "${logfile}" \
     "${MAVEN}" "${MAVEN_ARGS[@]}" verify -fae --batch-mode -am \
       "${modules[@]}" \
       -Dtest=NoUnitTests -Dmaven.javadoc.skip=true -Dcheckstyle.skip=true -Dspotbugs.skip=true
 
+  big_console_header "Checking client artifacts on ${repostatus} with non-shaded clients"
+
+  echo_and_redirect "${logfile}" \
+    "${MAVEN}" "${MAVEN_ARGS[@]}" verify -fae --batch-mode -am \
+      "${modules[@]}" \
+      -Pnoshade -Dtest=NoUnitTests -Dmaven.javadoc.skip=true -Dcheckstyle.skip=true -Dspotbugs.skip=true
+
   count=$("${GREP}" -c '\[ERROR\]' "${logfile}")
   if [[ ${count} -gt 0 ]]; then
     add_vote_table -1 shadedclient "${repostatus} has errors when building and testing our client artifacts."
diff --git a/hadoop-client-modules/hadoop-client-api/pom.xml b/hadoop-client-modules/hadoop-client-api/pom.xml
index 1a83743..d0d62f5 100644
--- a/hadoop-client-modules/hadoop-client-api/pom.xml
+++ b/hadoop-client-modules/hadoop-client-api/pom.xml
@@ -67,6 +67,13 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <!-- snappy-java is a native library and cannot be relocated, so we explicitly exclude it
+         from the shaded jar to prevent possible conflicts. It is declared as a transitive
+         dependency so that downstream projects pull it in. -->
+    <dependency>
+      <groupId>org.xerial.snappy</groupId>
+      <artifactId>snappy-java</artifactId>
+    </dependency>
   </dependencies>
   <profiles>
     <profile>
@@ -109,6 +116,10 @@
                     <includes>
                       <include>org.apache.hadoop:*</include>
                     </includes>
+                    <excludes>
+                      <!-- Leave snappy-java unshaded: it includes native methods, which cannot be relocated. -->
+                      <exclude>org.xerial.snappy:*</exclude>
+                    </excludes>
                   </artifactSet>
                   <filters>
                     <!-- We get these package level classes from various yarn api jars -->
@@ -147,6 +158,9 @@
                         <exclude>org/xml/sax/**/*</exclude>
                         <exclude>org/bouncycastle/*</exclude>
                         <exclude>org/bouncycastle/**/*</exclude>
+                        <!-- Exclude snappy-java -->
+                        <exclude>org/xerial/snappy/*</exclude>
+                        <exclude>org/xerial/snappy/**/*</exclude>
                       </excludes>
                     </relocation>
                     <relocation>
@@ -223,6 +237,9 @@
                         <!-- Exclude config keys for Hadoop that look like package names -->
                         <exclude>net/topology/*</exclude>
                         <exclude>net/topology/**/*</exclude>
+                        <!-- Exclude lz4-java -->
+                        <exclude>net/jpountz/*</exclude>
+                        <exclude>net/jpountz/**/*</exclude>
                       </excludes>
                     </relocation>
                     <!-- okio declares a top level package instead of nested -->
diff --git a/hadoop-client-modules/hadoop-client-check-invariants/pom.xml b/hadoop-client-modules/hadoop-client-check-invariants/pom.xml
index 6ae9900..9d4bce1 100644
--- a/hadoop-client-modules/hadoop-client-check-invariants/pom.xml
+++ b/hadoop-client-modules/hadoop-client-check-invariants/pom.xml
@@ -90,6 +90,8 @@
                     <exclude>com.google.code.findbugs:jsr305</exclude>
                     <!-- Leave bouncycastle unshaded because it's signed with a special Oracle certificate so it can be a custom JCE security provider -->
                     <exclude>org.bouncycastle:*</exclude>
+                    <!-- Leave snappy-java unshaded: it includes native methods, which cannot be relocated. -->
+                    <exclude>org.xerial.snappy:*</exclude>
                   </excludes>
                 </banTransitiveDependencies>
                 <banDuplicateClasses>
diff --git a/hadoop-client-modules/hadoop-client-check-invariants/src/test/resources/ensure-jars-have-correct-contents.sh b/hadoop-client-modules/hadoop-client-check-invariants/src/test/resources/ensure-jars-have-correct-contents.sh
index 7242ade..2e92740 100644
--- a/hadoop-client-modules/hadoop-client-check-invariants/src/test/resources/ensure-jars-have-correct-contents.sh
+++ b/hadoop-client-modules/hadoop-client-check-invariants/src/test/resources/ensure-jars-have-correct-contents.sh
@@ -67,6 +67,8 @@ allowed_expr+="|^krb5_udp-template.conf$"
 # Jetty uses this style sheet for directory listings. TODO ensure our
 # internal use of jetty disallows directory listings and remove this.
 allowed_expr+="|^jetty-dir.css$"
+# snappy-java is a native library. We cannot relocate it under org/apache/hadoop.
+allowed_expr+="|^org/xerial/"
 
 allowed_expr+=")"
 declare -i bad_artifacts=0
diff --git a/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml b/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml
index bec5e6f..635250e 100644
--- a/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml
+++ b/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml
@@ -98,6 +98,8 @@
                     <exclude>com.google.code.findbugs:jsr305</exclude>
                     <!-- Leave bouncycastle unshaded because it's signed with a special Oracle certificate so it can be a custom JCE security provider -->
                     <exclude>org.bouncycastle:*</exclude>
+                    <!-- Leave snappy-java unshaded: it includes native methods, which cannot be relocated. -->
+                    <exclude>org.xerial.snappy:*</exclude>
                   </excludes>
                 </banTransitiveDependencies>
                 <banDuplicateClasses>
diff --git a/hadoop-client-modules/hadoop-client-integration-tests/pom.xml b/hadoop-client-modules/hadoop-client-integration-tests/pom.xml
index 978918e..960421b 100644
--- a/hadoop-client-modules/hadoop-client-integration-tests/pom.xml
+++ b/hadoop-client-modules/hadoop-client-integration-tests/pom.xml
@@ -52,6 +52,11 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.lz4</groupId>
+      <artifactId>lz4-java</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <profiles>
     <profile>
diff --git a/hadoop-client-modules/hadoop-client-integration-tests/src/test/java/org/apache/hadoop/example/ITUseHadoopCodecs.java b/hadoop-client-modules/hadoop-client-integration-tests/src/test/java/org/apache/hadoop/example/ITUseHadoopCodecs.java
new file mode 100644
index 0000000..fd0effa
--- /dev/null
+++ b/hadoop-client-modules/hadoop-client-integration-tests/src/test/java/org/apache/hadoop/example/ITUseHadoopCodecs.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.hadoop.example;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.*;
+import java.util.Arrays;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.RandomDatum;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionInputStream;
+import org.apache.hadoop.io.compress.CompressionOutputStream;
+import org.apache.hadoop.io.compress.zlib.ZlibFactory;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Ensure that we can perform codec operations given the API and runtime jars
+ * by performing some simple smoke tests.
+ */
+public class ITUseHadoopCodecs {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ITUseHadoopCodecs.class);
+
+  private Configuration haddopConf = new Configuration();
+  private int dataCount = 100;
+  private int dataSeed = new Random().nextInt();
+
+  @Test
+  public void testGzipCodec() throws IOException {
+    ZlibFactory.setNativeZlibLoaded(false);
+    assertFalse(ZlibFactory.isNativeZlibLoaded(haddopConf));
+    codecTest(haddopConf, dataSeed, 0, "org.apache.hadoop.io.compress.GzipCodec");
+    codecTest(haddopConf, dataSeed, dataCount, "org.apache.hadoop.io.compress.GzipCodec");
+  }
+
+  @Test
+  public void testSnappyCodec() throws IOException {
+    codecTest(haddopConf, dataSeed, 0, "org.apache.hadoop.io.compress.SnappyCodec");
+    codecTest(haddopConf, dataSeed, dataCount, "org.apache.hadoop.io.compress.SnappyCodec");
+  }
+
+  @Test
+  public void testLz4Codec() {
+    Arrays.asList(false, true).forEach(config -> {
+      haddopConf.setBoolean(
+          CommonConfigurationKeys.IO_COMPRESSION_CODEC_LZ4_USELZ4HC_KEY,
+          config);
+      try {
+        codecTest(haddopConf, dataSeed, 0, "org.apache.hadoop.io.compress.Lz4Codec");
+        codecTest(haddopConf, dataSeed, dataCount, "org.apache.hadoop.io.compress.Lz4Codec");
+      } catch (IOException e) {
+        throw new RuntimeException("failed when running codecTest", e);
+      }
+    });
+  }
+
+  private void codecTest(Configuration conf, int seed, int count, String codecClass)
+      throws IOException {
+
+    // Create the codec
+    CompressionCodec codec = null;
+    try {
+      codec = (CompressionCodec)
+              ReflectionUtils.newInstance(conf.getClassByName(codecClass), conf);
+    } catch (ClassNotFoundException cnfe) {
+      throw new IOException("Illegal codec!");
+    }
+    LOG.info("Created a Codec object of type: " + codecClass);
+
+    // Generate data
+    DataOutputBuffer data = new DataOutputBuffer();
+    RandomDatum.Generator generator = new RandomDatum.Generator(seed);
+    for(int i = 0; i < count; ++i) {
+      generator.next();
+      RandomDatum key = generator.getKey();
+      RandomDatum value = generator.getValue();
+
+      key.write(data);
+      value.write(data);
+    }
+    LOG.info("Generated " + count + " records");
+
+    // Compress data
+    DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
+    try (CompressionOutputStream deflateFilter =
+        codec.createOutputStream(compressedDataBuffer);
+        DataOutputStream deflateOut =
+            new DataOutputStream(new BufferedOutputStream(deflateFilter))) {
+      deflateOut.write(data.getData(), 0, data.getLength());
+      deflateOut.flush();
+      deflateFilter.finish();
+    }
+
+    // De-compress data
+    DataInputBuffer deCompressedDataBuffer = new DataInputBuffer();
+    deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0,
+            compressedDataBuffer.getLength());
+    DataInputBuffer originalData = new DataInputBuffer();
+    originalData.reset(data.getData(), 0, data.getLength());
+    try (CompressionInputStream inflateFilter =
+        codec.createInputStream(deCompressedDataBuffer);
+        DataInputStream originalIn =
+            new DataInputStream(new BufferedInputStream(originalData))) {
+
+      // Check
+      int expected;
+      do {
+        expected = originalIn.read();
+        assertEquals("Inflated stream read by byte does not match",
+                expected, inflateFilter.read());
+      } while (expected != -1);
+    }
+
+    LOG.info("SUCCESS! Completed checking " + count + " records");
+  }
+}
diff --git a/hadoop-client-modules/hadoop-client-minicluster/pom.xml b/hadoop-client-modules/hadoop-client-minicluster/pom.xml
index a35d832..c9ce6f2 100644
--- a/hadoop-client-modules/hadoop-client-minicluster/pom.xml
+++ b/hadoop-client-modules/hadoop-client-minicluster/pom.xml
@@ -40,6 +40,12 @@
       <artifactId>hadoop-client-api</artifactId>
       <scope>runtime</scope>
     </dependency>
+    <!-- This is a compile dependency of the api, but we don't want it to be a compile dependency here too. -->
+    <dependency>
+      <groupId>org.xerial.snappy</groupId>
+      <artifactId>snappy-java</artifactId>
+      <scope>runtime</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-client-runtime</artifactId>
@@ -682,6 +688,8 @@
                       <!-- We need a filter that matches just those things that are included in the above artiacts -->
                       <!-- Leave bouncycastle unshaded because it's signed with a special Oracle certificate so it can be a custom JCE security provider -->
                       <exclude>org.bouncycastle:*</exclude>
+                      <!-- Leave snappy-java unshaded: it includes native methods, which cannot be relocated. -->
+                      <exclude>org.xerial.snappy:*</exclude>
                     </excludes>
                   </artifactSet>
                   <filters>
@@ -883,6 +891,9 @@
                         <exclude>org/xml/sax/**/*</exclude>
                         <exclude>org/bouncycastle/*</exclude>
                         <exclude>org/bouncycastle/**/*</exclude>
+                        <!-- Exclude snappy-java -->
+                        <exclude>org/xerial/snappy/*</exclude>
+                        <exclude>org/xerial/snappy/**/*</exclude>
                       </excludes>
                     </relocation>
                     <relocation>
@@ -1001,6 +1012,9 @@
                         <!-- Exclude config keys for Hadoop that look like package names -->
                         <exclude>net/topology/*</exclude>
                         <exclude>net/topology/**/*</exclude>
+                        <!-- Exclude lz4-java -->
+                        <exclude>net/jpountz/*</exclude>
+                        <exclude>net/jpountz/**/*</exclude>
                       </excludes>
                     </relocation>
                     <!-- okio declares a top level package instead of nested -->
diff --git a/hadoop-client-modules/hadoop-client-runtime/pom.xml b/hadoop-client-modules/hadoop-client-runtime/pom.xml
index f1eb8a9..b6a71e5 100644
--- a/hadoop-client-modules/hadoop-client-runtime/pom.xml
+++ b/hadoop-client-modules/hadoop-client-runtime/pom.xml
@@ -60,6 +60,12 @@
       <artifactId>hadoop-client-api</artifactId>
       <scope>runtime</scope>
     </dependency>
+    <!-- This is a compile dependency of the api, but we don't want it to be a compile dependency here too. -->
+    <dependency>
+      <groupId>org.xerial.snappy</groupId>
+      <artifactId>snappy-java</artifactId>
+      <scope>runtime</scope>
+    </dependency>
     <!-- This comes from our parent pom. If we don't expressly change it here to get included,
          downstream will get warnings at compile time. -->
     <dependency>
@@ -155,6 +161,8 @@
                       <exclude>org.ow2.asm:*</exclude>
                       <!-- Leave bouncycastle unshaded because it's signed with a special Oracle certificate so it can be a custom JCE security provider -->
                       <exclude>org.bouncycastle:*</exclude>
+                      <!-- Leave snappy-java unshaded: it includes native methods, which cannot be relocated. -->
+                      <exclude>org.xerial.snappy:*</exclude>
                     </excludes>
                   </artifactSet>
                   <filters>
@@ -259,6 +267,9 @@
                         <exclude>org/xml/sax/**/*</exclude>
                         <exclude>org/bouncycastle/*</exclude>
                         <exclude>org/bouncycastle/**/*</exclude>
+                        <!-- Exclude snappy-java -->
+                        <exclude>org/xerial/snappy/*</exclude>
+                        <exclude>org/xerial/snappy/**/*</exclude>
                       </excludes>
                     </relocation>
                     <relocation>
@@ -349,6 +360,9 @@
                         <!-- Exclude config keys for Hadoop that look like package names -->
                         <exclude>net/topology/*</exclude>
                         <exclude>net/topology/**/*</exclude>
+                        <!-- Exclude lz4-java -->
+                        <exclude>net/jpountz/*</exclude>
+                        <exclude>net/jpountz/**/*</exclude>
                       </excludes>
                     </relocation>
                     <!-- okio declares a top level package instead of nested -->

---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org