You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/08/29 03:37:37 UTC

[impala] branch master updated: IMPALA-10073: Create shaded dependency for S3A and aws-java-sdk-bundle

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 5daff34  IMPALA-10073: Create shaded dependency for S3A and aws-java-sdk-bundle
5daff34 is described below

commit 5daff3472440dc6174f0f31a28bbdafee4f68716
Author: Sahil Takiar <ta...@gmail.com>
AuthorDate: Tue Aug 11 10:36:50 2020 -0700

    IMPALA-10073: Create shaded dependency for S3A and aws-java-sdk-bundle
    
    The aws-java-sdk-bundle is one of the largest dependencies in the Impala
    Docker images and continues to grow. The jar includes SDKs for
    every single AWS service.
    
    This patch removes most of the unnecessary SDKs from the
    aws-java-sdk-bundle, thus drastically decreasing the size of the
    dependency. The Maven shade plugin is used to do this, and the
    implementation is similar to what is currently done for the hive-exec
    jar.
    
    This patch takes a conservative approach to removing packages from the
    aws-java-sdk-bundle jar, and I ensured no direct dependencies of the S3
    SDK were removed. The idea is to only remove dependencies that S3A would
    never conceivably need. Given the huge number of AWS services, I only
    focused on removing the largest SDKs (the size of each SDK is estimated
    by the number of classes in the SDK).
    
    This decreases the size of the Docker images by about 100 MB.
    
    Testing:
    * Ran core tests against S3
    
    Change-Id: I0939f73be986f83cc1fd07921563b4d9201780f2
    Reviewed-on: http://gerrit.cloudera.org:8080/16342
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 CMakeLists.txt                               |   3 +-
 ext-data-source/CMakeLists.txt               |   3 +-
 fe/CMakeLists.txt                            |   4 +-
 fe/pom.xml                                   |  14 ++
 shaded-deps/.gitignore                       |   1 -
 shaded-deps/{ => hive-exec}/CMakeLists.txt   |   2 +-
 shaded-deps/{ => hive-exec}/pom.xml          |   4 +-
 shaded-deps/{ => s3a-aws-sdk}/CMakeLists.txt |   2 +-
 shaded-deps/s3a-aws-sdk/pom.xml              | 188 +++++++++++++++++++++++++++
 9 files changed, 212 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12f92e4..cc2c8aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -413,7 +413,8 @@ add_subdirectory(common/yarn-extras)
 add_subdirectory(common/protobuf)
 add_subdirectory(be)
 add_subdirectory(docker)
-add_subdirectory(shaded-deps)
+add_subdirectory(shaded-deps/hive-exec)
+add_subdirectory(shaded-deps/s3a-aws-sdk)
 add_subdirectory(fe)
 add_subdirectory(impala-parent)
 add_subdirectory(ext-data-source)
diff --git a/ext-data-source/CMakeLists.txt b/ext-data-source/CMakeLists.txt
index 46e437c..0f88dd2 100644
--- a/ext-data-source/CMakeLists.txt
+++ b/ext-data-source/CMakeLists.txt
@@ -17,6 +17,7 @@
 
 # The dependency on shaded-deps is only added to avoid parallel downloads
 # of dependencies. For more details see IMPALA-7051, which was a similar issue.
-add_custom_target(ext-data-source ALL DEPENDS gen-deps impala-parent shaded-deps
+add_custom_target(ext-data-source ALL DEPENDS gen-deps impala-parent
+    shaded-deps-hive-exec shaded-deps-s3a-aws-sdk
   COMMAND $ENV{IMPALA_HOME}/bin/mvn-quiet.sh -B install -DskipTests
 )
diff --git a/fe/CMakeLists.txt b/fe/CMakeLists.txt
index 06ea20c..51f6760 100644
--- a/fe/CMakeLists.txt
+++ b/fe/CMakeLists.txt
@@ -16,7 +16,7 @@
 # under the License.
 
 add_custom_target(fe ALL DEPENDS
-  shaded-deps thrift-deps fb-deps yarn-extras function-registry ext-data-source
-      query-event-hook-api impala-parent
+  shaded-deps-hive-exec shaded-deps-s3a-aws-sdk thrift-deps fb-deps yarn-extras
+      function-registry ext-data-source query-event-hook-api impala-parent
   COMMAND ${CMAKE_SOURCE_DIR}/bin/mvn-quiet.sh -B install -DskipTests
 )
diff --git a/fe/pom.xml b/fe/pom.xml
index 68fed3a..061a739 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -130,6 +130,20 @@ under the License.
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-aws</artifactId>
       <version>${hadoop.version}</version>
+      <!-- Exclude the aws-java-sdk-bundle dependency because the Impala minimal
+           version of this dependency is used instead. -->
+      <exclusions>
+        <exclusion>
+          <groupId>com.amazonaws</groupId>
+          <artifactId>aws-java-sdk-bundle</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.impala</groupId>
+      <artifactId>impala-minimal-s3a-aws-sdk</artifactId>
+      <version>${project.version}</version>
     </dependency>
 
     <dependency>
diff --git a/shaded-deps/.gitignore b/shaded-deps/.gitignore
deleted file mode 100644
index 916e17c..0000000
--- a/shaded-deps/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dependency-reduced-pom.xml
diff --git a/shaded-deps/CMakeLists.txt b/shaded-deps/hive-exec/CMakeLists.txt
similarity index 92%
copy from shaded-deps/CMakeLists.txt
copy to shaded-deps/hive-exec/CMakeLists.txt
index 73d353c..7d8b6b1 100644
--- a/shaded-deps/CMakeLists.txt
+++ b/shaded-deps/hive-exec/CMakeLists.txt
@@ -15,6 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-add_custom_target(shaded-deps ALL DEPENDS impala-parent
+add_custom_target(shaded-deps-hive-exec ALL DEPENDS impala-parent
   COMMAND $ENV{IMPALA_HOME}/bin/mvn-quiet.sh -B install -DskipTests
 )
diff --git a/shaded-deps/pom.xml b/shaded-deps/hive-exec/pom.xml
similarity index 98%
rename from shaded-deps/pom.xml
rename to shaded-deps/hive-exec/pom.xml
index ff6fa25..eadc397 100644
--- a/shaded-deps/pom.xml
+++ b/shaded-deps/hive-exec/pom.xml
@@ -28,7 +28,7 @@ the same dependencies
     <groupId>org.apache.impala</groupId>
     <artifactId>impala-parent</artifactId>
     <version>0.1-SNAPSHOT</version>
-    <relativePath>../impala-parent/pom.xml</relativePath>
+    <relativePath>../../impala-parent/pom.xml</relativePath>
   </parent>
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.apache.impala</groupId>
@@ -76,7 +76,7 @@ the same dependencies
                 <include>org/apache/hadoop/hive/conf/**/*</include>
                 <include>org/apache/hadoop/hive/common/FileUtils*</include>
                 <include>org/apache/hive/common/util/TxnIdUtils*</include>
-                <!-- Needed to support describe formatted command compat with Hive --> 
+                <!-- Needed to support describe formatted command compat with Hive -->
                 <include>org/apache/hadoop/hive/ql/metadata/**/*</include>
                 <include>org/apache/hadoop/hive/ql/parse/SemanticException.class</include>
                 <!-- Needed to support Hive udfs -->
diff --git a/shaded-deps/CMakeLists.txt b/shaded-deps/s3a-aws-sdk/CMakeLists.txt
similarity index 92%
rename from shaded-deps/CMakeLists.txt
rename to shaded-deps/s3a-aws-sdk/CMakeLists.txt
index 73d353c..956f5eb 100644
--- a/shaded-deps/CMakeLists.txt
+++ b/shaded-deps/s3a-aws-sdk/CMakeLists.txt
@@ -15,6 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-add_custom_target(shaded-deps ALL DEPENDS impala-parent
+add_custom_target(shaded-deps-s3a-aws-sdk ALL DEPENDS impala-parent
   COMMAND $ENV{IMPALA_HOME}/bin/mvn-quiet.sh -B install -DskipTests
 )
diff --git a/shaded-deps/s3a-aws-sdk/pom.xml b/shaded-deps/s3a-aws-sdk/pom.xml
new file mode 100644
index 0000000..392ea10
--- /dev/null
+++ b/shaded-deps/s3a-aws-sdk/pom.xml
@@ -0,0 +1,188 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                      http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+  <!-- This pom creates a minimal version of the aws-java-sdk-bundle jar. The S3A
+dependency is used here to ensure the correct version of the aws-java-sdk-bundle jar is
+used. Only AWS service SDKs are excluded; all third-party jars are still included, even
+though some of them might not be necessary. The exclusions are sorted alphabetically.
+  -->
+  <parent>
+    <groupId>org.apache.impala</groupId>
+    <artifactId>impala-parent</artifactId>
+    <version>0.1-SNAPSHOT</version>
+    <relativePath>../../impala-parent/pom.xml</relativePath>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>org.apache.impala</groupId>
+  <artifactId>impala-minimal-s3a-aws-sdk</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-aws</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+  </dependencies>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>3.2.1</version>
+        <configuration>
+          <artifactSet>
+            <includes>
+              <include>com.amazonaws:aws-java-sdk-bundle</include>
+            </includes>
+          </artifactSet>
+          <filters>
+            <filter>
+              <artifact>com.amazonaws:aws-java-sdk-bundle</artifact>
+              <excludes>
+                <exclude>com/amazonaws/services/alexaforbusiness/**</exclude>
+                <exclude>com/amazonaws/services/amplify/**</exclude>
+                <exclude>com/amazonaws/services/apigateway/**</exclude>
+                <exclude>com/amazonaws/services/apigatewayv2/**</exclude>
+                <exclude>com/amazonaws/services/applicationautoscaling/**</exclude>
+                <exclude>com/amazonaws/services/applicationdiscovery/**</exclude>
+                <exclude>com/amazonaws/services/appmesh/**</exclude>
+                <exclude>com/amazonaws/services/appstream/**</exclude>
+                <exclude>com/amazonaws/services/appsync/**</exclude>
+                <exclude>com/amazonaws/services/athena/**</exclude>
+                <exclude>com/amazonaws/services/autoscaling/**</exclude>
+                <exclude>com/amazonaws/services/autoscalingplans/**</exclude>
+                <exclude>com/amazonaws/services/batch/**</exclude>
+                <exclude>com/amazonaws/services/chime/**</exclude>
+                <exclude>com/amazonaws/services/cloud9/**</exclude>
+                <exclude>com/amazonaws/services/clouddirectory/**</exclude>
+                <exclude>com/amazonaws/services/cloudformation/**</exclude>
+                <exclude>com/amazonaws/services/cloudfront/**</exclude>
+                <exclude>com/amazonaws/services/cloudsearchv2/**</exclude>
+                <exclude>com/amazonaws/services/cloudwatch/**</exclude>
+                <exclude>com/amazonaws/services/cloudwatchevents/**</exclude>
+                <exclude>com/amazonaws/services/codebuild/**</exclude>
+                <exclude>com/amazonaws/services/codecommit/**</exclude>
+                <exclude>com/amazonaws/services/codedeploy/**</exclude>
+                <exclude>com/amazonaws/services/codepipeline/**</exclude>
+                <exclude>com/amazonaws/services/codestar/**</exclude>
+                <exclude>com/amazonaws/services/cognitoidp/**</exclude>
+                <exclude>com/amazonaws/services/cognitosync/**</exclude>
+                <exclude>com/amazonaws/services/comprehend/**</exclude>
+                <exclude>com/amazonaws/services/connect/**</exclude>
+                <exclude>com/amazonaws/services/databasemigrationservice/**</exclude>
+                <exclude>com/amazonaws/services/devicefarm/**</exclude>
+                <exclude>com/amazonaws/services/directory/**</exclude>
+                <exclude>com/amazonaws/services/docdb/**</exclude>
+                <exclude>com/amazonaws/services/ec2/**</exclude>
+                <exclude>com/amazonaws/services/ecr/**</exclude>
+                <exclude>com/amazonaws/services/ecs/**</exclude>
+                <exclude>com/amazonaws/services/eks/**</exclude>
+                <exclude>com/amazonaws/services/elasticache/**</exclude>
+                <exclude>com/amazonaws/services/elasticbeanstalk/**</exclude>
+                <exclude>com/amazonaws/services/elasticfilesystem/**</exclude>
+                <exclude>com/amazonaws/services/elasticloadbalancing/**</exclude>
+                <exclude>com/amazonaws/services/elasticloadbalancingv2/**</exclude>
+                <exclude>com/amazonaws/services/elasticmapreduce/**</exclude>
+                <exclude>com/amazonaws/services/elasticsearch/**</exclude>
+                <exclude>com/amazonaws/services/elastictranscoder/**</exclude>
+                <exclude>com/amazonaws/services/fms/**</exclude>
+                <exclude>com/amazonaws/services/globalaccelerator/**</exclude>
+                <exclude>com/amazonaws/services/glue/**</exclude>
+                <exclude>com/amazonaws/services/greengrass/**</exclude>
+                <exclude>com/amazonaws/services/groundstation/**</exclude>
+                <exclude>com/amazonaws/services/guardduty/**</exclude>
+                <exclude>com/amazonaws/services/inspector/**</exclude>
+                <exclude>com/amazonaws/services/iot/**</exclude>
+                <exclude>com/amazonaws/services/iot1clickdevices/**</exclude>
+                <exclude>com/amazonaws/services/iot1clickprojects/**</exclude>
+                <exclude>com/amazonaws/services/iotanalytics/**</exclude>
+                <exclude>com/amazonaws/services/iotevents/**</exclude>
+                <exclude>com/amazonaws/services/iotthingsgraph/**</exclude>
+                <exclude>com/amazonaws/services/kafka/**</exclude>
+                <exclude>com/amazonaws/services/kinesis/**</exclude>
+                <exclude>com/amazonaws/services/kinesisanalytics/**</exclude>
+                <exclude>com/amazonaws/services/kinesisanalyticsv2/**</exclude>
+                <exclude>com/amazonaws/services/kinesisfirehose/**</exclude>
+                <exclude>com/amazonaws/services/kinesisvideo/**</exclude>
+                <exclude>com/amazonaws/services/lambda/**</exclude>
+                <exclude>com/amazonaws/services/lexmodelbuilding/**</exclude>
+                <exclude>com/amazonaws/services/licensemanager/**</exclude>
+                <exclude>com/amazonaws/services/lightsail/**</exclude>
+                <exclude>com/amazonaws/services/machinelearning/**</exclude>
+                <exclude>com/amazonaws/services/managedblockchain/**</exclude>
+                <exclude>com/amazonaws/services/mediaconnect/**</exclude>
+                <exclude>com/amazonaws/services/mediaconvert/**</exclude>
+                <exclude>com/amazonaws/services/medialive/**</exclude>
+                <exclude>com/amazonaws/services/mediapackage/**</exclude>
+                <exclude>com/amazonaws/services/mediapackagevod/**</exclude>
+                <exclude>com/amazonaws/services/mediastore/**</exclude>
+                <exclude>com/amazonaws/services/migrationhub/**</exclude>
+                <exclude>com/amazonaws/services/mq/**</exclude>
+                <exclude>com/amazonaws/services/mturk/**</exclude>
+                <exclude>com/amazonaws/services/neptune/**</exclude>
+                <exclude>com/amazonaws/services/opsworks/**</exclude>
+                <exclude>com/amazonaws/services/pinpoint/**</exclude>
+                <exclude>com/amazonaws/services/pinpointemail/**</exclude>
+                <exclude>com/amazonaws/services/pinpointsmsvoice/**</exclude>
+                <exclude>com/amazonaws/services/polly/**</exclude>
+                <exclude>com/amazonaws/services/quicksight/**</exclude>
+                <exclude>com/amazonaws/services/rds/**</exclude>
+                <exclude>com/amazonaws/services/redshift/**</exclude>
+                <exclude>com/amazonaws/services/rekognition/**</exclude>
+                <exclude>com/amazonaws/services/robomaker/**</exclude>
+                <exclude>com/amazonaws/services/route53/**</exclude>
+                <exclude>com/amazonaws/services/route53domains/**</exclude>
+                <exclude>com/amazonaws/services/route53resolver/**</exclude>
+                <exclude>com/amazonaws/services/sagemaker/**</exclude>
+                <exclude>com/amazonaws/services/securityhub/**</exclude>
+                <exclude>com/amazonaws/services/serverlessapplicationrepository/**</exclude>
+                <exclude>com/amazonaws/services/servermigration/**</exclude>
+                <exclude>com/amazonaws/services/servicecatalog/**</exclude>
+                <exclude>com/amazonaws/services/servicediscovery/**</exclude>
+                <exclude>com/amazonaws/services/shield/**</exclude>
+                <exclude>com/amazonaws/services/simpledb/**</exclude>
+                <exclude>com/amazonaws/services/simpleemail/**</exclude>
+                <exclude>com/amazonaws/services/simplesystemsmanagement/**</exclude>
+                <exclude>com/amazonaws/services/simpleworkflow/**</exclude>
+                <exclude>com/amazonaws/services/sqs/**</exclude>
+                <exclude>com/amazonaws/services/stepfunctions/**</exclude>
+                <exclude>com/amazonaws/services/support/**</exclude>
+                <exclude>com/amazonaws/services/textract/**</exclude>
+                <exclude>com/amazonaws/services/transcribe/**</exclude>
+                <exclude>com/amazonaws/services/waf/**</exclude>
+                <exclude>com/amazonaws/services/workdocs/**</exclude>
+                <exclude>com/amazonaws/services/worklink/**</exclude>
+                <exclude>com/amazonaws/services/workmail/**</exclude>
+                <exclude>com/amazonaws/services/workspaces/**</exclude>
+              </excludes>
+            </filter>
+          </filters>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>