You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by lc...@apache.org on 2018/11/28 02:19:23 UTC

[beam] branch master updated: [BEAM-6102] Fix several packages that were being bundled without relocation within the Dataflow worker. (#7145)

This is an automated email from the ASF dual-hosted git repository.

lcwik pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new d922f70  [BEAM-6102] Fix several packages that were being bundled without relocation within the Dataflow worker. (#7145)
d922f70 is described below

commit d922f70b123a2d5f0926f8989d829ecd75360f47
Author: Lukasz Cwik <lc...@google.com>
AuthorDate: Tue Nov 27 18:19:16 2018 -0800

    [BEAM-6102] Fix several packages that were being bundled without relocation within the Dataflow worker. (#7145)
    
    This change:
    * Parameterizes the shadow jar validation to allow for excludes to be on a per project basis instead.
    * Cuts down the Dataflow worker jar by ~20mbs of non-relocated classes that are provided by the SDK during job submission.
    * Fixes building the job management and fn execution model jars which were including the model pipeline classes.
    * Adds several comments on how one is meant to shade the legacy worker jar correctly and enforces validation so classes don't leak out unexpectedly.
    * Adds two follow-up TODOs to relocate DataflowRunnerHarness which requires fixing boot.go code and also relocating conscrypt.
---
 .../org/apache/beam/gradle/BeamModulePlugin.groovy |  68 +++++++-----
 model/fn-execution/build.gradle                    |   4 +-
 model/job-management/build.gradle                  |   4 +-
 model/pipeline/build.gradle                        |   2 +-
 .../worker/legacy-worker/build.gradle              | 118 +++++++++++----------
 .../worker/windmill/build.gradle                   |   2 +-
 6 files changed, 109 insertions(+), 89 deletions(-)

diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
index ba17983..38c52ad 100644
--- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
+++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy
@@ -102,6 +102,14 @@ class BeamModulePlugin implements Plugin<Project> {
     boolean validateShadowJar = true
 
     /**
+     * The set of excludes that should be used during validation of the shadow jar. Projects should override
+     * the default with the most specific set of excludes that is valid for the contents of its shaded jar.
+     *
+     * By default we exclude any class underneath the org.apache.beam namespace.
+     */
+    List<String> shadowJarValidationExcludes = ["org/apache/beam/**"]
+
+    /**
      * The shadowJar / shadowTestJar tasks execute the following closure to configure themselves.
      * Users can compose their closure with the default closure via:
      * DEFAULT_SHADOW_CLOSURE << {
@@ -117,6 +125,17 @@ class BeamModulePlugin implements Plugin<Project> {
     boolean publish = true
   }
 
+  /** A class defining the set of configurable properties accepted by applyPortabilityNature. */
+  class PortabilityNatureConfiguration {
+    /**
+     * The set of excludes that should be used during validation of the shadow jar. Projects should override
+     * the default with the most specific set of excludes that is valid for the contents of its shaded jar.
+     *
+     * By default we exclude any class underneath the org.apache.beam namespace.
+     */
+    List<String> shadowJarValidationExcludes = ["org/apache/beam/**"]
+  }
+
   // A class defining the set of configurable properties for createJavaExamplesArchetypeValidationTask
   class JavaExamplesArchetypeValidationConfiguration {
     // Type [Quickstart, MobileGaming] for the postrelease validation is required.
@@ -759,7 +778,7 @@ class BeamModulePlugin implements Plugin<Project> {
       }
 
       if (configuration.validateShadowJar) {
-        project.task('validateShadedJarDoesntLeakNonOrgApacheBeamClasses', dependsOn: 'shadowJar') {
+        project.task('validateShadedJarDoesntLeakNonProjectClasses', dependsOn: 'shadowJar') {
           ext.outFile = project.file("${project.reportsDir}/${name}.out")
           inputs.files project.configurations.shadow.artifacts.files
           outputs.files outFile
@@ -767,19 +786,21 @@ class BeamModulePlugin implements Plugin<Project> {
             project.configurations.shadow.artifacts.files.each {
               FileTree exposedClasses = project.zipTree(it).matching {
                 include "**/*.class"
-                exclude "org/apache/beam/**"
                 // BEAM-5919: Exclude paths for Java 9 multi-release jars.
                 exclude "META-INF/versions/*/module-info.class"
-                exclude "META-INF/versions/*/org/apache/beam/**"
+                configuration.shadowJarValidationExcludes.each {
+                  exclude "$it"
+                  exclude "META-INF/versions/*/$it"
+                }
               }
               outFile.text = exposedClasses.files
               if (exposedClasses.files) {
-                throw new GradleException("$it exposed classes outside of org.apache.beam namespace: ${exposedClasses.files}")
+                throw new GradleException("$it exposed classes outside of ${configuration.shadowJarValidationExcludes}: ${exposedClasses.files}")
               }
             }
           }
         }
-        project.tasks.check.dependsOn project.tasks.validateShadedJarDoesntLeakNonOrgApacheBeamClasses
+        project.tasks.check.dependsOn project.tasks.validateShadedJarDoesntLeakNonProjectClasses
       }
 
       if ((isRelease(project) || project.hasProperty('publishing')) &&
@@ -1358,14 +1379,19 @@ artifactId=${project.name}
 
     project.ext.applyPortabilityNature = {
       println "applyPortabilityNature with " + (it ? "$it" : "default configuration") + " for project $project.name"
-      project.ext.applyJavaNature(enableFindbugs: false, shadowClosure: GrpcVendoring.shadowClosure() << {
-        // We perform all the code relocations but don't include
-        // any of the actual dependencies since they will be supplied
-        // by org.apache.beam:beam-vendor-grpc-v1_13_1:0.1
-        dependencies {
-          exclude(dependency(".*:.*"))
-        }
-      })
+      PortabilityNatureConfiguration configuration = it ? it as PortabilityNatureConfiguration : new PortabilityNatureConfiguration()
+
+      project.ext.applyJavaNature(
+              enableFindbugs: false,
+              shadowJarValidationExcludes: it.shadowJarValidationExcludes,
+              shadowClosure: GrpcVendoring.shadowClosure() << {
+                // We perform all the code relocations but don't include
+                // any of the actual dependencies since they will be supplied
+                // by org.apache.beam:beam-vendor-grpc-v1_13_1:0.1
+                dependencies {
+                  include(dependency { return false })
+                }
+              })
 
       // Don't force modules here because we don't want to take the shared declarations in build_rules.gradle
       // because we would like to have the freedom to choose which versions of dependencies we
@@ -1399,22 +1425,6 @@ artifactId=${project.name}
       }
 
       project.dependencies GrpcVendoring.dependenciesClosure() << { shadow 'org.apache.beam:beam-vendor-grpc-1_13_1:0.1' }
-
-      project.task('validateShadedJarDoesntExportVendoredDependencies', dependsOn: 'shadowJar') {
-        ext.outFile = project.file("${project.reportsDir}/${name}.out")
-        inputs.files project.configurations.shadow.artifacts.files
-        outputs.files outFile
-        doLast {
-          project.configurations.shadow.artifacts.files.each {
-            FileTree exportedClasses = project.zipTree(it).matching { include "org/apache/beam/vendor/**" }
-            outFile.text = exportedClasses.files
-            if (exportedClasses.files) {
-              throw new GradleException("$it exported classes inside of org.apache.beam.vendor namespace: ${exportedClasses.files}")
-            }
-          }
-        }
-      }
-      project.tasks.check.dependsOn project.tasks.validateShadedJarDoesntExportVendoredDependencies
     }
 
     /** ***********************************************************************************************/
diff --git a/model/fn-execution/build.gradle b/model/fn-execution/build.gradle
index 3194fcf..faf674c 100644
--- a/model/fn-execution/build.gradle
+++ b/model/fn-execution/build.gradle
@@ -17,7 +17,7 @@
  */
 
 apply plugin: org.apache.beam.gradle.BeamModulePlugin
-applyPortabilityNature()
+applyPortabilityNature(shadowJarValidationExcludes: ["org/apache/beam/model/fnexecution/v1/**"])
 
 description = "Apache Beam :: Model :: Fn Execution"
 ext.summary = "Portable definitions for execution user-defined functions."
@@ -25,6 +25,6 @@ ext.summary = "Portable definitions for execution user-defined functions."
 dependencies {
   // We purposely depend on the unshaded classes for protobuf compilation and
   // export the shaded variant as the actual runtime dependency.
-  protobuf project(path: ":beam-model-pipeline", configuration: "unshaded")
+  compile project(path: ":beam-model-pipeline", configuration: "unshaded")
   runtime project(path: ":beam-model-pipeline", configuration: "shadow")
 }
diff --git a/model/job-management/build.gradle b/model/job-management/build.gradle
index e453e33..4c50782 100644
--- a/model/job-management/build.gradle
+++ b/model/job-management/build.gradle
@@ -17,7 +17,7 @@
  */
 
 apply plugin: org.apache.beam.gradle.BeamModulePlugin
-applyPortabilityNature()
+applyPortabilityNature(shadowJarValidationExcludes: ["org/apache/beam/model/jobmanagement/v1/**"])
 
 description = "Apache Beam :: Model :: Job Management"
 ext.summary = "Portable definitions for submitting pipelines."
@@ -25,6 +25,6 @@ ext.summary = "Portable definitions for submitting pipelines."
 dependencies {
   // We purposely depend on the unshaded classes for protobuf compilation and
   // export the shaded variant as the actual runtime dependency.
-  protobuf project(path: ":beam-model-pipeline", configuration: "unshaded")
+  compile project(path: ":beam-model-pipeline", configuration: "unshaded")
   runtime project(path: ":beam-model-pipeline", configuration: "shadow")
 }
diff --git a/model/pipeline/build.gradle b/model/pipeline/build.gradle
index 016a766..2a72cd6 100644
--- a/model/pipeline/build.gradle
+++ b/model/pipeline/build.gradle
@@ -17,7 +17,7 @@
  */
 
 apply plugin: org.apache.beam.gradle.BeamModulePlugin
-applyPortabilityNature()
+applyPortabilityNature(shadowJarValidationExcludes: ["org/apache/beam/model/pipeline/v1/**"])
 
 description = "Apache Beam :: Model :: Pipeline"
 ext.summary = "Portable definitions for building pipelines"
diff --git a/runners/google-cloud-dataflow-java/worker/legacy-worker/build.gradle b/runners/google-cloud-dataflow-java/worker/legacy-worker/build.gradle
index 5718d11..7551758 100644
--- a/runners/google-cloud-dataflow-java/worker/legacy-worker/build.gradle
+++ b/runners/google-cloud-dataflow-java/worker/legacy-worker/build.gradle
@@ -78,20 +78,72 @@ def excluded_dependencies = [
         library.java.junit,                          // Test only
 ]
 
-applyJavaNature(publish: false, enableFindbugs: false /* TODO(BEAM-5658): enable findbugs */, validateShadowJar: false, shadowClosure: DEFAULT_SHADOW_CLOSURE << {
+applyJavaNature(
+        publish: false,
+        enableFindbugs: false /* TODO(BEAM-5658): enable findbugs */,
+        shadowJarValidationExcludes: [
+            "org/apache/beam/runners/dataflow/worker/**",
+            "org/apache/beam/repackaged/beam_runners_google_cloud_dataflow_java_legacy_worker/**",
+            // TODO(BEAM-6137): Move DataflowRunnerHarness class under org.apache.beam.runners.dataflow.worker namespace
+            "com/google/cloud/dataflow/worker/DataflowRunnerHarness.class",
+            // TODO(BEAM-6136): Enable relocation for conscrypt
+            "org/conscrypt/**",
+        ],
+        shadowClosure: DEFAULT_SHADOW_CLOSURE << {
+    // Each included dependency must also include all of its necessary transitive dependencies
+    // or have them provided by the users pipeline during job submission. Typically a users
+    // pipeline includes :beam-runners-google-cloud-dataflow-java and its transitive dependencies
+    // so those dependencies don't need to be shaded (bundled and relocated) away. All other
+    // dependencies needed to run the worker must be shaded (bundled and relocated) to prevent
+    // ClassNotFound and/or MethodNotFound errors during pipeline execution.
+    //
+    // Each included dependency should have a matching relocation rule below that ensures
+    // that the shaded jar is correctly built.
+
+    dependencies {
+        include(project(path: ":beam-model-fn-execution", configuration: "shadow"))
+    }
+    relocate("org.apache.beam.model.fnexecution.v1", getWorkerRelocatedPath("org.apache.beam.model.fnexecution.v1"))
+
+    dependencies {
+        include(project(path: ":beam-runners-core-construction-java", configuration: "shadow"))
+        include(project(path: ":beam-runners-core-java", configuration: "shadow"))
+    }
+    relocate("org.apache.beam.runners.core", getWorkerRelocatedPath("org.apache.beam.runners.core"))
+    relocate("org.apache.beam.repackaged.beam_runners_core_construction_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_construction_java"))
+    relocate("org.apache.beam.repackaged.beam_runners_core_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_java"))
+
+    dependencies {
+        include(project(path: ":beam-runners-java-fn-execution", configuration: "shadow"))
+    }
+    relocate("org.apache.beam.runners.fnexecution", getWorkerRelocatedPath("org.apache.beam.runners.fnexecution"))
+    relocate("org.apache.beam.repackaged.beam_runners_java_fn_execution", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_java_fn_execution"))
+
+    dependencies {
+        include(project(path: ":beam-sdks-java-fn-execution", configuration: "shadow"))
+    }
+    relocate("org.apache.beam.sdk.fn", getWorkerRelocatedPath("org.apache.beam.sdk.fn"))
+    relocate("org.apache.beam.repackaged.beam_sdks_java_fn_execution", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_sdks_java_fn_execution"))
+
+    // TODO(BEAM-6136): Enable relocation for conscrypt
+    dependencies {
+        include(dependency("org.conscrypt:conscrypt-openjdk:1.1.3:linux-x86_64"))
+    }
+
+    dependencies {
+        // We have to include jetty-server/jetty-servlet and all of its transitive dependencies
+        // which includes several org.eclipse.jetty artifacts + servlet-api
+        include(dependency("org.eclipse.jetty:.*:9.2.10.v20150310"))
+        include(dependency("javax.servlet:javax.servlet-api:3.1.0"))
+    }
+    relocate("org.eclipse.jetty", getWorkerRelocatedPath("org.eclipse.jetty"))
+    relocate("javax.servlet", getWorkerRelocatedPath("javax.servlet"))
+
+    // We don't relocate windmill since it is already underneath the org.apache.beam.runners.dataflow.worker namespace and never
+    // expect a user pipeline to include it. There is also a JNI component that windmill server relies on which makes
+    // arbitrary relocation more difficult.
     dependencies {
         include(project(path: ":beam-runners-google-cloud-dataflow-java-windmill", configuration: "shadow"))
-        include(dependency(".*:.*"))
-
-        sdk_provided_dependencies.each {
-            exclude(dependency(it))
-        }
-        sdk_provided_project_dependencies.each {
-            exclude(project(path: it, configuration: "shadow"))
-        }
-        excluded_dependencies.each {
-            exclude(dependency(it))
-        }
     }
 
     // Include original source files extracted under
@@ -100,47 +152,6 @@ applyJavaNature(publish: false, enableFindbugs: false /* TODO(BEAM-5658): enable
 
     exclude "META-INF/LICENSE.txt"
     exclude "about.html"
-
-    relocate("com.", getWorkerRelocatedPath("com.")) {
-        exclude "com.fasterxml.jackson.**"
-        exclude "com.google.api.client.**"
-        exclude "com.google.api.services.bigquery.**"
-        exclude "com.google.api.services.clouddebugger.**"
-        exclude "com.google.api.services.dataflow.**"
-        exclude "com.google.api.services.datastore.**"
-        exclude "com.google.api.services.pubsub.**"
-        exclude "com.google.api.services.storage.**"
-        exclude "com.google.auth.**"
-        exclude "com.google.cloud.dataflow.**"
-        exclude "com.sun.management*"
-        exclude "com.sun.management.**"
-    }
-    relocate("javax.servlet", getWorkerRelocatedPath("javax.servlet"))
-    relocate("io.", getWorkerRelocatedPath("io."))
-    relocate("okio.", getWorkerRelocatedPath("okio."))
-    relocate("org.", getWorkerRelocatedPath("org.")) {
-        // Exclude netty-tcnative from shading since gRPC relies on Netty to be able
-        // to load org.apache.tomcat.jni.SSL to provide an SSL context.
-        exclude "org.apache.avro.**"
-        exclude "org.apache.beam.**"
-        exclude "org.apache.tomcat.jni.**"
-        exclude "org.conscrypt.**"
-        exclude "org.eclipse.jetty.alpn.**"
-        exclude "org.eclipse.jetty.npn.**"
-        exclude "org.hamcrest.**"
-        exclude "org.joda.time.**"
-        exclude "org.junit.**"
-        exclude "org.slf4j.**"
-        exclude "org.w3c.dom.**"
-    }
-    relocate("org.apache.beam.runners.core.construction.",
-            getWorkerRelocatedPath("org.")) {
-              // Exclude org.apache.beam.runners.core.construction.metrics
-              // because it is referenced by
-              // org.apache.beam.runners.core.metrics which is not relocated.
-              exclude "org.apache.beam.runners.core.construction.metrics.**"
-            }
-
 })
 
 /******************************************************************************/
@@ -184,7 +195,6 @@ dependencies {
     compile project(path: ":beam-sdks-java-fn-execution", configuration: "shadow")
     compile project(path: ":beam-runners-google-cloud-dataflow-java-windmill", configuration: "shadow")
     compile library.java.guava
-    compile "javax.servlet:javax.servlet-api:3.1.0"
     compile "org.conscrypt:conscrypt-openjdk:1.1.3:linux-x86_64"
     compile "org.eclipse.jetty:jetty-server:9.2.10.v20150310"
     compile "org.eclipse.jetty:jetty-servlet:9.2.10.v20150310"
diff --git a/runners/google-cloud-dataflow-java/worker/windmill/build.gradle b/runners/google-cloud-dataflow-java/worker/windmill/build.gradle
index 7939921..cd982e5 100644
--- a/runners/google-cloud-dataflow-java/worker/windmill/build.gradle
+++ b/runners/google-cloud-dataflow-java/worker/windmill/build.gradle
@@ -17,7 +17,7 @@
  */
 
 apply plugin: org.apache.beam.gradle.BeamModulePlugin
-applyPortabilityNature()
+applyPortabilityNature(shadowJarValidationExcludes: ["org/apache/beam/runners/dataflow/worker/windmill/**"])
 
 description = "Apache Beam :: Runners :: Google Cloud Dataflow Java :: Windmill"
 ext.summary = "Windmill proto specifications"