Posted to common-commits@hadoop.apache.org by st...@apache.org on 2021/05/24 12:08:58 UTC

[hadoop] branch trunk updated: HADOOP-17705. S3A to add Config to set AWS region (#3020)

This is an automated email from the ASF dual-hosted git repository.

stevel pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 5f40003  HADOOP-17705. S3A to add Config to set AWS region (#3020)
5f40003 is described below

commit 5f400032b6b3d13a2ec4109877b91b8ac7f90b9d
Author: Mehakmeet Singh <me...@gmail.com>
AuthorDate: Mon May 24 17:38:45 2021 +0530

    HADOOP-17705. S3A to add Config to set AWS region (#3020)
    
    The option `fs.s3a.endpoint.region` can be used
    to explicitly set the AWS region of a bucket.
    
    This is needed when using AWS Private Link, as
    the region cannot be automatically determined.
    
    Contributed by Mehakmeet Singh
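
    A minimal usage sketch (not part of the commit itself): the new option is set on a
    Hadoop Configuration alongside the PrivateLink endpoint before the filesystem is
    created. The bucket name and the `vpce-<some_string>` hostname below are placeholders.

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class S3ARegionExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // PrivateLink endpoint: the region cannot be parsed from this hostname.
    conf.set("fs.s3a.endpoint",
        "bucket.vpce-<some_string>.s3.ca-central-1.vpce.amazonaws.com");
    // New option: state the bucket's region explicitly.
    conf.set("fs.s3a.endpoint.region", "ca-central-1");
    // Placeholder bucket; any S3A operation now signs requests for ca-central-1.
    try (FileSystem fs = FileSystem.newInstance(
        new URI("s3a://example-bucket/"), conf)) {
      fs.listStatus(new Path("s3a://example-bucket/"));
    }
  }
}
```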
---
 .../java/org/apache/hadoop/fs/s3a/Constants.java   |  6 ++
 .../hadoop/fs/s3a/DefaultS3ClientFactory.java      | 31 ++++----
 .../src/site/markdown/tools/hadoop-aws/index.md    |  8 ++
 .../tools/hadoop-aws/troubleshooting_s3a.md        | 26 +++++++
 .../hadoop/fs/s3a/ITestS3AEndpointRegion.java      | 91 ++++++++++++++++++++++
 .../hadoop/fs/s3a/impl/TestNetworkBinding.java     |  2 +-
 6 files changed, 150 insertions(+), 14 deletions(-)

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
index f6900cb..8dc6bba 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
@@ -1081,4 +1081,10 @@ public final class Constants {
    */
   public static final String XA_HEADER_PREFIX = "header.";
 
+  /**
+   * AWS S3 region for the bucket. When set, this bypasses parsing the
+   * region from the endpoint URL.
+   */
+  public static final String AWS_REGION = "fs.s3a.endpoint.region";
+
 }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
index ae50bd1..6e84497 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
@@ -42,6 +42,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.s3a.statistics.impl.AwsStatisticsCollector;
 
+import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION;
 import static org.apache.hadoop.fs.s3a.Constants.EXPERIMENTAL_AWS_INTERNAL_THROTTLING;
 import static org.apache.hadoop.fs.s3a.Constants.EXPERIMENTAL_AWS_INTERNAL_THROTTLING_DEFAULT;
 
@@ -132,7 +133,7 @@ public class DefaultS3ClientFactory extends Configured
     // endpoint set up is a PITA
     AwsClientBuilder.EndpointConfiguration epr
         = createEndpointConfiguration(parameters.getEndpoint(),
-        awsConf);
+        awsConf, getConf().getTrimmed(AWS_REGION));
     if (epr != null) {
       // an endpoint binding was constructed: use it.
       b.withEndpointConfiguration(epr);
@@ -197,12 +198,14 @@ public class DefaultS3ClientFactory extends Configured
    *
    * @param endpoint possibly null endpoint.
    * @param awsConf config to build the URI from.
+   * @param awsRegion the AWS S3 region, when the corresponding config option is set.
    * @return a configuration for the S3 client builder.
    */
   @VisibleForTesting
   public static AwsClientBuilder.EndpointConfiguration
       createEndpointConfiguration(
-          final String endpoint, final ClientConfiguration awsConf) {
+      final String endpoint, final ClientConfiguration awsConf,
+      String awsRegion) {
     LOG.debug("Creating endpoint configuration for {}", endpoint);
     if (endpoint == null || endpoint.isEmpty()) {
       // the default endpoint...we should be using null at this point.
@@ -212,17 +215,19 @@ public class DefaultS3ClientFactory extends Configured
 
     final URI epr = RuntimeHttpUtils.toUri(endpoint, awsConf);
     LOG.debug("Endpoint URI = {}", epr);
-
-    String region;
-    if (!ServiceUtils.isS3USStandardEndpoint(endpoint)) {
-      LOG.debug("Endpoint {} is not the default; parsing", epr);
-      region = AwsHostNameUtils.parseRegion(
-          epr.getHost(),
-          S3_SERVICE_NAME);
-    } else {
-      // US-east, set region == null.
-      LOG.debug("Endpoint {} is the standard one; declare region as null", epr);
-      region = null;
+    String region = awsRegion;
+    if (StringUtils.isBlank(region)) {
+      if (!ServiceUtils.isS3USStandardEndpoint(endpoint)) {
+        LOG.debug("Endpoint {} is not the default; parsing", epr);
+        region = AwsHostNameUtils.parseRegion(
+            epr.getHost(),
+            S3_SERVICE_NAME);
+      } else {
+        // US-east, set region == null.
+        LOG.debug("Endpoint {} is the standard one; declare region as null",
+            epr);
+        region = null;
+      }
     }
     LOG.debug("Region for endpoint {}, URI {} is determined as {}",
         endpoint, epr, region);
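
A minimal sketch of the resolution order after this change, calling the
@VisibleForTesting factory method directly (for illustration only). It assumes the
hadoop-aws and AWS SDK jars are on the classpath; the PrivateLink hostname is a
placeholder.

```java
import com.amazonaws.ClientConfiguration;
import com.amazonaws.client.builder.AwsClientBuilder;

import org.apache.hadoop.fs.s3a.DefaultS3ClientFactory;

public class EndpointRegionSketch {
  public static void main(String[] args) {
    ClientConfiguration awsConf = new ClientConfiguration();

    // Explicit region supplied: the endpoint host is not parsed.
    AwsClientBuilder.EndpointConfiguration privateLink =
        DefaultS3ClientFactory.createEndpointConfiguration(
            "bucket.vpce-<some_string>.s3.ca-central-1.vpce.amazonaws.com",
            awsConf, "ca-central-1");
    System.out.println(privateLink.getSigningRegion()); // ca-central-1

    // No region supplied: the region is parsed from the endpoint host, as before.
    AwsClientBuilder.EndpointConfiguration frankfurt =
        DefaultS3ClientFactory.createEndpointConfiguration(
            "s3.eu-central-1.amazonaws.com", awsConf, null);
    System.out.println(frankfurt.getSigningRegion()); // eu-central-1
  }
}
```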
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index 9258c47..c68c57d 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -798,6 +798,14 @@ options are covered in [Testing](./testing.md).
 </property>
 
 <property>
+  <name>fs.s3a.endpoint.region</name>
+  <description>AWS S3 region for a bucket; when set, it bypasses parsing
+ the region from fs.s3a.endpoint. This helps avoid errors when using a
+ PrivateLink URL, as the bucket region is set explicitly.
+  </description>
+</property>
+
+<property>
   <name>fs.s3a.path.style.access</name>
   <value>false</value>
   <description>Enable S3 path style access ie disabling the default virtual hosting behaviour.
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
index 661dd2f..d91607d 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
@@ -247,6 +247,32 @@ As an example, the endpoint for S3 Frankfurt is `s3.eu-central-1.amazonaws.com`:
   <value>s3.eu-central-1.amazonaws.com</value>
 </property>
 ```
+### <a name="AuthorizationHeaderMalformed"></a> "Authorization Header is Malformed" (400) exception when PrivateLink URL is used in "fs.s3a.endpoint"
+
+When a [PrivateLink](https://docs.aws.amazon.com/AmazonS3/latest/userguide/privatelink-interface-endpoints.html) URL
+is used instead of a standard S3A endpoint, requests fail with an
+"authorization header is malformed" exception. For example, setting
+`fs.s3a.endpoint=bucket.vpce-<some_string>.s3.ca-central-1.vpce.amazonaws.com` and making S3 calls results in:
+```
+com.amazonaws.services.s3.model.AmazonS3Exception: The authorization header is malformed; the region 'vpce' is wrong; expecting 'ca-central-1'
+(Service: Amazon S3; Status Code: 400; Error Code: AuthorizationHeaderMalformed; Request ID: req-id; S3 Extended Request ID: req-id-2), S3 Extended Request ID: req-id-2:AuthorizationHeaderMalformed: The authorization
+header is malformed; the region 'vpce' is wrong; expecting 'ca-central-1' (Service: Amazon S3; Status Code: 400; Error Code: AuthorizationHeaderMalformed; Request ID: req-id;
+```
+Cause:
+
+Endpoint parsing assumes the AWS S3 region is the second "."-delimited
+component of the `fs.s3a.endpoint` URL. With a PrivateLink URL it cannot
+determine the region and the request fails with an authorization
+exception. To support PrivateLink URLs, set `fs.s3a.endpoint.region`
+explicitly; this bypasses the parsing of `fs.s3a.endpoint`. In the case
+shown above, set the region to `ca-central-1`:
+
+```xml
+<property>
+  <name>fs.s3a.endpoint.region</name>
+  <value>ca-central-1</value>
+</property>
+```
 
 ### `Class does not implement AWSCredentialsProvider`
 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java
new file mode 100644
index 0000000..abd637a
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import com.amazonaws.ClientConfiguration;
+import com.amazonaws.client.builder.AwsClientBuilder;
+import com.amazonaws.util.AwsHostNameUtils;
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION;
+
+/**
+ * Test to check correctness of S3A endpoint regions in
+ * {@link DefaultS3ClientFactory}.
+ */
+public class ITestS3AEndpointRegion extends AbstractS3ATestBase {
+
+  private static final String AWS_REGION_TEST = "test-region";
+  private static final String AWS_ENDPOINT_TEST = "test-endpoint";
+  private static final String AWS_ENDPOINT_TEST_WITH_REGION =
+      "test-endpoint.some-region.amazonaws.com";
+
+  /**
+   * Test to verify that setting a region in the config bypasses the
+   * construction of the region from the endpoint.
+   */
+  @Test
+  public void testWithRegionConfig() {
+    getFileSystem().getConf().set(AWS_REGION, AWS_REGION_TEST);
+
+    //Creating an endpoint config with a custom endpoint.
+    AwsClientBuilder.EndpointConfiguration epr = createEpr(AWS_ENDPOINT_TEST,
+        getFileSystem().getConf().getTrimmed(AWS_REGION));
+    //Checking if setting region config bypasses the endpoint region.
+    Assertions.assertThat(epr.getSigningRegion())
+        .describedAs("There is a region mismatch")
+        .isEqualTo(getFileSystem().getConf().get(AWS_REGION));
+  }
+
+  /**
+   * Test to verify that when the region config is not set, the region is
+   * constructed from the endpoint.
+   */
+  @Test
+  public void testWithoutRegionConfig() {
+    getFileSystem().getConf().unset(AWS_REGION);
+
+    //Creating an endpoint config with a custom endpoint containing a region.
+    AwsClientBuilder.EndpointConfiguration eprRandom =
+        createEpr(AWS_ENDPOINT_TEST_WITH_REGION,
+            getFileSystem().getConf().getTrimmed(AWS_REGION));
+    String regionFromEndpoint =
+        AwsHostNameUtils
+            .parseRegionFromAwsPartitionPattern(AWS_ENDPOINT_TEST_WITH_REGION);
+    //Checking if not setting region config leads to constructing the region
+    // from endpoint.
+    Assertions.assertThat(eprRandom.getSigningRegion())
+        .describedAs("There is a region mismatch")
+        .isNotEqualTo(getFileSystem().getConf().get(AWS_REGION))
+        .isEqualTo(regionFromEndpoint);
+  }
+
+  /**
+   * Create an EndpointConfiguration from an endpoint and region.
+   * @param endpoint the endpoint to be used for EndpointConfiguration creation.
+   * @param awsRegion the AWS region, or null to derive it from the endpoint.
+   * @return an instance of EndpointConfiguration.
+   */
+  private AwsClientBuilder.EndpointConfiguration createEpr(String endpoint,
+      String awsRegion) {
+    return DefaultS3ClientFactory.createEndpointConfiguration(endpoint,
+        new ClientConfiguration(), awsRegion);
+  }
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestNetworkBinding.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestNetworkBinding.java
index 10fe339..7f51d2b 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestNetworkBinding.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestNetworkBinding.java
@@ -85,7 +85,7 @@ public class TestNetworkBinding extends AbstractHadoopTestBase {
       final boolean expectNull,
       final String expectRegion) {
     AwsClientBuilder.EndpointConfiguration epr =
-        createEndpointConfiguration(src, new ClientConfiguration());
+        createEndpointConfiguration(src, new ClientConfiguration(), src);
     String eprStr = epr == null
         ? "(empty)"
         : ("(" + epr.getServiceEndpoint() + " " + epr.getSigningRegion());
