You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pulsar.apache.org by gi...@git.apache.org on 2017/08/14 19:16:24 UTC

[GitHub] rdhabalia commented on a change in pull request #659: Pulsar connectors

rdhabalia commented on a change in pull request #659: Pulsar connectors
URL: https://github.com/apache/incubator-pulsar/pull/659#discussion_r133035746
 
 

 ##########
 File path: pulsar-connectors/google-cloud/src/main/java/org/apahce/pulsar/common/io/cloud/gcs/GcsPath.java
 ##########
 @@ -0,0 +1,627 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apahce.pulsar.common.io.cloud.gcs;
+
+import com.google.api.services.storage.model.StorageObject;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.*;
+import java.util.Iterator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static com.google.api.client.util.Strings.isNullOrEmpty;
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * Copied from apache beam project
+ *
+ * Implements the Java NIO {@link Path} API for Google Cloud Storage paths.
+ *
+ * <p>GcsPath uses a slash ('/') as a directory separator.  Below is
+ * a summary of how slashes are treated:
+ * <ul>
+ *   <li> A GCS bucket may not contain a slash.  An object may contain zero or
+ *        more slashes.
+ *   <li> A trailing slash always indicates a directory, which is compliant
+ *        with POSIX.1-2008.
+ *   <li> Slashes separate components of a path.  Empty components are allowed,
+ *        these are represented as repeated slashes.  An empty component always
+ *        refers to a directory, and always ends in a slash.
+ *   <li> {@link #getParent()}} always returns a path ending in a slash, as the
+ *        parent of a GcsPath is always a directory.
+ *   <li> Use {@link #resolve(String)} to append elements to a GcsPath -- this
+ *        applies the rules consistently and is highly recommended over any
+ *        custom string concatenation.
+ * </ul>
+ *
+ * <p>GcsPath treats all GCS objects and buckets as belonging to the same
+ * filesystem, so the root of a GcsPath is the GcsPath bucket="", object="".
+ *
+ * <p>Relative paths are not associated with any bucket.  This matches common
+ * treatment of Path in which relative paths can be constructed from one
+ * filesystem and appended to another filesystem.
+ *
+ * @see <a href=
+ * "http://docs.oracle.com/javase/tutorial/essential/io/pathOps.html"
+ * >Java Tutorials: Path Operations</a>
+ */
+public class GcsPath implements Path, Serializable {
+
+    public static final String SCHEME = "gs";
+
+    /**
+     * Creates a GcsPath from a URI.
+     *
+     * <p>The URI must be in the form {@code gs://[bucket]/[path]}, and may not
+     * contain a port, user info, a query, or a fragment.
+     */
+    public static GcsPath fromUri(URI uri) {
+        checkArgument(uri.getScheme().equalsIgnoreCase(SCHEME), "URI: %s is not a GCS URI", uri);
+        checkArgument(uri.getPort() == -1,
+                "GCS URI may not specify port: %s (%i)", uri, uri.getPort());
+        checkArgument(
+                isNullOrEmpty(uri.getUserInfo()),
+                "GCS URI may not specify userInfo: %s (%s)", uri, uri.getUserInfo());
+        checkArgument(
+                isNullOrEmpty(uri.getQuery()),
+                "GCS URI may not specify query: %s (%s)", uri, uri.getQuery());
+        checkArgument(
+                isNullOrEmpty(uri.getFragment()),
+                "GCS URI may not specify fragment: %s (%s)", uri, uri.getFragment());
+
+        return fromUri(uri.toString());
+    }
+
+    /**
+     * Pattern that is used to parse a GCS URL.
+     *
+     * <p>This is used to separate the components.  Verification is handled
+     * separately.
+     */
+    public static final Pattern GCS_URI =
+            Pattern.compile("(?<SCHEME>[^:]+)://(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");
+
+    /**
+     * Creates a GcsPath from a URI in string form.
+     *
+     * <p>This does not use URI parsing, which means it may accept patterns that
+     * the URI parser would not accept.
+     */
+    public static GcsPath fromUri(String uri) {
+        Matcher m = GCS_URI.matcher(uri);
+        checkArgument(m.matches(), "Invalid GCS URI: %s", uri);
+
+        checkArgument(m.group("SCHEME").equalsIgnoreCase(SCHEME),
+                "URI: %s is not a GCS URI", uri);
+        return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
+    }
+
+    /**
+     * Pattern that is used to parse a GCS resource name.
+     */
+    private static final Pattern GCS_RESOURCE_NAME =
+            Pattern.compile("storage.googleapis.com/(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");
+
+    /**
+     * Creates a GcsPath from a OnePlatform resource name in string form.
+     */
+    public static GcsPath fromResourceName(String name) {
+        Matcher m = GCS_RESOURCE_NAME.matcher(name);
+        checkArgument(m.matches(), "Invalid GCS resource name: %s", name);
+
+        return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
+    }
+
+    /**
+     * Creates a GcsPath from a {@linkplain StorageObject}.
+     */
+    public static GcsPath fromObject(StorageObject object) {
+        return new GcsPath(null, object.getBucket(), object.getName());
+    }
+
+    /**
+     * Creates a GcsPath from bucket and object components.
+     *
+     * <p>A GcsPath without a bucket name is treated as a relative path, which
+     * is a path component with no linkage to the root element.  This is similar
+     * to a Unix path that does not begin with the root marker (a slash).
+     * GCS has different naming constraints and APIs for working with buckets and
+     * objects, so these two concepts are kept separate to avoid accidental
+     * attempts to treat objects as buckets, or vice versa, as much as possible.
+     *
+     * <p>A GcsPath without an object name is a bucket reference.
+     * A bucket is always a directory, which could be used to lookup or add
+     * files to a bucket, but could not be opened as a file.
+     *
+     * <p>A GcsPath containing neither bucket or object names is treated as
+     * the root of the GCS filesystem.  A listing on the root element would return
+     * the buckets available to the user.
+     *
+     * <p>If {@code null} is passed as either parameter, it is converted to an
+     * empty string internally for consistency.  There is no distinction between
+     * an empty string and a {@code null}, as neither are allowed by GCS.
+     *
+     * @param bucket a GCS bucket name, or none ({@code null} or an empty string)
+     *               if the object is not associated with a bucket
+     *               (e.g. relative paths or the root node).
+     * @param object a GCS object path, or none ({@code null} or an empty string)
+     *               for no object.
+     */
+    public static GcsPath fromComponents(@Nullable String bucket,
+                                         @Nullable String object) {
+        return new GcsPath(null, bucket, object);
+    }
+
+    @Nullable
+    private transient FileSystem fs;
+    @Nonnull
+    private final String bucket;
+    @Nonnull
+    private final String object;
+
+    /**
+     * Constructs a GcsPath.
+     *
+     * @param fs the associated FileSystem, if any
+     * @param bucket the associated bucket, or none ({@code null} or an empty
+     *               string) for a relative path component
+     * @param object the object, which is a fully-qualified object name if bucket
+     *               was also provided, or none ({@code null} or an empty string)
+     *               for no object
+     * @throws java.lang.IllegalArgumentException if the bucket of object names
+     *         are invalid.
+     */
+    public GcsPath(@Nullable FileSystem fs,
+                   @Nullable String bucket,
+                   @Nullable String object) {
+        if (bucket == null) {
+            bucket = "";
+        }
+        checkArgument(!bucket.contains("/"),
+                "GCS bucket may not contain a slash");
+        checkArgument(bucket.isEmpty()
+                        || bucket.matches("[a-z0-9][-_a-z0-9.]+[a-z0-9]"),
+                "GCS bucket names must contain only lowercase letters, numbers, "
+                        + "dashes (-), underscores (_), and dots (.). Bucket names "
+                        + "must start and end with a number or letter. "
+                        + "See https://developers.google.com/storage/docs/bucketnaming "
 
 Review comment:
   It seems the regex may need more rules, as per the bucket-naming developer docs:
   > Bucket names must contain 3 to 63 characters. Names containing dots can contain up to 222 characters, but each dot-separated component can be no longer than 63 characters.
   > Bucket names cannot be represented as an IP address in dotted-decimal notation (for example, 192.168.5.4).
   > Bucket names cannot begin with the "goog" prefix.
   > Bucket names cannot contain "google" or close misspellings of "google".
   
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services