You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/02 14:42:24 UTC

[arrow-rs] branch master updated: Derive Clone for ObjectStore builders and Make URL Parsing Stricter (#3419) (#3424)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 1889e33da Derive Clone for ObjectStore builders and Make URL Parsing Stricter (#3419) (#3424)
1889e33da is described below

commit 1889e33da31218ee2c58ad874036b17b699538b9
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Mon Jan 2 14:42:19 2023 +0000

    Derive Clone for ObjectStore builders and Make URL Parsing Stricter (#3419) (#3424)
    
    * Derive Clone for ObjectStore builders (#3419)
    
    Make URL parsing more strict
    
    * Review feedback
---
 object_store/src/aws/mod.rs   | 112 ++++++++++++++----------
 object_store/src/azure/mod.rs | 196 ++++++++++++++++++++++--------------------
 object_store/src/gcp/mod.rs   |  95 ++++++++++----------
 object_store/src/http/mod.rs  |  21 +++--
 4 files changed, 228 insertions(+), 196 deletions(-)

diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs
index 0fcfbaf9c..786ccd20f 100644
--- a/object_store/src/aws/mod.rs
+++ b/object_store/src/aws/mod.rs
@@ -36,6 +36,7 @@ use bytes::Bytes;
 use chrono::{DateTime, Utc};
 use futures::stream::BoxStream;
 use futures::TryStreamExt;
+use itertools::Itertools;
 use snafu::{OptionExt, ResultExt, Snafu};
 use std::collections::BTreeSet;
 use std::ops::Range;
@@ -129,6 +130,9 @@ enum Error {
         scheme
     ))]
     UnknownUrlScheme { scheme: String },
+
+    #[snafu(display("URL did not match any known pattern for scheme: {}", url))]
+    UrlNotRecognised { url: String },
 }
 
 impl From<Error> for super::Error {
@@ -358,7 +362,7 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload {
 ///  .with_secret_access_key(SECRET_KEY)
 ///  .build();
 /// ```
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub struct AmazonS3Builder {
     access_key_id: Option<String>,
     secret_access_key: Option<String>,
@@ -366,13 +370,13 @@ pub struct AmazonS3Builder {
     bucket_name: Option<String>,
     endpoint: Option<String>,
     token: Option<String>,
+    url: Option<String>,
     retry_config: RetryConfig,
     imdsv1_fallback: bool,
     virtual_hosted_style_request: bool,
     metadata_endpoint: Option<String>,
     profile: Option<String>,
     client_options: ClientOptions,
-    url_parse_error: Option<Error>,
 }
 
 impl AmazonS3Builder {
@@ -453,9 +457,7 @@ impl AmazonS3Builder {
     /// - `https://s3.<bucket>.amazonaws.com`
     /// - `https://<bucket>.s3.<region>.amazonaws.com`
     ///
-    /// Please note that this is a best effort implementation, and will not fail for malformed URLs,
-    /// but rather warn and ignore the passed url. The url also has no effect on how the
-    /// storage is accessed - e.g. which driver or protocol is used for reading from the location.
+    /// Note: Settings derived from the URL will override any others set on this builder
     ///
     /// # Example
     /// ```
@@ -465,44 +467,39 @@ impl AmazonS3Builder {
     ///     .with_url("s3://bucket/path")
     ///     .build();
     /// ```
-    pub fn with_url(mut self, url: impl AsRef<str>) -> Self {
-        let maybe_parsed = Url::parse(url.as_ref());
-        match maybe_parsed {
-            Ok(parsed) => match parsed.scheme() {
-                "s3" | "s3a" => {
-                    self.bucket_name = parsed.host_str().map(|host| host.to_owned());
-                }
-                "https" => {
-                    if let Some(host) = parsed.host_str() {
-                        let parts = host.splitn(4, '.').collect::<Vec<&str>>();
-                        if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws"
-                        {
-                            self.bucket_name = Some(parts[1].to_string());
-                        }
-                        if parts.len() == 4
-                            && parts[1] == "s3"
-                            && parts[3] == "amazonaws.com"
-                        {
-                            self.bucket_name = Some(parts[0].to_string());
-                            self.region = Some(parts[2].to_string());
-                            self.virtual_hosted_style_request = true;
-                        }
-                    }
+    pub fn with_url(mut self, url: impl Into<String>) -> Self {
+        self.url = Some(url.into());
+        self
+    }
+
+    /// Sets properties on this builder based on a URL
+    ///
+    /// This is a separate member function to allow fallible computation to
+    /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`]
+    fn parse_url(&mut self, url: &str) -> Result<()> {
+        let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?;
+        let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?;
+        let validate = |s: &str| match s.contains('.') {
+            true => Err(UrlNotRecognisedSnafu { url }.build()),
+            false => Ok(s.to_string()),
+        };
+
+        match parsed.scheme() {
+            "s3" | "s3a" => self.bucket_name = Some(validate(host)?),
+            "https" => match host.splitn(4, '.').collect_tuple() {
+                Some(("s3", bucket, "amazonaws", "com")) => {
+                    self.bucket_name = Some(bucket.to_string());
                 }
-                other => {
-                    self.url_parse_error = Some(Error::UnknownUrlScheme {
-                        scheme: other.into(),
-                    });
+                Some((bucket, "s3", region, "amazonaws.com")) => {
+                    self.bucket_name = Some(bucket.to_string());
+                    self.region = Some(region.to_string());
+                    self.virtual_hosted_style_request = true;
                 }
+                _ => return Err(UrlNotRecognisedSnafu { url }.build().into()),
             },
-            Err(err) => {
-                self.url_parse_error = Some(Error::UnableToParseUrl {
-                    source: err,
-                    url: url.as_ref().into(),
-                });
-            }
+            scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()),
         };
-        self
+        Ok(())
     }
 
     /// Set the AWS Access Key (required)
@@ -641,9 +638,9 @@ impl AmazonS3Builder {
 
     /// Create a [`AmazonS3`] instance from the provided values,
     /// consuming `self`.
-    pub fn build(self) -> Result<AmazonS3> {
-        if let Some(err) = self.url_parse_error {
-            return Err(err.into());
+    pub fn build(mut self) -> Result<AmazonS3> {
+        if let Some(url) = self.url.take() {
+            self.parse_url(&url)?;
         }
 
         let bucket = self.bucket_name.context(MissingBucketNameSnafu)?;
@@ -1022,15 +1019,36 @@ mod tests {
 
     #[test]
     fn s3_test_urls() {
-        let builder = AmazonS3Builder::new().with_url("s3://bucket/path");
+        let mut builder = AmazonS3Builder::new();
+        builder.parse_url("s3://bucket/path").unwrap();
         assert_eq!(builder.bucket_name, Some("bucket".to_string()));
 
-        let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com");
+        let mut builder = AmazonS3Builder::new();
+        builder
+            .parse_url("https://s3.bucket.amazonaws.com")
+            .unwrap();
         assert_eq!(builder.bucket_name, Some("bucket".to_string()));
 
-        let builder =
-            AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com");
+        let mut builder = AmazonS3Builder::new();
+        builder
+            .parse_url("https://bucket.s3.region.amazonaws.com")
+            .unwrap();
         assert_eq!(builder.bucket_name, Some("bucket".to_string()));
-        assert_eq!(builder.region, Some("region".to_string()))
+        assert_eq!(builder.region, Some("region".to_string()));
+        assert!(builder.virtual_hosted_style_request);
+
+        let err_cases = [
+            "mailto://bucket/path",
+            "s3://bucket.mydomain/path",
+            "https://s3.bucket.mydomain.com",
+            "https://s3.bucket.foo.amazonaws.com",
+            "https://bucket.mydomain.region.amazonaws.com",
+            "https://bucket.s3.region.bar.amazonaws.com",
+            "https://bucket.foo.s3.amazonaws.com",
+        ];
+        let mut builder = AmazonS3Builder::new();
+        for case in err_cases {
+            builder.parse_url(case).unwrap_err();
+        }
     }
 }
diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs
index 4224ae633..7cf369de3 100644
--- a/object_store/src/azure/mod.rs
+++ b/object_store/src/azure/mod.rs
@@ -37,7 +37,7 @@ use async_trait::async_trait;
 use bytes::Bytes;
 use chrono::{TimeZone, Utc};
 use futures::{stream::BoxStream, StreamExt, TryStreamExt};
-use snafu::{ResultExt, Snafu};
+use snafu::{OptionExt, ResultExt, Snafu};
 use std::collections::BTreeSet;
 use std::fmt::{Debug, Formatter};
 use std::io;
@@ -121,6 +121,9 @@ enum Error {
         scheme
     ))]
     UnknownUrlScheme { scheme: String },
+
+    #[snafu(display("URL did not match any known pattern for scheme: {}", url))]
+    UrlNotRecognised { url: String },
 }
 
 impl From<Error> for super::Error {
@@ -354,7 +357,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload {
 ///  .with_container_name(BUCKET_NAME)
 ///  .build();
 /// ```
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct MicrosoftAzureBuilder {
     account_name: Option<String>,
     access_key: Option<String>,
@@ -365,10 +368,10 @@ pub struct MicrosoftAzureBuilder {
     tenant_id: Option<String>,
     sas_query_pairs: Option<Vec<(String, String)>>,
     authority_host: Option<String>,
+    url: Option<String>,
     use_emulator: bool,
     retry_config: RetryConfig,
     client_options: ClientOptions,
-    url_parse_error: Option<Error>,
 }
 
 impl Debug for MicrosoftAzureBuilder {
@@ -444,9 +447,7 @@ impl MicrosoftAzureBuilder {
     /// - `https://<account>.dfs.core.windows.net`
     /// - `https://<account>.blob.core.windows.net`
     ///
-    /// Please note that this is a best effort implementation, and will not fail for malformed URLs,
-    /// but rather warn and ignore the passed url. The url also has no effect on how the
-    /// storage is accessed - e.g. which driver or protocol is used for reading from the location.
+    /// Note: Settings derived from the URL will override any others set on this builder
     ///
     /// # Example
     /// ```
@@ -456,52 +457,48 @@ impl MicrosoftAzureBuilder {
     ///     .with_url("abfss://file_system@account.dfs.core.windows.net/")
     ///     .build();
     /// ```
-    pub fn with_url(mut self, url: impl AsRef<str>) -> Self {
-        let maybe_parsed = Url::parse(url.as_ref());
-        match maybe_parsed {
-            Ok(parsed) => match parsed.scheme() {
-                "az" | "adl" | "azure" => {
-                    self.container_name = parsed.host_str().map(|host| host.to_owned());
-                }
-                "abfs" | "abfss" => {
-                    // abfs(s) might refer to the fsspec convention abfs://<container>/<path>
-                    // or the convention for the hadoop driver abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>
-                    if parsed.username().is_empty() {
-                        self.container_name =
-                            parsed.host_str().map(|host| host.to_owned());
-                    } else if let Some(host) = parsed.host_str() {
-                        let parts = host.splitn(2, '.').collect::<Vec<&str>>();
-                        if parts.len() == 2 && parts[1] == "dfs.core.windows.net" {
-                            self.container_name = Some(parsed.username().to_owned());
-                            self.account_name = Some(parts[0].to_string());
-                        }
-                    }
-                }
-                "https" => {
-                    if let Some(host) = parsed.host_str() {
-                        let parts = host.splitn(2, '.').collect::<Vec<&str>>();
-                        if parts.len() == 2
-                            && (parts[1] == "dfs.core.windows.net"
-                                || parts[1] == "blob.core.windows.net")
-                        {
-                            self.account_name = Some(parts[0].to_string());
-                        }
-                    }
+    pub fn with_url(mut self, url: impl Into<String>) -> Self {
+        self.url = Some(url.into());
+        self
+    }
+
+    /// Sets properties on this builder based on a URL
+    ///
+    /// This is a separate member function to allow fallible computation to
+    /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`]
+    fn parse_url(&mut self, url: &str) -> Result<()> {
+        let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?;
+        let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?;
+
+        let validate = |s: &str| match s.contains('.') {
+            true => Err(UrlNotRecognisedSnafu { url }.build()),
+            false => Ok(s.to_string()),
+        };
+
+        match parsed.scheme() {
+            "az" | "adl" | "azure" => self.container_name = Some(validate(host)?),
+            "abfs" | "abfss" => {
+                // abfs(s) might refer to the fsspec convention abfs://<container>/<path>
+                // or the convention for the hadoop driver abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>
+                if parsed.username().is_empty() {
+                    self.container_name = Some(validate(host)?);
+                } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") {
+                    self.container_name = Some(validate(parsed.username())?);
+                    self.account_name = Some(validate(a)?);
+                } else {
+                    return Err(UrlNotRecognisedSnafu { url }.build().into());
                 }
-                other => {
-                    self.url_parse_error = Some(Error::UnknownUrlScheme {
-                        scheme: other.into(),
-                    });
+            }
+            "https" => match host.split_once('.') {
+                Some((a, "dfs.core.windows.net"))
+                | Some((a, "blob.core.windows.net")) => {
+                    self.account_name = Some(validate(a)?);
                 }
+                _ => return Err(UrlNotRecognisedSnafu { url }.build().into()),
             },
-            Err(err) => {
-                self.url_parse_error = Some(Error::UnableToParseUrl {
-                    source: err,
-                    url: url.as_ref().into(),
-                });
-            }
-        };
-        self
+            scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()),
+        }
+        Ok(())
     }
 
     /// Set the Azure Account (required)
@@ -595,63 +592,49 @@ impl MicrosoftAzureBuilder {
 
     /// Configure a connection to container with given name on Microsoft Azure
     /// Blob store.
-    pub fn build(self) -> Result<MicrosoftAzure> {
-        let Self {
-            account_name,
-            access_key,
-            container_name,
-            bearer_token,
-            client_id,
-            client_secret,
-            tenant_id,
-            sas_query_pairs,
-            use_emulator,
-            retry_config,
-            authority_host,
-            mut client_options,
-            url_parse_error,
-        } = self;
-
-        if let Some(err) = url_parse_error {
-            return Err(err.into());
+    pub fn build(mut self) -> Result<MicrosoftAzure> {
+        if let Some(url) = self.url.take() {
+            self.parse_url(&url)?;
         }
 
-        let container = container_name.ok_or(Error::MissingContainerName {})?;
+        let container = self.container_name.ok_or(Error::MissingContainerName {})?;
 
-        let (is_emulator, storage_url, auth, account) = if use_emulator {
-            let account_name =
-                account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string());
+        let (is_emulator, storage_url, auth, account) = if self.use_emulator {
+            let account_name = self
+                .account_name
+                .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string());
             // Allow overriding defaults. Values taken from
             // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141
             let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?;
-            let account_key =
-                access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string());
+            let account_key = self
+                .access_key
+                .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string());
             let credential = credential::CredentialProvider::AccessKey(account_key);
 
-            client_options = client_options.with_allow_http(true);
+            self.client_options = self.client_options.with_allow_http(true);
             (true, url, credential, account_name)
         } else {
-            let account_name = account_name.ok_or(Error::MissingAccount {})?;
+            let account_name = self.account_name.ok_or(Error::MissingAccount {})?;
             let account_url = format!("https://{}.blob.core.windows.net", &account_name);
             let url = Url::parse(&account_url)
                 .context(UnableToParseUrlSnafu { url: account_url })?;
-            let credential = if let Some(bearer_token) = bearer_token {
+            let credential = if let Some(bearer_token) = self.bearer_token {
                 Ok(credential::CredentialProvider::AccessKey(bearer_token))
-            } else if let Some(access_key) = access_key {
+            } else if let Some(access_key) = self.access_key {
                 Ok(credential::CredentialProvider::AccessKey(access_key))
             } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) =
-                (client_id, client_secret, tenant_id)
+                (self.client_id, self.client_secret, self.tenant_id)
             {
                 let client_credential = credential::ClientSecretOAuthProvider::new(
                     client_id,
                     client_secret,
                     tenant_id,
-                    authority_host,
+                    self.authority_host,
                 );
                 Ok(credential::CredentialProvider::ClientSecret(
                     client_credential,
                 ))
-            } else if let Some(query_pairs) = sas_query_pairs {
+            } else if let Some(query_pairs) = self.sas_query_pairs {
                 Ok(credential::CredentialProvider::SASToken(query_pairs))
             } else {
                 Err(Error::MissingCredentials {})
@@ -661,12 +644,12 @@ impl MicrosoftAzureBuilder {
 
         let config = client::AzureConfig {
             account,
-            retry_config,
-            service: storage_url,
+            is_emulator,
             container,
+            retry_config: self.retry_config,
+            client_options: self.client_options,
+            service: storage_url,
             credentials: auth,
-            is_emulator,
-            client_options,
         };
 
         let client = Arc::new(client::AzureClient::new(config)?);
@@ -804,26 +787,49 @@ mod tests {
 
     #[test]
     fn azure_blob_test_urls() {
-        let builder = MicrosoftAzureBuilder::new()
-            .with_url("abfss://file_system@account.dfs.core.windows.net/");
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder
+            .parse_url("abfss://file_system@account.dfs.core.windows.net/")
+            .unwrap();
         assert_eq!(builder.account_name, Some("account".to_string()));
         assert_eq!(builder.container_name, Some("file_system".to_string()));
 
-        let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path");
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder.parse_url("abfs://container/path").unwrap();
         assert_eq!(builder.container_name, Some("container".to_string()));
 
-        let builder = MicrosoftAzureBuilder::new().with_url("az://container");
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder.parse_url("az://container").unwrap();
         assert_eq!(builder.container_name, Some("container".to_string()));
 
-        let builder = MicrosoftAzureBuilder::new().with_url("az://container/path");
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder.parse_url("az://container/path").unwrap();
         assert_eq!(builder.container_name, Some("container".to_string()));
 
-        let builder = MicrosoftAzureBuilder::new()
-            .with_url("https://account.dfs.core.windows.net/");
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder
+            .parse_url("https://account.dfs.core.windows.net/")
+            .unwrap();
         assert_eq!(builder.account_name, Some("account".to_string()));
 
-        let builder = MicrosoftAzureBuilder::new()
-            .with_url("https://account.blob.core.windows.net/");
-        assert_eq!(builder.account_name, Some("account".to_string()))
+        let mut builder = MicrosoftAzureBuilder::new();
+        builder
+            .parse_url("https://account.blob.core.windows.net/")
+            .unwrap();
+        assert_eq!(builder.account_name, Some("account".to_string()));
+
+        let err_cases = [
+            "mailto://account.blob.core.windows.net/",
+            "az://blob.mydomain/",
+            "abfs://container.foo/path",
+            "abfss://file_system@account.foo.dfs.core.windows.net/",
+            "abfss://file_system.bar@account.dfs.core.windows.net/",
+            "https://blob.mydomain/",
+            "https://blob.foo.dfs.core.windows.net/",
+        ];
+        let mut builder = MicrosoftAzureBuilder::new();
+        for case in err_cases {
+            builder.parse_url(case).unwrap_err();
+        }
     }
 }
diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs
index c1424d971..f2638748f 100644
--- a/object_store/src/gcp/mod.rs
+++ b/object_store/src/gcp/mod.rs
@@ -42,7 +42,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt};
 use percent_encoding::{percent_encode, NON_ALPHANUMERIC};
 use reqwest::header::RANGE;
 use reqwest::{header, Client, Method, Response, StatusCode};
-use snafu::{ResultExt, Snafu};
+use snafu::{OptionExt, ResultExt, Snafu};
 use tokio::io::AsyncWrite;
 use url::Url;
 
@@ -142,6 +142,9 @@ enum Error {
         scheme
     ))]
     UnknownUrlScheme { scheme: String },
+
+    #[snafu(display("URL did not match any known pattern for scheme: {}", url))]
+    UrlNotRecognised { url: String },
 }
 
 impl From<Error> for super::Error {
@@ -784,13 +787,13 @@ fn reader_credentials_file(
 ///  .with_bucket_name(BUCKET_NAME)
 ///  .build();
 /// ```
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct GoogleCloudStorageBuilder {
     bucket_name: Option<String>,
+    url: Option<String>,
     service_account_path: Option<String>,
     retry_config: RetryConfig,
     client_options: ClientOptions,
-    url_parse_error: Option<Error>,
 }
 
 impl Default for GoogleCloudStorageBuilder {
@@ -800,7 +803,7 @@ impl Default for GoogleCloudStorageBuilder {
             service_account_path: None,
             retry_config: Default::default(),
             client_options: ClientOptions::new().with_allow_http(true),
-            url_parse_error: None,
+            url: None,
         }
     }
 }
@@ -845,9 +848,7 @@ impl GoogleCloudStorageBuilder {
     ///
     /// - `gs://<bucket>/<path>`
     ///
-    /// Please note that this is a best effort implementation, and will not fail for malformed URLs,
-    /// but rather warn and ignore the passed url. The url also has no effect on how the
-    /// storage is accessed - e.g. which driver or protocol is used for reading from the location.
+    /// Note: Settings derived from the URL will override any others set on this builder
     ///
     /// # Example
     /// ```
@@ -857,29 +858,31 @@ impl GoogleCloudStorageBuilder {
     ///     .with_url("gs://bucket/path")
     ///     .build();
     /// ```
-    pub fn with_url(mut self, url: impl AsRef<str>) -> Self {
-        let maybe_parsed = Url::parse(url.as_ref());
-        match maybe_parsed {
-            Ok(parsed) => match parsed.scheme() {
-                "gs" => {
-                    self.bucket_name = parsed.host_str().map(|host| host.to_owned());
-                }
-                other => {
-                    self.url_parse_error = Some(Error::UnknownUrlScheme {
-                        scheme: other.into(),
-                    });
-                }
-            },
-            Err(err) => {
-                self.url_parse_error = Some(Error::UnableToParseUrl {
-                    source: err,
-                    url: url.as_ref().into(),
-                });
-            }
-        };
+    pub fn with_url(mut self, url: impl Into<String>) -> Self {
+        self.url = Some(url.into());
         self
     }
 
+    /// Sets properties on this builder based on a URL
+    ///
+    /// This is a separate member function to allow fallible computation to
+    /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`]
+    fn parse_url(&mut self, url: &str) -> Result<()> {
+        let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?;
+        let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?;
+
+        let validate = |s: &str| match s.contains('.') {
+            true => Err(UrlNotRecognisedSnafu { url }.build()),
+            false => Ok(s.to_string()),
+        };
+
+        match parsed.scheme() {
+            "gs" => self.bucket_name = Some(validate(host)?),
+            scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()),
+        }
+        Ok(())
+    }
+
     /// Set the bucket name (required)
     pub fn with_bucket_name(mut self, bucket_name: impl Into<String>) -> Self {
         self.bucket_name = Some(bucket_name.into());
@@ -927,24 +930,17 @@ impl GoogleCloudStorageBuilder {
 
     /// Configure a connection to Google Cloud Storage, returning a
     /// new [`GoogleCloudStorage`] and consuming `self`
-    pub fn build(self) -> Result<GoogleCloudStorage> {
-        let Self {
-            bucket_name,
-            service_account_path,
-            retry_config,
-            client_options,
-            url_parse_error,
-        } = self;
-
-        if let Some(err) = url_parse_error {
-            return Err(err.into());
+    pub fn build(mut self) -> Result<GoogleCloudStorage> {
+        if let Some(url) = self.url.take() {
+            self.parse_url(&url)?;
         }
 
-        let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?;
-        let service_account_path =
-            service_account_path.ok_or(Error::MissingServiceAccountPath)?;
+        let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?;
+        let service_account_path = self
+            .service_account_path
+            .ok_or(Error::MissingServiceAccountPath)?;
 
-        let client = client_options.client()?;
+        let client = self.client_options.client()?;
         let credentials = reader_credentials_file(service_account_path)?;
 
         // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes
@@ -977,8 +973,8 @@ impl GoogleCloudStorageBuilder {
                 token_cache: Default::default(),
                 bucket_name,
                 bucket_name_encoded: encoded_bucket_name,
-                retry_config,
-                client_options,
+                retry_config: self.retry_config,
+                client_options: self.client_options,
                 max_list_results: None,
             }),
         })
@@ -1199,7 +1195,14 @@ mod test {
 
     #[test]
     fn gcs_test_urls() {
-        let builder = GoogleCloudStorageBuilder::new().with_url("gs://bucket/path");
-        assert_eq!(builder.bucket_name, Some("bucket".to_string()))
+        let mut builder = GoogleCloudStorageBuilder::new();
+        builder.parse_url("gs://bucket/path").unwrap();
+        assert_eq!(builder.bucket_name, Some("bucket".to_string()));
+
+        let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"];
+        let mut builder = GoogleCloudStorageBuilder::new();
+        for case in err_cases {
+            builder.parse_url(case).unwrap_err();
+        }
     }
 }
diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs
index 25997d892..f05e70024 100644
--- a/object_store/src/http/mod.rs
+++ b/object_store/src/http/mod.rs
@@ -55,8 +55,11 @@ enum Error {
     #[snafu(display("Must specify a URL"))]
     MissingUrl,
 
-    #[snafu(display("Invalid URL: {}", source))]
-    InvalidUrl { source: reqwest::Error },
+    #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))]
+    UnableToParseUrl {
+        source: url::ParseError,
+        url: String,
+    },
 
     #[snafu(display("Object is a directory"))]
     IsDirectory,
@@ -210,9 +213,9 @@ impl ObjectStore for HttpStore {
 }
 
 /// Configure a connection to a generic HTTP server
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub struct HttpBuilder {
-    url: Option<Result<Url>>,
+    url: Option<String>,
     client_options: ClientOptions,
     retry_config: RetryConfig,
 }
@@ -224,8 +227,8 @@ impl HttpBuilder {
     }
 
     /// Set the URL
-    pub fn with_url(mut self, url: impl reqwest::IntoUrl) -> Self {
-        self.url = Some(url.into_url().context(InvalidUrlSnafu).map_err(Into::into));
+    pub fn with_url(mut self, url: impl Into<String>) -> Self {
+        self.url = Some(url.into());
         self
     }
 
@@ -243,9 +246,11 @@ impl HttpBuilder {
 
     /// Build an [`HttpStore`] with the configured options
     pub fn build(self) -> Result<HttpStore> {
-        let url = self.url.context(MissingUrlSnafu)??;
+        let url = self.url.context(MissingUrlSnafu)?;
+        let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?;
+
         Ok(HttpStore {
-            client: Client::new(url, self.client_options, self.retry_config)?,
+            client: Client::new(parsed, self.client_options, self.retry_config)?,
         })
     }
 }