You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/04 16:02:43 UTC

(arrow-datafusion) branch main updated: Add http(s) support to the command line (#8753)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 840499fbd3 Add http(s) support to the command line (#8753)
840499fbd3 is described below

commit 840499fbd3c25bf82a6ee0641ef6888079e096da
Author: Kieran Colford <ki...@kcolford.com>
AuthorDate: Sun Feb 4 11:02:38 2024 -0500

    Add http(s) support to the command line (#8753)
    
    * Add http(s) support to the command line
    
    * fmt
    
    * Add documentation
    
    * Add a test
    
    * fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
 datafusion-cli/Cargo.lock     | 20 ++++++++++----------
 datafusion-cli/Cargo.toml     |  2 +-
 datafusion-cli/src/exec.rs    | 18 ++++++++++++++++++
 docs/source/user-guide/cli.md | 21 +++++++++++++++++++++
 4 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 94c57bd770..072898cda4 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1852,9 +1852,9 @@ dependencies = [
 
 [[package]]
 name = "iana-time-zone"
-version = "0.1.59"
+version = "0.1.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
@@ -2159,9 +2159,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
 dependencies = [
  "adler",
 ]
@@ -2917,9 +2917,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-pki-types"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e9d979b3ce68192e42760c7810125eb6cf2ea10efae545a156063e61f314e2a"
+checksum = "0a716eb65e3158e90e17cd93d855216e27bde02745ab842f2cab4a39dba1bacf"
 
 [[package]]
 name = "rustls-webpki"
@@ -3371,9 +3371,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.32"
+version = "0.3.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe80ced77cbfb4cb91a94bf72b378b4b6791a0d9b7f09d0be747d1bdff4e68bd"
+checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
 dependencies = [
  "deranged",
  "num-conv",
@@ -3425,9 +3425,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.35.1"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
  "backtrace",
  "bytes",
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 79a1f0162e..e40aa6107c 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -41,7 +41,7 @@ dirs = "4.0.0"
 env_logger = "0.9"
 futures = "0.3"
 mimalloc = { version = "0.1", default-features = false }
-object_store = { version = "0.9.0", features = ["aws", "gcp"] }
+object_store = { version = "0.9.0", features = ["aws", "gcp", "http"] }
 parking_lot = { version = "0.12" }
 parquet = { version = "50.0.0", default-features = false }
 regex = "1.8"
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index a175f99a90..5273ea8ee8 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -42,6 +42,7 @@ use datafusion::physical_plan::{collect, execute_stream};
 use datafusion::prelude::SessionContext;
 use datafusion::sql::{parser::DFParser, sqlparser::dialect::dialect_from_str};
 
+use object_store::http::HttpBuilder;
 use object_store::ObjectStore;
 use rustyline::error::ReadlineError;
 use rustyline::Editor;
@@ -281,6 +282,11 @@ async fn create_external_table(
             let builder = get_gcs_object_store_builder(url, cmd)?;
             Arc::new(builder.build()?) as Arc<dyn ObjectStore>
         }
+        "http" | "https" => Arc::new(
+            HttpBuilder::new()
+                .with_url(url.origin().ascii_serialization())
+                .build()?,
+        ) as Arc<dyn ObjectStore>,
         _ => {
             // for other types, try to get from the object_store_registry
             ctx.runtime_env()
@@ -329,12 +335,24 @@ mod tests {
             return plan_err!("LogicalPlan is not a CreateExternalTable");
         }
 
+        // Ensure the URL is supported by the object store
         ctx.runtime_env()
             .object_store(ListingTableUrl::parse(location)?)?;
 
         Ok(())
     }
 
+    #[tokio::test]
+    async fn create_object_store_table_http() -> Result<()> {
+        // Should be OK
+        let location = "http://example.com/file.parquet";
+        let sql =
+            format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'");
+        create_external_table_test(location, &sql).await?;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn create_object_store_table_s3() -> Result<()> {
         let access_key_id = "fake_access_key_id";
diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index 30ab7d1495..4b909bbb1e 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -260,6 +260,27 @@ STORED AS CSV
 LOCATION '/path/to/aggregate_test_100.csv';
 ```
 
+## Registering Remote Data Sources
+
+`datafusion-cli` can read from remote locations using a variety of protocols.
+For example, to read from a remote Parquet file via HTTP(S), you can use the following:
+
+```sql
+CREATE EXTERNAL TABLE hits
+STORED AS PARQUET
+LOCATION 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet';
+```
+
+```sql
+❯ select count(*) from hits;
++----------+
+| COUNT(*) |
++----------+
+| 1000000  |
++----------+
+1 row in set. Query took 0.344 seconds.
+```
+
 ## Registering S3 Data Sources
 
 [AWS S3](https://aws.amazon.com/s3/) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.