You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/04 16:02:43 UTC
(arrow-datafusion) branch main updated: Add http(s) support to the command line (#8753)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 840499fbd3 Add http(s) support to the command line (#8753)
840499fbd3 is described below
commit 840499fbd3c25bf82a6ee0641ef6888079e096da
Author: Kieran Colford <ki...@kcolford.com>
AuthorDate: Sun Feb 4 11:02:38 2024 -0500
Add http(s) support to the command line (#8753)
* Add http(s) support to the command line
* fmt
* Add documentation
* Add a test
* fmt
---------
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
datafusion-cli/Cargo.lock | 20 ++++++++++----------
datafusion-cli/Cargo.toml | 2 +-
datafusion-cli/src/exec.rs | 18 ++++++++++++++++++
docs/source/user-guide/cli.md | 21 +++++++++++++++++++++
4 files changed, 50 insertions(+), 11 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 94c57bd770..072898cda4 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1852,9 +1852,9 @@ dependencies = [
[[package]]
name = "iana-time-zone"
-version = "0.1.59"
+version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
dependencies = [
"android_system_properties",
"core-foundation-sys",
@@ -2159,9 +2159,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
dependencies = [
"adler",
]
@@ -2917,9 +2917,9 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
-version = "1.1.0"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e9d979b3ce68192e42760c7810125eb6cf2ea10efae545a156063e61f314e2a"
+checksum = "0a716eb65e3158e90e17cd93d855216e27bde02745ab842f2cab4a39dba1bacf"
[[package]]
name = "rustls-webpki"
@@ -3371,9 +3371,9 @@ dependencies = [
[[package]]
name = "time"
-version = "0.3.32"
+version = "0.3.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe80ced77cbfb4cb91a94bf72b378b4b6791a0d9b7f09d0be747d1bdff4e68bd"
+checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
dependencies = [
"deranged",
"num-conv",
@@ -3425,9 +3425,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
-version = "1.35.1"
+version = "1.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
dependencies = [
"backtrace",
"bytes",
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 79a1f0162e..e40aa6107c 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -41,7 +41,7 @@ dirs = "4.0.0"
env_logger = "0.9"
futures = "0.3"
mimalloc = { version = "0.1", default-features = false }
-object_store = { version = "0.9.0", features = ["aws", "gcp"] }
+object_store = { version = "0.9.0", features = ["aws", "gcp", "http"] }
parking_lot = { version = "0.12" }
parquet = { version = "50.0.0", default-features = false }
regex = "1.8"
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index a175f99a90..5273ea8ee8 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -42,6 +42,7 @@ use datafusion::physical_plan::{collect, execute_stream};
use datafusion::prelude::SessionContext;
use datafusion::sql::{parser::DFParser, sqlparser::dialect::dialect_from_str};
+use object_store::http::HttpBuilder;
use object_store::ObjectStore;
use rustyline::error::ReadlineError;
use rustyline::Editor;
@@ -281,6 +282,11 @@ async fn create_external_table(
let builder = get_gcs_object_store_builder(url, cmd)?;
Arc::new(builder.build()?) as Arc<dyn ObjectStore>
}
+ "http" | "https" => Arc::new(
+ HttpBuilder::new()
+ .with_url(url.origin().ascii_serialization())
+ .build()?,
+ ) as Arc<dyn ObjectStore>,
_ => {
// for other types, try to get from the object_store_registry
ctx.runtime_env()
@@ -329,12 +335,24 @@ mod tests {
return plan_err!("LogicalPlan is not a CreateExternalTable");
}
+ // Ensure the URL is supported by the object store
ctx.runtime_env()
.object_store(ListingTableUrl::parse(location)?)?;
Ok(())
}
+ #[tokio::test]
+ async fn create_object_store_table_http() -> Result<()> {
+ // Should be OK
+ let location = "http://example.com/file.parquet";
+ let sql =
+ format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'");
+ create_external_table_test(location, &sql).await?;
+
+ Ok(())
+ }
+
#[tokio::test]
async fn create_object_store_table_s3() -> Result<()> {
let access_key_id = "fake_access_key_id";
diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index 30ab7d1495..4b909bbb1e 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -260,6 +260,27 @@ STORED AS CSV
LOCATION '/path/to/aggregate_test_100.csv';
```
+## Registering Remote Data Sources
+
+`datafusion-cli` can read from remote locations using a variety of protocols.
+For example, to read from a remote Parquet file via HTTP(S), you can use the following:
+
+```sql
+CREATE EXTERNAL TABLE hits
+STORED AS PARQUET
+LOCATION 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet';
+```
+
+```sql
+❯ select count(*) from hits;
++----------+
+| COUNT(*) |
++----------+
+| 1000000 |
++----------+
+1 row in set. Query took 0.344 seconds.
+```
+
## Registering S3 Data Sources
[AWS S3](https://aws.amazon.com/s3/) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.