You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/12/31 14:01:00 UTC
[arrow-datafusion] branch master updated: Add example on how to query multiple parquet files (#1497)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 07f5b3d Add example on how to query multiple parquet files (#1497)
07f5b3d is described below
commit 07f5b3da8f5bab4c296aa2886be37556b104a930
Author: Nitish Tiwari <51...@users.noreply.github.com>
AuthorDate: Fri Dec 31 19:30:53 2021 +0530
Add example on how to query multiple parquet files (#1497)
---
.../examples/parquet_sql_multiple_files.rs | 67 ++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs
new file mode 100644
index 0000000..2e95427
--- /dev/null
+++ b/datafusion-examples/examples/parquet_sql_multiple_files.rs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use std::sync::Arc;
+
+/// This example demonstrates registering a directory of Parquet files as a single
+/// listing table, running a SQL query against it, and printing the results.
+#[tokio::main]
+async fn main() -> Result<()> {
+    // create local execution context
+    let mut ctx = ExecutionContext::new();
+
+    let testdata = datafusion::arrow::util::test_util::parquet_test_data();
+
+    // Configure listing options for Parquet files, with pruning enabled on the format
+    let file_format = ParquetFormat::default().with_enable_pruning(true);
+    let listing_options = ListingOptions {
+        file_extension: ".parquet".to_owned(),
+        format: Arc::new(file_format),
+        table_partition_cols: vec![],
+        collect_stat: true,
+        target_partitions: 1,
+    };
+
+    // Register a listing table - this will use all files in the directory as data sources
+    // for the query. Failures are propagated with `?` instead of panicking.
+    ctx.register_listing_table(
+        "my_table",
+        &format!("file://{}", testdata),
+        listing_options,
+        None,
+    )
+    .await?;
+
+    // execute the query against the table name registered above
+    // NOTE(review): assumes the schema resolved for the directory has these columns
+    let df = ctx
+        .sql(
+            "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
+            FROM my_table \
+            WHERE id > 1 AND tinyint_col < double_col",
+        )
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
+}