You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/12/31 14:01:00 UTC

[arrow-datafusion] branch master updated: Add example on how to query multiple parquet files (#1497)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 07f5b3d  Add example on how to query multiple parquet files (#1497)
07f5b3d is described below

commit 07f5b3da8f5bab4c296aa2886be37556b104a930
Author: Nitish Tiwari <51...@users.noreply.github.com>
AuthorDate: Fri Dec 31 19:30:53 2021 +0530

    Add example on how to query multiple parquet files (#1497)
---
 .../examples/parquet_sql_multiple_files.rs         | 67 ++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs
new file mode 100644
index 0000000..2e95427
--- /dev/null
+++ b/datafusion-examples/examples/parquet_sql_multiple_files.rs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use std::sync::Arc;
+
+/// This example demonstrates executing a simple query against an Arrow data source (a directory
+/// with multiple Parquet files registered as a single listing table) and printing the results
+#[tokio::main]
+async fn main() -> Result<()> {
+    // create local execution context
+    let mut ctx = ExecutionContext::new();
+
+    let testdata = datafusion::arrow::util::test_util::parquet_test_data();
+
+    // Configure listing options
+    let file_format = ParquetFormat::default().with_enable_pruning(true);
+    let listing_options = ListingOptions {
+        file_extension: ".parquet".to_owned(),
+        format: Arc::new(file_format),
+        table_partition_cols: vec![],
+        collect_stat: true,
+        target_partitions: 1,
+    };
+
+    // Register a listing table - this will use all files in the directory as data sources
+    // for the query
+    ctx.register_listing_table(
+        "my_table",
+        &format!("file://{}", testdata),
+        listing_options,
+        None,
+    )
+    .await?;
+
+    // execute the query against the table registered above; the name in the FROM clause
+    // must match the name passed to `register_listing_table`
+    let df = ctx
+        .sql(
+            "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
+        FROM my_table \
+        WHERE id > 1 AND tinyint_col < double_col",
+        )
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
+}