You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/10 09:29:34 UTC

[arrow-datafusion] branch main updated: Update Arrow 39 (#6252)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new e6d7e46ded Update Arrow 39 (#6252)
e6d7e46ded is described below

commit e6d7e46dedbe5046e4606bfd3d7a1199dd0aaae2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed May 10 10:29:26 2023 +0100

    Update Arrow 39 (#6252)
    
    * Update Arrow 39
    
    * Update pin
---
 Cargo.toml                                         | 12 ++--
 datafusion-cli/Cargo.lock                          | 72 ++++++++++------------
 datafusion-cli/Cargo.toml                          |  2 +-
 datafusion/common/src/scalar.rs                    | 29 ++++-----
 datafusion/core/src/datasource/file_format/csv.rs  | 13 ++--
 .../core/src/datasource/file_format/parquet.rs     |  4 +-
 .../core/src/physical_plan/file_format/csv.rs      | 28 ++++-----
 .../file_format/parquet/page_filter.rs             |  4 +-
 datafusion/physical-expr/src/struct_expressions.rs |  8 ++-
 9 files changed, 84 insertions(+), 88 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bc1e4625a0..a0efd150db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,12 +45,12 @@ repository = "https://github.com/apache/arrow-datafusion"
 rust-version = "1.64"
 
 [workspace.dependencies]
-arrow = { version = "38.0.0", features = ["prettyprint"] }
-arrow-flight = { version = "38.0.0", features = ["flight-sql-experimental"] }
-arrow-buffer = { version = "38.0.0", default-features = false }
-arrow-schema = { version = "38.0.0", default-features = false }
-arrow-array = { version = "38.0.0", default-features = false, features = ["chrono-tz"] }
-parquet = { version = "38.0.0", features = ["arrow", "async", "object_store"] }
+arrow = { version = "39.0.0", features = ["prettyprint"] }
+arrow-flight = { version = "39.0.0", features = ["flight-sql-experimental"] }
+arrow-buffer = { version = "39.0.0", default-features = false }
+arrow-schema = { version = "39.0.0", default-features = false }
+arrow-array = { version = "39.0.0", default-features = false, features = ["chrono-tz"] }
+parquet = { version = "39.0.0", features = ["arrow", "async", "object_store"] }
 
 [profile.release]
 codegen-units = 1
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 399231702d..05415def8b 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -68,9 +68,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
 
 [[package]]
 name = "arrow"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c107a57b5913d852da9d5a40e280e4695f2258b5b87733c13b770c63a7117287"
+checksum = "218ca81dd088b102c0fd6687c72e73fad1ba93d2ef7b3cf9a1043b04b2c39dbf"
 dependencies = [
  "ahash",
  "arrow-arith",
@@ -90,9 +90,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ace6aa3d5617c5d03041a05e01c6819428a8ddf49dd0b055df9b40fef9d96094"
+checksum = "d49309fa2299ec34a709cfc9f487c41ecaead96d1ab70e21857466346bbbd690"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -105,9 +105,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-array"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "104a04520692cc674e6afd7682f213ca41f9b13ff1873f63a5a2857a590b87b3"
+checksum = "e7a27466d897d99654357a6d95dc0a26931d9e4306e60c14fc31a894edb86579"
 dependencies = [
  "ahash",
  "arrow-buffer",
@@ -122,9 +122,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-buffer"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72c875bcb9530ec403998fb0b2dc6d180a7c64563ca4bc22b90eafb84b113143"
+checksum = "9405b78106a9d767c7b97c78a70ee1b23ee51a74f5188a821a716d9a85d1af2b"
 dependencies = [
  "half",
  "num",
@@ -132,9 +132,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-cast"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6d6e18281636c8fc0b93be59834da6bf9a72bb70fd0c98ddfdaf124da466c28"
+checksum = "be0ec5a79a87783dc828b7ff8f89f62880b3f553bc5f5b932a82f4a1035024b4"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -149,9 +149,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-csv"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3197dab0963a236ff8e7c82e2272535745955ac1321eb740c29f2f88b353f54e"
+checksum = "350d8e55c3b2d602a0a04389bcc1da40167657143a9922a7103190603e7b7692"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -168,9 +168,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-data"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb68113d6ecdbe8bba48b2c4042c151bf9e1c61244e45072a50250a6fc59bafe"
+checksum = "c6f710d98964d2c069b8baf566130045e79e11baa105623f038a6c942f805681"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
@@ -180,9 +180,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ipc"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eab4bbf2dd3078facb5ce0a9641316a64f42bfd8cf357e6775c8a5e6708e3a8d"
+checksum = "9c99787cb8fabc187285da9e7182d22f2b80ecfac61ca0a42c4299e9eecdf903"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -194,9 +194,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-json"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c5b650d23746a494665d914a7fa3d21d939153cff9d53bdebe39bffa88f263"
+checksum = "91c95a58ce63f60d80d7a3a1222d65df0bc060b71d31353c34a8118c2a6eae7b"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -214,9 +214,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ord"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68c6fce28e5011e30acc7466b5efcb8ed0197c396240bd2b10e167f275a3c208"
+checksum = "4141e6488610cc144e841da3de5f5371488f3cf5bc6bc7b3e752c64e7639c31b"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -229,9 +229,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f20a421f19799d8b93eb8edde5217e910fa1e2d6ceb3c529f000e57b6db144c0"
+checksum = "940191a3c636c111c41e816325b0941484bf904c46de72cd9553acd1afd24d33"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -244,15 +244,15 @@ dependencies = [
 
 [[package]]
 name = "arrow-schema"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc85923d8d6662cc66ac6602c7d1876872e671002d60993dfdf492a6badeae92"
+checksum = "18c41d058b2895a12f46dfafc306ee3529ad9660406be0ab8a7967d5e27c417e"
 
 [[package]]
 name = "arrow-select"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6ab6613ce65b61d85a3410241744e84e48fbab0fe06e1251b4429d21b3470fd"
+checksum = "9fcbdda2772b7e712e77444f3a71f4ee517095aceb993b35de71de41c70d9b4f"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -263,9 +263,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-string"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3008641239e884aefba66d8b8532da6af40d14296349fcc85935de4ba67b89e"
+checksum = "7081c34f4b534ad320a03db79d58e38972041bb7c65686b98bbcc2f9a67a9cee"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -273,7 +273,7 @@ dependencies = [
  "arrow-schema",
  "arrow-select",
  "regex",
- "regex-syntax 0.6.29",
+ "regex-syntax",
 ]
 
 [[package]]
@@ -1127,7 +1127,7 @@ dependencies = [
  "hashbrown 0.13.2",
  "itertools",
  "log",
- "regex-syntax 0.7.1",
+ "regex-syntax",
 ]
 
 [[package]]
@@ -2165,9 +2165,9 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "38.0.0"
+version = "39.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4cbd51311f8d9ff3d2697b1522b18a588782e097d313a1a278b0faf2ccf2d3f6"
+checksum = "b0a1e6fa27f09ebddba280f5966ef435f3ac4d74cfc3ffe370fd3fd59c2e004d"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -2445,15 +2445,9 @@ checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax 0.7.1",
+ "regex-syntax",
 ]
 
-[[package]]
-name = "regex-syntax"
-version = "0.6.29"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
-
 [[package]]
 name = "regex-syntax"
 version = "0.7.1"
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index fe64446942..a73c7dff60 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -29,7 +29,7 @@ rust-version = "1.62"
 readme = "README.md"
 
 [dependencies]
-arrow = "38.0.0"
+arrow = "39.0.0"
 async-trait = "0.1.41"
 clap = { version = "3", features = ["derive", "cargo"] }
 datafusion = { path = "../datafusion/core", version = "24.0.0" }
diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs
index 0d436fa5c0..04b123fd4a 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar.rs
@@ -31,6 +31,7 @@ use crate::cast::{
 };
 use crate::delta::shift_months;
 use crate::error::{DataFusionError, Result};
+use arrow::buffer::NullBuffer;
 use arrow::compute::nullif;
 use arrow::datatypes::{FieldRef, Fields, SchemaBuilder};
 use arrow::{
@@ -2375,8 +2376,8 @@ impl ScalarValue {
                 let field_values = fields
                     .iter()
                     .zip(columns)
-                    .map(|(field, column)| -> Result<(Field, ArrayRef)> {
-                        Ok((field.as_ref().clone(), Self::iter_to_array(column)?))
+                    .map(|(field, column)| {
+                        Ok((field.clone(), Self::iter_to_array(column)?))
                     })
                     .collect::<Result<Vec<_>>>()?;
 
@@ -2546,7 +2547,7 @@ impl ScalarValue {
         let offsets_array = offsets.finish();
         let array_data = ArrayDataBuilder::new(data_type.clone())
             .len(offsets_array.len() - 1)
-            .null_bit_buffer(Some(valid.finish()))
+            .nulls(Some(NullBuffer::new(valid.finish())))
             .add_buffer(offsets_array.values().inner().clone())
             .add_child_data(flat_array.to_data());
 
@@ -2777,7 +2778,7 @@ impl ScalarValue {
                         .iter()
                         .zip(values.iter())
                         .map(|(field, value)| {
-                            (field.as_ref().clone(), value.to_array_of_size(size))
+                            (field.clone(), value.to_array_of_size(size))
                         })
                         .collect();
 
@@ -4647,17 +4648,17 @@ mod tests {
 
     #[test]
     fn test_scalar_struct() {
-        let field_a = Field::new("A", DataType::Int32, false);
-        let field_b = Field::new("B", DataType::Boolean, false);
-        let field_c = Field::new("C", DataType::Utf8, false);
+        let field_a = Arc::new(Field::new("A", DataType::Int32, false));
+        let field_b = Arc::new(Field::new("B", DataType::Boolean, false));
+        let field_c = Arc::new(Field::new("C", DataType::Utf8, false));
 
-        let field_e = Field::new("e", DataType::Int16, false);
-        let field_f = Field::new("f", DataType::Int64, false);
-        let field_d = Field::new(
+        let field_e = Arc::new(Field::new("e", DataType::Int16, false));
+        let field_f = Arc::new(Field::new("f", DataType::Int64, false));
+        let field_d = Arc::new(Field::new(
             "D",
             DataType::Struct(vec![field_e.clone(), field_f.clone()].into()),
             false,
-        );
+        ));
 
         let scalar = ScalarValue::Struct(
             Some(vec![
@@ -4824,12 +4825,12 @@ mod tests {
 
     #[test]
     fn test_lists_in_struct() {
-        let field_a = Field::new("A", DataType::Utf8, false);
-        let field_primitive_list = Field::new(
+        let field_a = Arc::new(Field::new("A", DataType::Utf8, false));
+        let field_primitive_list = Arc::new(Field::new(
             "primitive_list",
             DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
             false,
-        );
+        ));
 
         // Define primitive list scalars
         let l0 = ScalarValue::List(
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index ff64979b5c..cfbde41109 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -239,14 +239,13 @@ impl CsvFormat {
         pin_mut!(stream);
 
         while let Some(chunk) = stream.next().await.transpose()? {
+            let format = arrow::csv::reader::Format::default()
+                .with_header(self.has_header && first_chunk)
+                .with_delimiter(self.delimiter);
+
             let (Schema { fields, .. }, records_read) =
-                arrow::csv::reader::infer_reader_schema(
-                    chunk.reader(),
-                    self.delimiter,
-                    Some(records_to_read),
-                    // only consider header for first chunk
-                    self.has_header && first_chunk,
-                )?;
+                format.infer_schema(chunk.reader(), Some(records_to_read))?;
+
             records_to_read -= records_read;
             total_records_read += records_read;
 
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 0ffc16cb11..f2780cd469 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -1201,7 +1201,7 @@ mod tests {
                 .unwrap()
                 .metadata()
                 .clone();
-        check_page_index_validation(builder.page_indexes(), builder.offset_indexes());
+        check_page_index_validation(builder.column_index(), builder.offset_index());
 
         let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
         let file = File::open(path).await.unwrap();
@@ -1211,7 +1211,7 @@ mod tests {
             .unwrap()
             .metadata()
             .clone();
-        check_page_index_validation(builder.page_indexes(), builder.offset_indexes());
+        check_page_index_validation(builder.column_index(), builder.offset_index());
 
         Ok(())
     }
diff --git a/datafusion/core/src/physical_plan/file_format/csv.rs b/datafusion/core/src/physical_plan/file_format/csv.rs
index e7633807e0..f19cf0743a 100644
--- a/datafusion/core/src/physical_plan/file_format/csv.rs
+++ b/datafusion/core/src/physical_plan/file_format/csv.rs
@@ -221,23 +221,21 @@ impl CsvConfig {
 }
 
 impl CsvConfig {
-    fn open<R: std::io::Read>(&self, reader: R) -> csv::Reader<R> {
-        let datetime_format = None;
-        csv::Reader::new(
-            reader,
-            Arc::clone(&self.file_schema),
-            self.has_header,
-            Some(self.delimiter),
-            self.batch_size,
-            None,
-            self.file_projection.clone(),
-            datetime_format,
-        )
+    fn open<R: std::io::Read>(&self, reader: R) -> Result<csv::Reader<R>> {
+        let mut builder = csv::ReaderBuilder::new(self.file_schema.clone())
+            .has_header(self.has_header)
+            .with_delimiter(self.delimiter)
+            .with_batch_size(self.batch_size);
+
+        if let Some(p) = &self.file_projection {
+            builder = builder.with_projection(p.clone());
+        }
+
+        Ok(builder.build(reader)?)
     }
 
     fn builder(&self) -> csv::ReaderBuilder {
-        let mut builder = csv::ReaderBuilder::new()
-            .with_schema(self.file_schema.clone())
+        let mut builder = csv::ReaderBuilder::new(self.file_schema.clone())
             .with_delimiter(self.delimiter)
             .with_batch_size(self.batch_size)
             .has_header(self.has_header);
@@ -277,7 +275,7 @@ impl FileOpener for CsvOpener {
             match config.object_store.get(file_meta.location()).await? {
                 GetResult::File(file, _) => {
                     let decoder = file_compression_type.convert_read(file)?;
-                    Ok(futures::stream::iter(config.open(decoder)).boxed())
+                    Ok(futures::stream::iter(config.open(decoder)?).boxed())
                 }
                 GetResult::Stream(s) => {
                     let mut decoder = config.builder().build_decoder();
diff --git a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
index 58592b0605..410c43c57d 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
@@ -145,8 +145,8 @@ impl PagePruningPredicate {
             return Ok(None);
         }
 
-        let file_offset_indexes = file_metadata.offset_indexes();
-        let file_page_indexes = file_metadata.page_indexes();
+        let file_offset_indexes = file_metadata.offset_index();
+        let file_page_indexes = file_metadata.column_index();
         let (file_offset_indexes, file_page_indexes) =
             match (file_offset_indexes, file_page_indexes) {
                 (Some(o), Some(i)) => (o, i),
diff --git a/datafusion/physical-expr/src/struct_expressions.rs b/datafusion/physical-expr/src/struct_expressions.rs
index d03f94a497..dc8812b1ee 100644
--- a/datafusion/physical-expr/src/struct_expressions.rs
+++ b/datafusion/physical-expr/src/struct_expressions.rs
@@ -34,7 +34,7 @@ fn array_struct(args: &[ArrayRef]) -> Result<ArrayRef> {
     let vec: Vec<_> = args
         .iter()
         .enumerate()
-        .map(|(i, arg)| -> Result<(Field, ArrayRef)> {
+        .map(|(i, arg)| {
             let field_name = format!("c{i}");
             match arg.data_type() {
                 DataType::Utf8
@@ -50,7 +50,11 @@ fn array_struct(args: &[ArrayRef]) -> Result<ArrayRef> {
                 | DataType::UInt16
                 | DataType::UInt32
                 | DataType::UInt64 => Ok((
-                    Field::new(field_name.as_str(), arg.data_type().clone(), true),
+                    Arc::new(Field::new(
+                        field_name.as_str(),
+                        arg.data_type().clone(),
+                        true,
+                    )),
                     arg.clone(),
                 )),
                 data_type => Err(DataFusionError::NotImplemented(format!(