You are viewing a plain text version of this content; the hyperlink to the canonical (HTML) version was removed in this rendering — see the original mailing-list archive for it.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/10 09:29:34 UTC
[arrow-datafusion] branch main updated: Update Arrow 39 (#6252)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e6d7e46ded Update Arrow 39 (#6252)
e6d7e46ded is described below
commit e6d7e46dedbe5046e4606bfd3d7a1199dd0aaae2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed May 10 10:29:26 2023 +0100
Update Arrow 39 (#6252)
* Update Arrow 39
* Update pin
---
Cargo.toml | 12 ++--
datafusion-cli/Cargo.lock | 72 ++++++++++------------
datafusion-cli/Cargo.toml | 2 +-
datafusion/common/src/scalar.rs | 29 ++++-----
datafusion/core/src/datasource/file_format/csv.rs | 13 ++--
.../core/src/datasource/file_format/parquet.rs | 4 +-
.../core/src/physical_plan/file_format/csv.rs | 28 ++++-----
.../file_format/parquet/page_filter.rs | 4 +-
datafusion/physical-expr/src/struct_expressions.rs | 8 ++-
9 files changed, 84 insertions(+), 88 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index bc1e4625a0..a0efd150db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,12 +45,12 @@ repository = "https://github.com/apache/arrow-datafusion"
rust-version = "1.64"
[workspace.dependencies]
-arrow = { version = "38.0.0", features = ["prettyprint"] }
-arrow-flight = { version = "38.0.0", features = ["flight-sql-experimental"] }
-arrow-buffer = { version = "38.0.0", default-features = false }
-arrow-schema = { version = "38.0.0", default-features = false }
-arrow-array = { version = "38.0.0", default-features = false, features = ["chrono-tz"] }
-parquet = { version = "38.0.0", features = ["arrow", "async", "object_store"] }
+arrow = { version = "39.0.0", features = ["prettyprint"] }
+arrow-flight = { version = "39.0.0", features = ["flight-sql-experimental"] }
+arrow-buffer = { version = "39.0.0", default-features = false }
+arrow-schema = { version = "39.0.0", default-features = false }
+arrow-array = { version = "39.0.0", default-features = false, features = ["chrono-tz"] }
+parquet = { version = "39.0.0", features = ["arrow", "async", "object_store"] }
[profile.release]
codegen-units = 1
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 399231702d..05415def8b 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -68,9 +68,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
[[package]]
name = "arrow"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c107a57b5913d852da9d5a40e280e4695f2258b5b87733c13b770c63a7117287"
+checksum = "218ca81dd088b102c0fd6687c72e73fad1ba93d2ef7b3cf9a1043b04b2c39dbf"
dependencies = [
"ahash",
"arrow-arith",
@@ -90,9 +90,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ace6aa3d5617c5d03041a05e01c6819428a8ddf49dd0b055df9b40fef9d96094"
+checksum = "d49309fa2299ec34a709cfc9f487c41ecaead96d1ab70e21857466346bbbd690"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -105,9 +105,9 @@ dependencies = [
[[package]]
name = "arrow-array"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "104a04520692cc674e6afd7682f213ca41f9b13ff1873f63a5a2857a590b87b3"
+checksum = "e7a27466d897d99654357a6d95dc0a26931d9e4306e60c14fc31a894edb86579"
dependencies = [
"ahash",
"arrow-buffer",
@@ -122,9 +122,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72c875bcb9530ec403998fb0b2dc6d180a7c64563ca4bc22b90eafb84b113143"
+checksum = "9405b78106a9d767c7b97c78a70ee1b23ee51a74f5188a821a716d9a85d1af2b"
dependencies = [
"half",
"num",
@@ -132,9 +132,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6d6e18281636c8fc0b93be59834da6bf9a72bb70fd0c98ddfdaf124da466c28"
+checksum = "be0ec5a79a87783dc828b7ff8f89f62880b3f553bc5f5b932a82f4a1035024b4"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -149,9 +149,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3197dab0963a236ff8e7c82e2272535745955ac1321eb740c29f2f88b353f54e"
+checksum = "350d8e55c3b2d602a0a04389bcc1da40167657143a9922a7103190603e7b7692"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -168,9 +168,9 @@ dependencies = [
[[package]]
name = "arrow-data"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb68113d6ecdbe8bba48b2c4042c151bf9e1c61244e45072a50250a6fc59bafe"
+checksum = "c6f710d98964d2c069b8baf566130045e79e11baa105623f038a6c942f805681"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -180,9 +180,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eab4bbf2dd3078facb5ce0a9641316a64f42bfd8cf357e6775c8a5e6708e3a8d"
+checksum = "9c99787cb8fabc187285da9e7182d22f2b80ecfac61ca0a42c4299e9eecdf903"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -194,9 +194,9 @@ dependencies = [
[[package]]
name = "arrow-json"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c5b650d23746a494665d914a7fa3d21d939153cff9d53bdebe39bffa88f263"
+checksum = "91c95a58ce63f60d80d7a3a1222d65df0bc060b71d31353c34a8118c2a6eae7b"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -214,9 +214,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68c6fce28e5011e30acc7466b5efcb8ed0197c396240bd2b10e167f275a3c208"
+checksum = "4141e6488610cc144e841da3de5f5371488f3cf5bc6bc7b3e752c64e7639c31b"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -229,9 +229,9 @@ dependencies = [
[[package]]
name = "arrow-row"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f20a421f19799d8b93eb8edde5217e910fa1e2d6ceb3c529f000e57b6db144c0"
+checksum = "940191a3c636c111c41e816325b0941484bf904c46de72cd9553acd1afd24d33"
dependencies = [
"ahash",
"arrow-array",
@@ -244,15 +244,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc85923d8d6662cc66ac6602c7d1876872e671002d60993dfdf492a6badeae92"
+checksum = "18c41d058b2895a12f46dfafc306ee3529ad9660406be0ab8a7967d5e27c417e"
[[package]]
name = "arrow-select"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6ab6613ce65b61d85a3410241744e84e48fbab0fe06e1251b4429d21b3470fd"
+checksum = "9fcbdda2772b7e712e77444f3a71f4ee517095aceb993b35de71de41c70d9b4f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -263,9 +263,9 @@ dependencies = [
[[package]]
name = "arrow-string"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3008641239e884aefba66d8b8532da6af40d14296349fcc85935de4ba67b89e"
+checksum = "7081c34f4b534ad320a03db79d58e38972041bb7c65686b98bbcc2f9a67a9cee"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -273,7 +273,7 @@ dependencies = [
"arrow-schema",
"arrow-select",
"regex",
- "regex-syntax 0.6.29",
+ "regex-syntax",
]
[[package]]
@@ -1127,7 +1127,7 @@ dependencies = [
"hashbrown 0.13.2",
"itertools",
"log",
- "regex-syntax 0.7.1",
+ "regex-syntax",
]
[[package]]
@@ -2165,9 +2165,9 @@ dependencies = [
[[package]]
name = "parquet"
-version = "38.0.0"
+version = "39.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4cbd51311f8d9ff3d2697b1522b18a588782e097d313a1a278b0faf2ccf2d3f6"
+checksum = "b0a1e6fa27f09ebddba280f5966ef435f3ac4d74cfc3ffe370fd3fd59c2e004d"
dependencies = [
"ahash",
"arrow-array",
@@ -2445,15 +2445,9 @@ checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370"
dependencies = [
"aho-corasick",
"memchr",
- "regex-syntax 0.7.1",
+ "regex-syntax",
]
-[[package]]
-name = "regex-syntax"
-version = "0.6.29"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
-
[[package]]
name = "regex-syntax"
version = "0.7.1"
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index fe64446942..a73c7dff60 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -29,7 +29,7 @@ rust-version = "1.62"
readme = "README.md"
[dependencies]
-arrow = "38.0.0"
+arrow = "39.0.0"
async-trait = "0.1.41"
clap = { version = "3", features = ["derive", "cargo"] }
datafusion = { path = "../datafusion/core", version = "24.0.0" }
diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs
index 0d436fa5c0..04b123fd4a 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar.rs
@@ -31,6 +31,7 @@ use crate::cast::{
};
use crate::delta::shift_months;
use crate::error::{DataFusionError, Result};
+use arrow::buffer::NullBuffer;
use arrow::compute::nullif;
use arrow::datatypes::{FieldRef, Fields, SchemaBuilder};
use arrow::{
@@ -2375,8 +2376,8 @@ impl ScalarValue {
let field_values = fields
.iter()
.zip(columns)
- .map(|(field, column)| -> Result<(Field, ArrayRef)> {
- Ok((field.as_ref().clone(), Self::iter_to_array(column)?))
+ .map(|(field, column)| {
+ Ok((field.clone(), Self::iter_to_array(column)?))
})
.collect::<Result<Vec<_>>>()?;
@@ -2546,7 +2547,7 @@ impl ScalarValue {
let offsets_array = offsets.finish();
let array_data = ArrayDataBuilder::new(data_type.clone())
.len(offsets_array.len() - 1)
- .null_bit_buffer(Some(valid.finish()))
+ .nulls(Some(NullBuffer::new(valid.finish())))
.add_buffer(offsets_array.values().inner().clone())
.add_child_data(flat_array.to_data());
@@ -2777,7 +2778,7 @@ impl ScalarValue {
.iter()
.zip(values.iter())
.map(|(field, value)| {
- (field.as_ref().clone(), value.to_array_of_size(size))
+ (field.clone(), value.to_array_of_size(size))
})
.collect();
@@ -4647,17 +4648,17 @@ mod tests {
#[test]
fn test_scalar_struct() {
- let field_a = Field::new("A", DataType::Int32, false);
- let field_b = Field::new("B", DataType::Boolean, false);
- let field_c = Field::new("C", DataType::Utf8, false);
+ let field_a = Arc::new(Field::new("A", DataType::Int32, false));
+ let field_b = Arc::new(Field::new("B", DataType::Boolean, false));
+ let field_c = Arc::new(Field::new("C", DataType::Utf8, false));
- let field_e = Field::new("e", DataType::Int16, false);
- let field_f = Field::new("f", DataType::Int64, false);
- let field_d = Field::new(
+ let field_e = Arc::new(Field::new("e", DataType::Int16, false));
+ let field_f = Arc::new(Field::new("f", DataType::Int64, false));
+ let field_d = Arc::new(Field::new(
"D",
DataType::Struct(vec![field_e.clone(), field_f.clone()].into()),
false,
- );
+ ));
let scalar = ScalarValue::Struct(
Some(vec![
@@ -4824,12 +4825,12 @@ mod tests {
#[test]
fn test_lists_in_struct() {
- let field_a = Field::new("A", DataType::Utf8, false);
- let field_primitive_list = Field::new(
+ let field_a = Arc::new(Field::new("A", DataType::Utf8, false));
+ let field_primitive_list = Arc::new(Field::new(
"primitive_list",
DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
false,
- );
+ ));
// Define primitive list scalars
let l0 = ScalarValue::List(
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index ff64979b5c..cfbde41109 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -239,14 +239,13 @@ impl CsvFormat {
pin_mut!(stream);
while let Some(chunk) = stream.next().await.transpose()? {
+ let format = arrow::csv::reader::Format::default()
+ .with_header(self.has_header && first_chunk)
+ .with_delimiter(self.delimiter);
+
let (Schema { fields, .. }, records_read) =
- arrow::csv::reader::infer_reader_schema(
- chunk.reader(),
- self.delimiter,
- Some(records_to_read),
- // only consider header for first chunk
- self.has_header && first_chunk,
- )?;
+ format.infer_schema(chunk.reader(), Some(records_to_read))?;
+
records_to_read -= records_read;
total_records_read += records_read;
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 0ffc16cb11..f2780cd469 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -1201,7 +1201,7 @@ mod tests {
.unwrap()
.metadata()
.clone();
- check_page_index_validation(builder.page_indexes(), builder.offset_indexes());
+ check_page_index_validation(builder.column_index(), builder.offset_index());
let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
let file = File::open(path).await.unwrap();
@@ -1211,7 +1211,7 @@ mod tests {
.unwrap()
.metadata()
.clone();
- check_page_index_validation(builder.page_indexes(), builder.offset_indexes());
+ check_page_index_validation(builder.column_index(), builder.offset_index());
Ok(())
}
diff --git a/datafusion/core/src/physical_plan/file_format/csv.rs b/datafusion/core/src/physical_plan/file_format/csv.rs
index e7633807e0..f19cf0743a 100644
--- a/datafusion/core/src/physical_plan/file_format/csv.rs
+++ b/datafusion/core/src/physical_plan/file_format/csv.rs
@@ -221,23 +221,21 @@ impl CsvConfig {
}
impl CsvConfig {
- fn open<R: std::io::Read>(&self, reader: R) -> csv::Reader<R> {
- let datetime_format = None;
- csv::Reader::new(
- reader,
- Arc::clone(&self.file_schema),
- self.has_header,
- Some(self.delimiter),
- self.batch_size,
- None,
- self.file_projection.clone(),
- datetime_format,
- )
+ fn open<R: std::io::Read>(&self, reader: R) -> Result<csv::Reader<R>> {
+ let mut builder = csv::ReaderBuilder::new(self.file_schema.clone())
+ .has_header(self.has_header)
+ .with_delimiter(self.delimiter)
+ .with_batch_size(self.batch_size);
+
+ if let Some(p) = &self.file_projection {
+ builder = builder.with_projection(p.clone());
+ }
+
+ Ok(builder.build(reader)?)
}
fn builder(&self) -> csv::ReaderBuilder {
- let mut builder = csv::ReaderBuilder::new()
- .with_schema(self.file_schema.clone())
+ let mut builder = csv::ReaderBuilder::new(self.file_schema.clone())
.with_delimiter(self.delimiter)
.with_batch_size(self.batch_size)
.has_header(self.has_header);
@@ -277,7 +275,7 @@ impl FileOpener for CsvOpener {
match config.object_store.get(file_meta.location()).await? {
GetResult::File(file, _) => {
let decoder = file_compression_type.convert_read(file)?;
- Ok(futures::stream::iter(config.open(decoder)).boxed())
+ Ok(futures::stream::iter(config.open(decoder)?).boxed())
}
GetResult::Stream(s) => {
let mut decoder = config.builder().build_decoder();
diff --git a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
index 58592b0605..410c43c57d 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
@@ -145,8 +145,8 @@ impl PagePruningPredicate {
return Ok(None);
}
- let file_offset_indexes = file_metadata.offset_indexes();
- let file_page_indexes = file_metadata.page_indexes();
+ let file_offset_indexes = file_metadata.offset_index();
+ let file_page_indexes = file_metadata.column_index();
let (file_offset_indexes, file_page_indexes) =
match (file_offset_indexes, file_page_indexes) {
(Some(o), Some(i)) => (o, i),
diff --git a/datafusion/physical-expr/src/struct_expressions.rs b/datafusion/physical-expr/src/struct_expressions.rs
index d03f94a497..dc8812b1ee 100644
--- a/datafusion/physical-expr/src/struct_expressions.rs
+++ b/datafusion/physical-expr/src/struct_expressions.rs
@@ -34,7 +34,7 @@ fn array_struct(args: &[ArrayRef]) -> Result<ArrayRef> {
let vec: Vec<_> = args
.iter()
.enumerate()
- .map(|(i, arg)| -> Result<(Field, ArrayRef)> {
+ .map(|(i, arg)| {
let field_name = format!("c{i}");
match arg.data_type() {
DataType::Utf8
@@ -50,7 +50,11 @@ fn array_struct(args: &[ArrayRef]) -> Result<ArrayRef> {
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64 => Ok((
- Field::new(field_name.as_str(), arg.data_type().clone(), true),
+ Arc::new(Field::new(
+ field_name.as_str(),
+ arg.data_type().clone(),
+ true,
+ )),
arg.clone(),
)),
data_type => Err(DataFusionError::NotImplemented(format!(