You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/04 18:18:14 UTC

[arrow-datafusion] branch master updated: #2109 schema infer max (#2139)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 69ba713c4 #2109 schema infer max (#2139)
69ba713c4 is described below

commit 69ba713c44ae3d0b13e8ec0eed7a60084bfc434e
Author: Rich <jy...@users.noreply.github.com>
AuthorDate: Mon Apr 4 14:18:10 2022 -0400

    #2109 schema infer max (#2139)
    
    * set default schema infer max record
    
    * fix unrelated issue "error: format argument must be a string literal" during `cargo test`
---
 datafusion/core/src/datasource/file_format/csv.rs  |  5 +++--
 datafusion/core/src/datasource/file_format/json.rs | 13 +++++++++++--
 datafusion/core/src/datasource/file_format/mod.rs  |  3 +++
 datafusion/core/src/execution/options.rs           |  9 +++++----
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index 29ca84a12..efcf4bde9 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -26,6 +26,7 @@ use async_trait::async_trait;
 use futures::StreamExt;
 
 use super::FileFormat;
+use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::error::Result;
 use crate::logical_plan::Expr;
 use crate::physical_plan::file_format::{CsvExec, FileScanConfig};
@@ -46,7 +47,7 @@ pub struct CsvFormat {
 impl Default for CsvFormat {
     fn default() -> Self {
         Self {
-            schema_infer_max_rec: None,
+            schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
             has_header: true,
             delimiter: b',',
         }
@@ -55,7 +56,7 @@ impl Default for CsvFormat {
 
 impl CsvFormat {
     /// Set a limit in terms of records to scan to infer the schema
-    /// - default to `None` (no limit)
+    /// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
     pub fn with_schema_infer_max_rec(mut self, max_rec: Option<usize>) -> Self {
         self.schema_infer_max_rec = max_rec;
         self
diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs
index da51d62bb..6d5961a6d 100644
--- a/datafusion/core/src/datasource/file_format/json.rs
+++ b/datafusion/core/src/datasource/file_format/json.rs
@@ -30,6 +30,7 @@ use futures::StreamExt;
 
 use super::FileFormat;
 use super::FileScanConfig;
+use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::error::Result;
 use crate::logical_plan::Expr;
 use crate::physical_plan::file_format::NdJsonExec;
@@ -40,14 +41,22 @@ use datafusion_data_access::object_store::{ObjectReader, ObjectReaderStream};
 /// The default file extension of json files
 pub const DEFAULT_JSON_EXTENSION: &str = ".json";
 /// New line delimited JSON `FileFormat` implementation.
-#[derive(Debug, Default)]
+#[derive(Debug)]
 pub struct JsonFormat {
     schema_infer_max_rec: Option<usize>,
 }
 
+impl Default for JsonFormat {
+    fn default() -> Self {
+        Self {
+            schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
+        }
+    }
+}
+
 impl JsonFormat {
     /// Set a limit in terms of records to scan to infer the schema
-    /// - defaults to `None` (no limit)
+    /// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
     pub fn with_schema_infer_max_rec(mut self, max_rec: Option<usize>) -> Self {
         self.schema_infer_max_rec = max_rec;
         self
diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index 062430a2d..d0334b7a9 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -17,6 +17,9 @@
 
 //! Module containing helper methods for the various file formats
 
+/// Default maximum number of records to scan when inferring a schema
+pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
+
 pub mod avro;
 pub mod csv;
 pub mod json;
diff --git a/datafusion/core/src/execution/options.rs b/datafusion/core/src/execution/options.rs
index b790ca3bf..a87ae7712 100644
--- a/datafusion/core/src/execution/options.rs
+++ b/datafusion/core/src/execution/options.rs
@@ -21,6 +21,7 @@ use std::sync::Arc;
 
 use arrow::datatypes::{Schema, SchemaRef};
 
+use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::datasource::{
     file_format::{
         avro::{AvroFormat, DEFAULT_AVRO_EXTENSION},
@@ -44,7 +45,7 @@ pub struct CsvReadOptions<'a> {
     /// An optional schema representing the CSV files. If None, CSV reader will try to infer it
     /// based on data in file.
     pub schema: Option<&'a Schema>,
-    /// Max number of rows to read from CSV files for schema inference if needed. Defaults to 1000.
+    /// Max number of rows to read from CSV files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`.
     pub schema_infer_max_records: usize,
     /// File extension; only files with this extension are selected for data input.
     /// Defaults to DEFAULT_CSV_EXTENSION.
@@ -65,7 +66,7 @@ impl<'a> CsvReadOptions<'a> {
         Self {
             has_header: true,
             schema: None,
-            schema_infer_max_records: 1000,
+            schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD,
             delimiter: b',',
             file_extension: DEFAULT_CSV_EXTENSION,
             table_partition_cols: vec![],
@@ -234,7 +235,7 @@ pub struct NdJsonReadOptions<'a> {
     /// The data source schema.
     pub schema: Option<SchemaRef>,
 
-    /// Max number of rows to read from CSV files for schema inference if needed. Defaults to 1000.
+    /// Max number of rows to read from JSON files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`.
     pub schema_infer_max_records: usize,
 
     /// File extension; only files with this extension are selected for data input.
@@ -248,7 +249,7 @@ impl<'a> Default for NdJsonReadOptions<'a> {
     fn default() -> Self {
         Self {
             schema: None,
-            schema_infer_max_records: 1000,
+            schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD,
             file_extension: DEFAULT_JSON_EXTENSION,
             table_partition_cols: vec![],
         }