You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/07/13 19:10:21 UTC
[arrow-datafusion] branch main updated: Pass infer max records to JsonFormat. (#6945)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 3de209e951 Pass infer max records to JsonFormat. (#6945)
3de209e951 is described below

commit 3de209e95158d59891174ee73595c90a9f1b3cde
Author: vincev <vi...@gmail.com>
AuthorDate: Thu Jul 13 21:10:14 2023 +0200

    Pass infer max records to JsonFormat. (#6945)
---
 .../core/src/datasource/file_format/options.rs     |  1 +
 .../core/src/datasource/physical_plan/json.rs      | 30 ++++++++++++++++++++++
 datafusion/core/tests/data/4.json                  |  4 +++
 3 files changed, 35 insertions(+)

diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs
index 5694bf5380..6155dc6640 100644
--- a/datafusion/core/src/datasource/file_format/options.rs
+++ b/datafusion/core/src/datasource/file_format/options.rs
@@ -489,6 +489,7 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> {
 impl ReadOptions<'_> for NdJsonReadOptions<'_> {
     fn to_listing_options(&self, config: &SessionConfig) -> ListingOptions {
         let file_format = JsonFormat::default()
+            .with_schema_infer_max_rec(Some(self.schema_infer_max_records))
             .with_file_compression_type(self.file_compression_type.to_owned());
 
         ListingOptions::new(Arc::new(file_format))
diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs
index e9082b8084..541e448cfe 100644
--- a/datafusion/core/src/datasource/physical_plan/json.rs
+++ b/datafusion/core/src/datasource/physical_plan/json.rs
@@ -733,4 +733,34 @@ mod tests {
         assert_eq!("Arrow error: Parser error: Error while parsing value d for column 0 at line 4", format!("{e}"));
         Ok(())
     }
+
+    #[tokio::test]
+    async fn ndjson_schema_infer_max_records() -> Result<()> {
+        async fn read_test_data(schema_infer_max_records: usize) -> Result<SchemaRef> {
+            let ctx = SessionContext::new();
+
+            let options = NdJsonReadOptions {
+                schema_infer_max_records,
+                ..Default::default()
+            };
+
+            let batches = ctx
+                .read_json("tests/data/4.json", options)
+                .await?
+                .collect()
+                .await?;
+
+            Ok(batches[0].schema())
+        }
+
+        // Infer the schema from only the first 2 rows; those rows contain just "a" and "b".
+        let schema = read_test_data(2).await?;
+        assert_eq!(schema.fields().len(), 2);
+
+        // A limit of 10 covers all 4 rows; together they contain 5 fields ("a" through "e").
+        let schema = read_test_data(10).await?;
+        assert_eq!(schema.fields().len(), 5);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/tests/data/4.json b/datafusion/core/tests/data/4.json
new file mode 100644
index 0000000000..f0c67cd7cf
--- /dev/null
+++ b/datafusion/core/tests/data/4.json
@@ -0,0 +1,4 @@
+{"a":1, "b":[2.0, 1.3, -6.1]}
+{"a":2, "b":[3.0, 4.3]}
+{"c":[false, true], "d":{"c1": 23, "c2": 32}}
+{"e": {"e1": 2, "e2": 12.3}}
\ No newline at end of file