You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/19 15:07:08 UTC

[arrow] branch master updated: ARROW-4556: [Rust] Preserve JSON field order when inferring schema

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 28bc3d1  ARROW-4556: [Rust] Preserve JSON field order when inferring schema
28bc3d1 is described below

commit 28bc3d1020137ab7148e4fb2294a7f6badc24592
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Tue Feb 19 16:03:13 2019 +0100

    ARROW-4556: [Rust] Preserve JSON field order when inferring schema
    
    Uses the `preserve_order` feature in `serde_json`, which is required to guarantee that we iterate through JSON records' keys in their provided order.
    Adds the `indexmap` crate, which `serde_json` already uses.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #3702 from nevi-me/arrow-4556 and squashes the following commits:
    
    5ffbc723 <Neville Dipale> fmt
    5de965ae <Neville Dipale> ARROW-4556:  Preserve JSON field order when inferring schema
---
 rust/arrow/Cargo.toml         |  3 ++-
 rust/arrow/src/datatypes.rs   | 23 +++++++++++++++++++++--
 rust/arrow/src/json/reader.rs | 25 +++++++++++++++++++++++--
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index ae09ee8..0ff44cd 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -39,7 +39,8 @@ bytes = "0.4"
 libc = "0.2"
 serde = { version = "1.0.80", features = ["alloc", "rc"] }
 serde_derive = "1.0.80"
-serde_json = "1.0.13"
+serde_json = { version = "1.0.13", features = ["preserve_order"] }
+indexmap = "1.0"
 rand = "0.5"
 csv = "1.0.0"
 num = "0.2"
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index e5a0d0d..765eae0 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -665,7 +665,7 @@ mod tests {
         assert_eq!(
             "{\"name\":\"address\",\"nullable\":false,\"type\":{\"fields\":[\
             {\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
-            {\"name\":\"zip\",\"nullable\":false,\"type\":{\"bitWidth\":16,\"isSigned\":false,\"name\":\"int\"}}]}}",
+            {\"name\":\"zip\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}",
             f.to_json().to_string()
         );
     }
@@ -745,7 +745,26 @@ mod tests {
         ]);
 
         let json = schema.to_json().to_string();
-        assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},{\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},{\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c7\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"SECOND\"}},{\"name\":\"c8\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"MILLISECOND\ [...]
+        assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
+        {\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},\
+        {\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},\
+        {\"name\":\"c7\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"SECOND\"}},\
+        {\"name\":\"c8\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MILLISECOND\"}},\
+        {\"name\":\"c9\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MICROSECOND\"}},\
+        {\"name\":\"c10\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"NANOSECOND\"}},\
+        {\"name\":\"c11\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"SECOND\"}},\
+        {\"name\":\"c12\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MILLISECOND\"}},\
+        {\"name\":\"c13\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MICROSECOND\"}},\
+        {\"name\":\"c14\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"NANOSECOND\"}},\
+        {\"name\":\"c15\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"SECOND\"}},\
+        {\"name\":\"c16\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MILLISECOND\"}},\
+        {\"name\":\"c17\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MICROSECOND\"}},\
+        {\"name\":\"c18\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"NANOSECOND\"}},\
+        {\"name\":\"c19\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"DAY_TIME\"}},\
+        {\"name\":\"c20\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"YEAR_MONTH\"}},\
+        {\"name\":\"c21\",\"nullable\":false,\"type\":{\"fields\":[\
+        {\"name\":\"a\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
+        {\"name\":\"b\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}]}");
 
         // convert back to a schema
         let value: Value = serde_json::from_str(&json).unwrap();
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index ab2f42f..5ab0ce0 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -42,7 +42,8 @@
 //! let batch = json.next().unwrap().unwrap();
 //! ```
 
-use std::collections::{HashMap, HashSet};
+use indexmap::map::IndexMap as HashMap;
+use indexmap::set::IndexSet as HashSet;
 use std::fs::File;
 use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
 use std::sync::Arc;
@@ -472,8 +473,21 @@ impl<R: Read> Reader<R> {
             })
             .collect();
 
+        let projected_fields: Vec<Field> = if projection.is_empty() {
+            self.schema.fields().to_vec()
+        } else {
+            projection
+                .iter()
+                .map(|name| self.schema.column_with_name(name))
+                .filter_map(|c| c)
+                .map(|(_, field)| field.clone())
+                .collect()
+        };
+
+        let projected_schema = Arc::new(Schema::new(projected_fields));
+
         match arrays {
-            Ok(arr) => Ok(Some(RecordBatch::new(self.schema.clone(), arr))),
+            Ok(arr) => Ok(Some(RecordBatch::new(projected_schema, arr))),
             Err(e) => Err(e),
         }
     }
@@ -728,12 +742,16 @@ mod tests {
         let schema = batch.schema();
 
         let a = schema.column_with_name("a").unwrap();
+        assert_eq!(0, a.0);
         assert_eq!(&DataType::Int64, a.1.data_type());
         let b = schema.column_with_name("b").unwrap();
+        assert_eq!(1, b.0);
         assert_eq!(&DataType::Float64, b.1.data_type());
         let c = schema.column_with_name("c").unwrap();
+        assert_eq!(2, c.0);
         assert_eq!(&DataType::Boolean, c.1.data_type());
         let d = schema.column_with_name("d").unwrap();
+        assert_eq!(3, d.0);
         assert_eq!(&DataType::Utf8, d.1.data_type());
 
         let aa = batch
@@ -891,13 +909,16 @@ mod tests {
         let batch = reader.next().unwrap().unwrap();
 
         assert_eq!(2, batch.num_columns());
+        assert_eq!(2, batch.schema().fields().len());
         assert_eq!(12, batch.num_rows());
 
         let schema = batch.schema();
 
         let a = schema.column_with_name("a").unwrap();
+        assert_eq!(0, a.0);
         assert_eq!(&DataType::Int32, a.1.data_type());
         let c = schema.column_with_name("c").unwrap();
+        assert_eq!(1, c.0);
         assert_eq!(&DataType::Boolean, c.1.data_type());
     }