You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/19 15:07:08 UTC
[arrow] branch master updated: ARROW-4556: [Rust] Preserve JSON field order when inferring schema
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 28bc3d1 ARROW-4556: [Rust] Preserve JSON field order when inferring schema
28bc3d1 is described below
commit 28bc3d1020137ab7148e4fb2294a7f6badc24592
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Tue Feb 19 16:03:13 2019 +0100
ARROW-4556: [Rust] Preserve JSON field order when inferring schema
Uses the `preserve_order` feature in `serde_json`, which is required to guarantee that we iterate through JSON records' keys in their provided order.
Adds the `indexmap` crate, which `serde_json` already uses.
Author: Neville Dipale <ne...@gmail.com>
Closes #3702 from nevi-me/arrow-4556 and squashes the following commits:
5ffbc723 <Neville Dipale> fmt
5de965ae <Neville Dipale> ARROW-4556: Preserve JSON field order when inferring schema
---
rust/arrow/Cargo.toml | 3 ++-
rust/arrow/src/datatypes.rs | 23 +++++++++++++++++++++--
rust/arrow/src/json/reader.rs | 25 +++++++++++++++++++++++--
3 files changed, 46 insertions(+), 5 deletions(-)
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index ae09ee8..0ff44cd 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -39,7 +39,8 @@ bytes = "0.4"
libc = "0.2"
serde = { version = "1.0.80", features = ["alloc", "rc"] }
serde_derive = "1.0.80"
-serde_json = "1.0.13"
+serde_json = { version = "1.0.13", features = ["preserve_order"] }
+indexmap = "1.0"
rand = "0.5"
csv = "1.0.0"
num = "0.2"
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index e5a0d0d..765eae0 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -665,7 +665,7 @@ mod tests {
assert_eq!(
"{\"name\":\"address\",\"nullable\":false,\"type\":{\"fields\":[\
{\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
- {\"name\":\"zip\",\"nullable\":false,\"type\":{\"bitWidth\":16,\"isSigned\":false,\"name\":\"int\"}}]}}",
+ {\"name\":\"zip\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}",
f.to_json().to_string()
);
}
@@ -745,7 +745,26 @@ mod tests {
]);
let json = schema.to_json().to_string();
- assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},{\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},{\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c7\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"SECOND\"}},{\"name\":\"c8\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"MILLISECOND\ [...]
+ assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
+ {\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},\
+ {\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},\
+ {\"name\":\"c7\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"SECOND\"}},\
+ {\"name\":\"c8\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MILLISECOND\"}},\
+ {\"name\":\"c9\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MICROSECOND\"}},\
+ {\"name\":\"c10\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"NANOSECOND\"}},\
+ {\"name\":\"c11\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"SECOND\"}},\
+ {\"name\":\"c12\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MILLISECOND\"}},\
+ {\"name\":\"c13\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MICROSECOND\"}},\
+ {\"name\":\"c14\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"NANOSECOND\"}},\
+ {\"name\":\"c15\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"SECOND\"}},\
+ {\"name\":\"c16\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MILLISECOND\"}},\
+ {\"name\":\"c17\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MICROSECOND\"}},\
+ {\"name\":\"c18\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"NANOSECOND\"}},\
+ {\"name\":\"c19\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"DAY_TIME\"}},\
+ {\"name\":\"c20\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"YEAR_MONTH\"}},\
+ {\"name\":\"c21\",\"nullable\":false,\"type\":{\"fields\":[\
+ {\"name\":\"a\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
+ {\"name\":\"b\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}]}");
// convert back to a schema
let value: Value = serde_json::from_str(&json).unwrap();
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index ab2f42f..5ab0ce0 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -42,7 +42,8 @@
//! let batch = json.next().unwrap().unwrap();
//! ```
-use std::collections::{HashMap, HashSet};
+use indexmap::map::IndexMap as HashMap;
+use indexmap::set::IndexSet as HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::sync::Arc;
@@ -472,8 +473,21 @@ impl<R: Read> Reader<R> {
})
.collect();
+ let projected_fields: Vec<Field> = if projection.is_empty() {
+ self.schema.fields().to_vec()
+ } else {
+ projection
+ .iter()
+ .map(|name| self.schema.column_with_name(name))
+ .filter_map(|c| c)
+ .map(|(_, field)| field.clone())
+ .collect()
+ };
+
+ let projected_schema = Arc::new(Schema::new(projected_fields));
+
match arrays {
- Ok(arr) => Ok(Some(RecordBatch::new(self.schema.clone(), arr))),
+ Ok(arr) => Ok(Some(RecordBatch::new(projected_schema, arr))),
Err(e) => Err(e),
}
}
@@ -728,12 +742,16 @@ mod tests {
let schema = batch.schema();
let a = schema.column_with_name("a").unwrap();
+ assert_eq!(0, a.0);
assert_eq!(&DataType::Int64, a.1.data_type());
let b = schema.column_with_name("b").unwrap();
+ assert_eq!(1, b.0);
assert_eq!(&DataType::Float64, b.1.data_type());
let c = schema.column_with_name("c").unwrap();
+ assert_eq!(2, c.0);
assert_eq!(&DataType::Boolean, c.1.data_type());
let d = schema.column_with_name("d").unwrap();
+ assert_eq!(3, d.0);
assert_eq!(&DataType::Utf8, d.1.data_type());
let aa = batch
@@ -891,13 +909,16 @@ mod tests {
let batch = reader.next().unwrap().unwrap();
assert_eq!(2, batch.num_columns());
+ assert_eq!(2, batch.schema().fields().len());
assert_eq!(12, batch.num_rows());
let schema = batch.schema();
let a = schema.column_with_name("a").unwrap();
+ assert_eq!(0, a.0);
assert_eq!(&DataType::Int32, a.1.data_type());
let c = schema.column_with_name("c").unwrap();
+ assert_eq!(1, c.0);
assert_eq!(&DataType::Boolean, c.1.data_type());
}