You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/21 16:59:20 UTC
[arrow-rs] branch master updated: refactor: convert `Field::metadata` to `HashMap` (#3148)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 57f91f2e8 refactor: convert `Field::metadata` to `HashMap` (#3148)
57f91f2e8 is described below
commit 57f91f2e82184d6e04aab296448d1fe7352a7822
Author: Marco Neumann <ma...@crepererum.net>
AuthorDate: Mon Nov 21 16:59:14 2022 +0000
refactor: convert `Field::metadata` to `HashMap` (#3148)
* refactor: convert `Field::metadata` to `HashMap`
Closes #2262.
* refactor: code formatting
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
arrow-integration-test/src/field.rs | 8 ++--
arrow-ipc/src/convert.rs | 6 +--
arrow-schema/src/datatype.rs | 6 +--
arrow-schema/src/field.rs | 85 +++++++++++++++++++++++++++++--------
arrow-schema/src/schema.rs | 23 +++++-----
parquet/src/arrow/schema/complex.rs | 4 +-
6 files changed, 90 insertions(+), 42 deletions(-)
diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs
index 5b5863557..4bfbf8e99 100644
--- a/arrow-integration-test/src/field.rs
+++ b/arrow-integration-test/src/field.rs
@@ -18,7 +18,7 @@
use crate::{data_type_from_json, data_type_to_json};
use arrow::datatypes::{DataType, Field};
use arrow::error::{ArrowError, Result};
-use std::collections::BTreeMap;
+use std::collections::HashMap;
/// Parse a `Field` definition from a JSON representation.
pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
@@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
let metadata = match map.get("metadata") {
Some(&Value::Array(ref values)) => {
- let mut res: BTreeMap<String, String> = BTreeMap::default();
+ let mut res: HashMap<String, String> = HashMap::default();
for value in values {
match value.as_object() {
Some(map) => {
@@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// We also support map format, because Schema's metadata supports this.
// See https://github.com/apache/arrow/pull/5907
Some(&Value::Object(ref values)) => {
- let mut res: BTreeMap<String, String> = BTreeMap::default();
+ let mut res: HashMap<String, String> = HashMap::default();
for (k, v) in values {
if let Some(str_value) = v.as_str() {
res.insert(k.clone(), str_value.to_string().clone());
@@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
"Field `metadata` is not json array".to_string(),
));
}
- _ => BTreeMap::default(),
+ _ => HashMap::default(),
};
// if data_type is a struct or list, get its children
diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs
index a9dda6f2a..e11d64a47 100644
--- a/arrow-ipc/src/convert.rs
+++ b/arrow-ipc/src/convert.rs
@@ -21,7 +21,7 @@ use arrow_schema::*;
use flatbuffers::{
FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset,
};
-use std::collections::{BTreeMap, HashMap};
+use std::collections::HashMap;
use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER};
use DataType::*;
@@ -86,7 +86,7 @@ impl<'a> From<crate::Field<'a>> for Field {
)
};
- let mut metadata_map = BTreeMap::default();
+ let mut metadata_map = HashMap::default();
if let Some(list) = field.custom_metadata() {
for kv in list {
if let (Some(k), Some(v)) = (kv.key(), kv.value()) {
@@ -812,7 +812,7 @@ mod tests {
.iter()
.cloned()
.collect();
- let field_md: BTreeMap<String, String> = [("k".to_string(), "v".to_string())]
+ let field_md: HashMap<String, String> = [("k".to_string(), "v".to_string())]
.iter()
.cloned()
.collect();
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 90ae42942..acf369145 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -381,10 +381,10 @@ mod tests {
#[test]
#[cfg(feature = "serde")]
fn serde_struct_type() {
- use std::collections::BTreeMap;
+ use std::collections::HashMap;
let kv_array = [("k".to_string(), "v".to_string())];
- let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
+ let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
// Non-empty map: should be converted as JSON obj { ... }
let first_name =
@@ -392,7 +392,7 @@ mod tests {
// Empty map: should be omitted.
let last_name = Field::new("last_name", DataType::Utf8, false)
- .with_metadata(BTreeMap::default());
+ .with_metadata(HashMap::default());
let person = DataType::Struct(vec![
first_name,
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index b1de65e55..cd9024747 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -17,7 +17,7 @@
use crate::error::ArrowError;
use std::cmp::Ordering;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use crate::datatype::DataType;
@@ -37,9 +37,9 @@ pub struct Field {
/// A map of key-value pairs containing additional custom meta data.
#[cfg_attr(
feature = "serde",
- serde(skip_serializing_if = "BTreeMap::is_empty", default)
+ serde(skip_serializing_if = "HashMap::is_empty", default)
)]
- metadata: BTreeMap<String, String>,
+ metadata: HashMap<String, String>,
}
// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
@@ -68,9 +68,33 @@ impl Ord for Field {
fn cmp(&self, other: &Self) -> Ordering {
self.name
.cmp(other.name())
- .then(self.data_type.cmp(other.data_type()))
- .then(self.nullable.cmp(&other.nullable))
- .then(self.metadata.cmp(&other.metadata))
+ .then_with(|| self.data_type.cmp(other.data_type()))
+ .then_with(|| self.nullable.cmp(&other.nullable))
+ .then_with(|| {
+ // ensure deterministic key order
+ let mut keys: Vec<&String> =
+ self.metadata.keys().chain(other.metadata.keys()).collect();
+ keys.sort();
+ for k in keys {
+ match (self.metadata.get(k), other.metadata.get(k)) {
+ (None, None) => {}
+ (Some(_), None) => {
+ return Ordering::Less;
+ }
+ (None, Some(_)) => {
+ return Ordering::Greater;
+ }
+ (Some(v1), Some(v2)) => match v1.cmp(v2) {
+ Ordering::Equal => {}
+ other => {
+ return other;
+ }
+ },
+ }
+ }
+
+ Ordering::Equal
+ })
}
}
@@ -79,7 +103,14 @@ impl Hash for Field {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
- self.metadata.hash(state);
+
+ // ensure deterministic key order
+ let mut keys: Vec<&String> = self.metadata.keys().collect();
+ keys.sort();
+ for k in keys {
+ k.hash(state);
+ self.metadata.get(k).expect("key valid").hash(state);
+ }
}
}
@@ -92,7 +123,7 @@ impl Field {
nullable,
dict_id: 0,
dict_is_ordered: false,
- metadata: BTreeMap::default(),
+ metadata: HashMap::default(),
}
}
@@ -110,29 +141,29 @@ impl Field {
nullable,
dict_id,
dict_is_ordered,
- metadata: BTreeMap::default(),
+ metadata: HashMap::default(),
}
}
/// Sets the `Field`'s optional custom metadata.
/// The metadata is set as `None` for empty map.
#[inline]
- pub fn set_metadata(&mut self, metadata: BTreeMap<String, String>) {
- self.metadata = BTreeMap::default();
+ pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
+ self.metadata = HashMap::default();
if !metadata.is_empty() {
self.metadata = metadata;
}
}
/// Sets the metadata of this `Field` to be `metadata` and returns self
- pub fn with_metadata(mut self, metadata: BTreeMap<String, String>) -> Self {
+ pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
self.set_metadata(metadata);
self
}
/// Returns the immutable reference to the `Field`'s optional custom metadata.
#[inline]
- pub const fn metadata(&self) -> &BTreeMap<String, String> {
+ pub const fn metadata(&self) -> &HashMap<String, String> {
&self.metadata
}
@@ -545,10 +576,30 @@ mod test {
assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
}
+ #[test]
+ fn test_field_comparison_metadata() {
+ let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+ (String::from("k1"), String::from("v1")),
+ (String::from("k2"), String::from("v2")),
+ ]));
+ let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+ (String::from("k1"), String::from("v1")),
+ (String::from("k3"), String::from("v3")),
+ ]));
+ let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+ (String::from("k1"), String::from("v1")),
+ (String::from("k3"), String::from("v4")),
+ ]));
+
+ assert!(f1.cmp(&f2).is_lt());
+ assert!(f2.cmp(&f3).is_lt());
+ assert!(f1.cmp(&f3).is_lt());
+ }
+
#[test]
fn test_contains_reflexivity() {
let mut field = Field::new("field1", DataType::Float16, false);
- field.set_metadata(BTreeMap::from([
+ field.set_metadata(HashMap::from([
(String::from("k0"), String::from("v0")),
(String::from("k1"), String::from("v1")),
]));
@@ -560,14 +611,14 @@ mod test {
let child_field = Field::new("child1", DataType::Float16, false);
let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false);
- field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))]));
+ field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));
let mut field2 = Field::new("field1", DataType::Struct(vec![]), true);
- field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))]));
+ field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
field2.try_merge(&field1).unwrap();
let mut field3 = Field::new("field1", DataType::Struct(vec![]), false);
- field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))]));
+ field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
field3.try_merge(&field2).unwrap();
assert!(field2.contains(&field1));
diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs
index 519a8e089..8ff40866d 100644
--- a/arrow-schema/src/schema.rs
+++ b/arrow-schema/src/schema.rs
@@ -290,7 +290,6 @@ mod tests {
use super::*;
use crate::datatype::DataType;
use crate::{TimeUnit, UnionMode};
- use std::collections::BTreeMap;
#[test]
#[cfg(feature = "serde")]
@@ -523,7 +522,7 @@ mod tests {
fn person_schema() -> Schema {
let kv_array = [("k".to_string(), "v".to_string())];
- let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
+ let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
@@ -551,18 +550,16 @@ mod tests {
#[test]
fn test_try_merge_field_with_metadata() {
// 1. Different values for the same key should cause error.
- let metadata1: BTreeMap<String, String> =
- [("foo".to_string(), "bar".to_string())]
- .iter()
- .cloned()
- .collect();
+ let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
+ .iter()
+ .cloned()
+ .collect();
let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);
- let metadata2: BTreeMap<String, String> =
- [("foo".to_string(), "baz".to_string())]
- .iter()
- .cloned()
- .collect();
+ let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
+ .iter()
+ .cloned()
+ .collect();
let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
assert!(
@@ -572,7 +569,7 @@ mod tests {
// 2. None + Some
let mut f1 = Field::new("first_name", DataType::Utf8, false);
- let metadata2: BTreeMap<String, String> =
+ let metadata2: HashMap<String, String> =
[("missing".to_string(), "value".to_string())]
.iter()
.cloned()
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 4ff9c7a39..70cee9ef9 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use crate::arrow::schema::primitive::convert_primitive;
use crate::arrow::ProjectionMask;
@@ -347,7 +347,7 @@ impl Visitor {
let value_field = convert_field(map_value, &value, arrow_value);
let field_metadata = match arrow_map {
Some(field) => field.metadata().clone(),
- _ => BTreeMap::default(),
+ _ => HashMap::default(),
};
let map_field = Field::new(