You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/21 16:59:20 UTC

[arrow-rs] branch master updated: refactor: convert `Field::metadata` to `HashMap` (#3148)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 57f91f2e8 refactor: convert `Field::metadata` to `HashMap` (#3148)
57f91f2e8 is described below

commit 57f91f2e82184d6e04aab296448d1fe7352a7822
Author: Marco Neumann <ma...@crepererum.net>
AuthorDate: Mon Nov 21 16:59:14 2022 +0000

    refactor: convert `Field::metadata` to `HashMap` (#3148)
    
    * refactor: convert `Field::metadata` to `HashMap`
    
    Closes #2262.
    
    * refactor: code formatting
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
 arrow-integration-test/src/field.rs |  8 ++--
 arrow-ipc/src/convert.rs            |  6 +--
 arrow-schema/src/datatype.rs        |  6 +--
 arrow-schema/src/field.rs           | 85 +++++++++++++++++++++++++++++--------
 arrow-schema/src/schema.rs          | 23 +++++-----
 parquet/src/arrow/schema/complex.rs |  4 +-
 6 files changed, 90 insertions(+), 42 deletions(-)

diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs
index 5b5863557..4bfbf8e99 100644
--- a/arrow-integration-test/src/field.rs
+++ b/arrow-integration-test/src/field.rs
@@ -18,7 +18,7 @@
 use crate::{data_type_from_json, data_type_to_json};
 use arrow::datatypes::{DataType, Field};
 use arrow::error::{ArrowError, Result};
-use std::collections::BTreeMap;
+use std::collections::HashMap;
 
 /// Parse a `Field` definition from a JSON representation.
 pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
@@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
             // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
             let metadata = match map.get("metadata") {
                 Some(&Value::Array(ref values)) => {
-                    let mut res: BTreeMap<String, String> = BTreeMap::default();
+                    let mut res: HashMap<String, String> = HashMap::default();
                     for value in values {
                         match value.as_object() {
                             Some(map) => {
@@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
                 // We also support map format, because Schema's metadata supports this.
                 // See https://github.com/apache/arrow/pull/5907
                 Some(&Value::Object(ref values)) => {
-                    let mut res: BTreeMap<String, String> = BTreeMap::default();
+                    let mut res: HashMap<String, String> = HashMap::default();
                     for (k, v) in values {
                         if let Some(str_value) = v.as_str() {
                             res.insert(k.clone(), str_value.to_string().clone());
@@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
                         "Field `metadata` is not json array".to_string(),
                     ));
                 }
-                _ => BTreeMap::default(),
+                _ => HashMap::default(),
             };
 
             // if data_type is a struct or list, get its children
diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs
index a9dda6f2a..e11d64a47 100644
--- a/arrow-ipc/src/convert.rs
+++ b/arrow-ipc/src/convert.rs
@@ -21,7 +21,7 @@ use arrow_schema::*;
 use flatbuffers::{
     FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset,
 };
-use std::collections::{BTreeMap, HashMap};
+use std::collections::HashMap;
 
 use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER};
 use DataType::*;
@@ -86,7 +86,7 @@ impl<'a> From<crate::Field<'a>> for Field {
             )
         };
 
-        let mut metadata_map = BTreeMap::default();
+        let mut metadata_map = HashMap::default();
         if let Some(list) = field.custom_metadata() {
             for kv in list {
                 if let (Some(k), Some(v)) = (kv.key(), kv.value()) {
@@ -812,7 +812,7 @@ mod tests {
             .iter()
             .cloned()
             .collect();
-        let field_md: BTreeMap<String, String> = [("k".to_string(), "v".to_string())]
+        let field_md: HashMap<String, String> = [("k".to_string(), "v".to_string())]
             .iter()
             .cloned()
             .collect();
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 90ae42942..acf369145 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -381,10 +381,10 @@ mod tests {
     #[test]
     #[cfg(feature = "serde")]
     fn serde_struct_type() {
-        use std::collections::BTreeMap;
+        use std::collections::HashMap;
 
         let kv_array = [("k".to_string(), "v".to_string())];
-        let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
+        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
 
         // Non-empty map: should be converted as JSON obj { ... }
         let first_name =
@@ -392,7 +392,7 @@ mod tests {
 
         // Empty map: should be omitted.
         let last_name = Field::new("last_name", DataType::Utf8, false)
-            .with_metadata(BTreeMap::default());
+            .with_metadata(HashMap::default());
 
         let person = DataType::Struct(vec![
             first_name,
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index b1de65e55..cd9024747 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -17,7 +17,7 @@
 
 use crate::error::ArrowError;
 use std::cmp::Ordering;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
 use std::hash::{Hash, Hasher};
 
 use crate::datatype::DataType;
@@ -37,9 +37,9 @@ pub struct Field {
     /// A map of key-value pairs containing additional custom meta data.
     #[cfg_attr(
         feature = "serde",
-        serde(skip_serializing_if = "BTreeMap::is_empty", default)
+        serde(skip_serializing_if = "HashMap::is_empty", default)
     )]
-    metadata: BTreeMap<String, String>,
+    metadata: HashMap<String, String>,
 }
 
 // Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
@@ -68,9 +68,33 @@ impl Ord for Field {
     fn cmp(&self, other: &Self) -> Ordering {
         self.name
             .cmp(other.name())
-            .then(self.data_type.cmp(other.data_type()))
-            .then(self.nullable.cmp(&other.nullable))
-            .then(self.metadata.cmp(&other.metadata))
+            .then_with(|| self.data_type.cmp(other.data_type()))
+            .then_with(|| self.nullable.cmp(&other.nullable))
+            .then_with(|| {
+                // ensure deterministic key order
+                let mut keys: Vec<&String> =
+                    self.metadata.keys().chain(other.metadata.keys()).collect();
+                keys.sort();
+                for k in keys {
+                    match (self.metadata.get(k), other.metadata.get(k)) {
+                        (None, None) => {}
+                        (Some(_), None) => {
+                            return Ordering::Less;
+                        }
+                        (None, Some(_)) => {
+                            return Ordering::Greater;
+                        }
+                        (Some(v1), Some(v2)) => match v1.cmp(v2) {
+                            Ordering::Equal => {}
+                            other => {
+                                return other;
+                            }
+                        },
+                    }
+                }
+
+                Ordering::Equal
+            })
     }
 }
 
@@ -79,7 +103,14 @@ impl Hash for Field {
         self.name.hash(state);
         self.data_type.hash(state);
         self.nullable.hash(state);
-        self.metadata.hash(state);
+
+        // ensure deterministic key order
+        let mut keys: Vec<&String> = self.metadata.keys().collect();
+        keys.sort();
+        for k in keys {
+            k.hash(state);
+            self.metadata.get(k).expect("key valid").hash(state);
+        }
     }
 }
 
@@ -92,7 +123,7 @@ impl Field {
             nullable,
             dict_id: 0,
             dict_is_ordered: false,
-            metadata: BTreeMap::default(),
+            metadata: HashMap::default(),
         }
     }
 
@@ -110,29 +141,29 @@ impl Field {
             nullable,
             dict_id,
             dict_is_ordered,
-            metadata: BTreeMap::default(),
+            metadata: HashMap::default(),
         }
     }
 
     /// Sets the `Field`'s optional custom metadata.
     /// The metadata is set as `None` for empty map.
     #[inline]
-    pub fn set_metadata(&mut self, metadata: BTreeMap<String, String>) {
-        self.metadata = BTreeMap::default();
+    pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
+        self.metadata = HashMap::default();
         if !metadata.is_empty() {
             self.metadata = metadata;
         }
     }
 
     /// Sets the metadata of this `Field` to be `metadata` and returns self
-    pub fn with_metadata(mut self, metadata: BTreeMap<String, String>) -> Self {
+    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
         self.set_metadata(metadata);
         self
     }
 
     /// Returns the immutable reference to the `Field`'s optional custom metadata.
     #[inline]
-    pub const fn metadata(&self) -> &BTreeMap<String, String> {
+    pub const fn metadata(&self) -> &HashMap<String, String> {
         &self.metadata
     }
 
@@ -545,10 +576,30 @@ mod test {
         assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
     }
 
+    #[test]
+    fn test_field_comparison_metadata() {
+        let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+            (String::from("k1"), String::from("v1")),
+            (String::from("k2"), String::from("v2")),
+        ]));
+        let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+            (String::from("k1"), String::from("v1")),
+            (String::from("k3"), String::from("v3")),
+        ]));
+        let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
+            (String::from("k1"), String::from("v1")),
+            (String::from("k3"), String::from("v4")),
+        ]));
+
+        assert!(f1.cmp(&f2).is_lt());
+        assert!(f2.cmp(&f3).is_lt());
+        assert!(f1.cmp(&f3).is_lt());
+    }
+
     #[test]
     fn test_contains_reflexivity() {
         let mut field = Field::new("field1", DataType::Float16, false);
-        field.set_metadata(BTreeMap::from([
+        field.set_metadata(HashMap::from([
             (String::from("k0"), String::from("v0")),
             (String::from("k1"), String::from("v1")),
         ]));
@@ -560,14 +611,14 @@ mod test {
         let child_field = Field::new("child1", DataType::Float16, false);
 
         let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false);
-        field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))]));
+        field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));
 
         let mut field2 = Field::new("field1", DataType::Struct(vec![]), true);
-        field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))]));
+        field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
         field2.try_merge(&field1).unwrap();
 
         let mut field3 = Field::new("field1", DataType::Struct(vec![]), false);
-        field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))]));
+        field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
         field3.try_merge(&field2).unwrap();
 
         assert!(field2.contains(&field1));
diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs
index 519a8e089..8ff40866d 100644
--- a/arrow-schema/src/schema.rs
+++ b/arrow-schema/src/schema.rs
@@ -290,7 +290,6 @@ mod tests {
     use super::*;
     use crate::datatype::DataType;
     use crate::{TimeUnit, UnionMode};
-    use std::collections::BTreeMap;
 
     #[test]
     #[cfg(feature = "serde")]
@@ -523,7 +522,7 @@ mod tests {
 
     fn person_schema() -> Schema {
         let kv_array = [("k".to_string(), "v".to_string())];
-        let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
+        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
         let first_name =
             Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
 
@@ -551,18 +550,16 @@ mod tests {
     #[test]
     fn test_try_merge_field_with_metadata() {
         // 1. Different values for the same key should cause error.
-        let metadata1: BTreeMap<String, String> =
-            [("foo".to_string(), "bar".to_string())]
-                .iter()
-                .cloned()
-                .collect();
+        let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
+            .iter()
+            .cloned()
+            .collect();
         let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);
 
-        let metadata2: BTreeMap<String, String> =
-            [("foo".to_string(), "baz".to_string())]
-                .iter()
-                .cloned()
-                .collect();
+        let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
+            .iter()
+            .cloned()
+            .collect();
         let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
 
         assert!(
@@ -572,7 +569,7 @@ mod tests {
 
         // 2. None + Some
         let mut f1 = Field::new("first_name", DataType::Utf8, false);
-        let metadata2: BTreeMap<String, String> =
+        let metadata2: HashMap<String, String> =
             [("missing".to_string(), "value".to_string())]
                 .iter()
                 .cloned()
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 4ff9c7a39..70cee9ef9 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::collections::BTreeMap;
+use std::collections::HashMap;
 
 use crate::arrow::schema::primitive::convert_primitive;
 use crate::arrow::ProjectionMask;
@@ -347,7 +347,7 @@ impl Visitor {
                 let value_field = convert_field(map_value, &value, arrow_value);
                 let field_metadata = match arrow_map {
                     Some(field) => field.metadata().clone(),
-                    _ => BTreeMap::default(),
+                    _ => HashMap::default(),
                 };
 
                 let map_field = Field::new(