You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2019/07/30 05:30:00 UTC

[arrow] branch master updated: ARROW-5901: [Rust] Add equals to json arrays.

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b071f6b  ARROW-5901: [Rust] Add equals to json arrays.
b071f6b is described below

commit b071f6b08107831766f9ecda5b46a89011f00a23
Author: Renjie Liu <li...@gmail.com>
AuthorDate: Tue Jul 30 07:29:38 2019 +0200

    ARROW-5901: [Rust] Add equals to json arrays.
    
    Checks whether an Arrow array is equal to a JSON array. This was motivated by developing integration tests for the Parquet Arrow reader: I use protobuf to generate both Parquet data and JSON data, read the Parquet data into Arrow, and compare it with the JSON data to verify correctness.
    
    Closes #4940 from liurenjie1024/arrow-5901 and squashes the following commits:
    
    c4815efc1 <Renjie Liu> Add tests to improve coverage
    cf3490727 <Renjie Liu> Fix comments
    d6b7c9cc5 <Renjie Liu> Fix code style problem
    b41d05746 <Renjie Liu> Add equals to json arrays.
    
    Authored-by: Renjie Liu <li...@gmail.com>
    Signed-off-by: Neville Dipale <ne...@gmail.com>
---
 rust/arrow/src/array/array.rs |  29 ++-
 rust/arrow/src/array/equal.rs | 497 ++++++++++++++++++++++++++++++++++++++++++
 rust/arrow/src/datatypes.rs   | 104 ++++++++-
 3 files changed, 617 insertions(+), 13 deletions(-)

diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index e4e55d0..f58d03c 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -25,7 +25,9 @@ use std::sync::Arc;
 use chrono::prelude::*;
 
 use super::*;
+use crate::array::equal::JsonEqual;
 use crate::buffer::{Buffer, MutableBuffer};
+use crate::datatypes::DataType::Struct;
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 use crate::memory;
@@ -42,7 +44,7 @@ const NANOSECONDS: i64 = 1_000_000_000;
 
 /// Trait for dealing with different types of array at runtime when the type of the
 /// array is not known in advance
-pub trait Array: Send + Sync + ArrayEqual {
+pub trait Array: Send + Sync + ArrayEqual + JsonEqual {
     /// Returns the array as `Any` so that it can be downcast to a specific implementation
     fn as_any(&self) -> &Any;
 
@@ -726,6 +728,12 @@ impl ListArray {
         self.values.data().data_type().clone()
     }
 
+    /// Returns the `i`th element of this list array as a slice of the child values
+    /// array, starting at `value_offset(i)` and spanning `value_length(i)` entries.
+    pub fn value(&self, i: usize) -> ArrayRef {
+        let start = self.value_offset(i) as usize;
+        let len = self.value_length(i) as usize;
+        self.values.slice(start, len)
+    }
+
     /// Returns the offset for value at index `i`.
     ///
     /// Note this doesn't do any bound checking, for performance reason.
@@ -999,6 +1007,25 @@ impl StructArray {
     pub fn columns(&self) -> Vec<&ArrayRef> {
         self.boxed_fields.iter().collect()
     }
+
+    /// Returns the field names of this struct array, in the order the fields appear
+    /// in its data type.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the array's data type is not `Struct`.
+    pub fn column_names(&self) -> Vec<&str> {
+        if let Struct(fields) = self.data.data_type() {
+            fields.iter().map(|field| field.name().as_str()).collect()
+        } else {
+            unreachable!("Struct array's data type is not struct!")
+        }
+    }
+
+    /// Returns the child array of the first field named `column_name`, or `None`
+    /// when no field has that name.
+    pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> {
+        let names = self.column_names();
+        let index = names.iter().position(|name| *name == column_name)?;
+        Some(self.column(index))
+    }
 }
 
 impl From<ArrayDataRef> for StructArray {
diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs
index 5f888ab..0f71d54 100644
--- a/rust/arrow/src/array/equal.rs
+++ b/rust/arrow/src/array/equal.rs
@@ -18,6 +18,8 @@
 use super::*;
 use crate::datatypes::*;
 use crate::util::bit_util;
+use serde_json::value::Value::{Null as JNull, Object, String as JString};
+use serde_json::Value;
 
 /// Trait for `Array` equality.
 pub trait ArrayEqual {
@@ -418,6 +420,171 @@ fn value_offset_equal<T: Array + ListArrayOps>(this: &T, other: &T) -> bool {
     true
 }
 
+/// Trait for comparing an arrow array with a sequence of JSON values.
+pub trait JsonEqual {
+    /// Returns `true` if this array equals the given borrowed JSON values.
+    fn equals_json(&self, json: &[&Value]) -> bool;
+
+    /// Returns `true` if this array equals the given owned JSON values.
+    ///
+    /// Convenience wrapper that borrows every element and delegates to
+    /// `equals_json`.
+    fn equals_json_values(&self, json: &[Value]) -> bool {
+        let borrowed: Vec<&Value> = json.iter().collect();
+        self.equals_json(borrowed.as_slice())
+    }
+}
+
+/// JSON equality for primitive arrays.
+///
+/// A JSON `null` matches a null slot. Any other JSON value matches a valid slot
+/// whose value converts (through `into_json_value`) to an equal JSON value; a slot
+/// whose value has no JSON representation never matches.
+impl<T: ArrowPrimitiveType> JsonEqual for PrimitiveArray<T> {
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        if self.len() != json.len() {
+            return false;
+        }
+
+        json.iter().enumerate().all(|(i, expected)| match *expected {
+            Value::Null => self.is_null(i),
+            other => {
+                self.is_valid(i)
+                    && Some(other) == self.value(i).into_json_value().as_ref()
+            }
+        })
+    }
+}
+
+/// A primitive array equals a JSON value only when that value is a JSON array whose
+/// elements match slot by slot.
+impl<T: ArrowPrimitiveType> PartialEq<Value> for PrimitiveArray<T> {
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(values) = json {
+            self.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
+/// Mirror of the comparison above so that `json == array` also works.
+impl<T: ArrowPrimitiveType> PartialEq<PrimitiveArray<T>> for Value {
+    fn eq(&self, arrow: &PrimitiveArray<T>) -> bool {
+        if let Value::Array(values) = self {
+            arrow.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
+/// JSON equality for list arrays.
+///
+/// A JSON array matches a valid slot whose elements compare equal. A JSON `null`
+/// matches a null slot or a slot whose list length is 0. Any other JSON value
+/// never matches.
+impl JsonEqual for ListArray {
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        if self.len() != json.len() {
+            return false;
+        }
+
+        json.iter().enumerate().all(|(i, expected)| match *expected {
+            Value::Null => self.is_null(i) || self.value_length(i) == 0,
+            Value::Array(items) => {
+                self.is_valid(i) && self.value(i).equals_json_values(items)
+            }
+            _ => false,
+        })
+    }
+}
+
+/// A list array equals a JSON value only when that value is a JSON array whose
+/// elements match slot by slot.
+impl PartialEq<Value> for ListArray {
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(values) = json {
+            self.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
+/// Mirror of the comparison above so that `json == array` also works.
+impl PartialEq<ListArray> for Value {
+    fn eq(&self, arrow: &ListArray) -> bool {
+        if let Value::Array(values) = self {
+            arrow.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
+/// JSON equality for struct arrays, one JSON value per struct row.
+///
+/// Every JSON value must be an object or `null`. Each child column is compared
+/// against the values stored under its field name; a missing key, or a `null`
+/// row, is treated as `null` for that column.
+impl JsonEqual for StructArray {
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        if self.len() != json.len() {
+            return false;
+        }
+
+        if !json.iter().all(|row| row.is_object() || row.is_null()) {
+            return false;
+        }
+
+        // Columns are addressed by position so that every field name is paired with
+        // its own column (even if names repeat) without a per-column name search.
+        for (pos, column_name) in self.column_names().into_iter().enumerate() {
+            let json_values = json
+                .iter()
+                .map(|row| row.get(column_name).unwrap_or(&Value::Null))
+                .collect::<Vec<&Value>>();
+
+            if !self.column(pos).equals_json(&json_values) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
+/// A struct array equals a JSON value only when that value is a JSON array whose
+/// elements match row by row.
+impl PartialEq<Value> for StructArray {
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(rows) = json {
+            self.equals_json_values(rows)
+        } else {
+            false
+        }
+    }
+}
+
+/// Mirror of the comparison above so that `json == array` also works.
+impl PartialEq<StructArray> for Value {
+    fn eq(&self, arrow: &StructArray) -> bool {
+        if let Value::Array(rows) = self {
+            arrow.equals_json_values(rows)
+        } else {
+            false
+        }
+    }
+}
+
+/// JSON equality for binary arrays.
+///
+/// A JSON string matches a valid slot holding the same bytes as the string, a JSON
+/// `null` matches a null slot, and any other JSON value never matches.
+impl JsonEqual for BinaryArray {
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        if self.len() != json.len() {
+            return false;
+        }
+
+        json.iter().enumerate().all(|(i, expected)| match *expected {
+            JNull => self.is_null(i),
+            JString(text) => self.is_valid(i) && self.value(i) == text.as_bytes(),
+            _ => false,
+        })
+    }
+}
+
+/// A binary array equals a JSON value only when that value is a JSON array whose
+/// elements match slot by slot.
+impl PartialEq<Value> for BinaryArray {
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(values) = json {
+            self.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
+/// Mirror of the comparison above so that `json == array` also works.
+impl PartialEq<BinaryArray> for Value {
+    fn eq(&self, arrow: &BinaryArray) -> bool {
+        if let Value::Array(values) = self {
+            arrow.equals_json_values(values)
+        } else {
+            false
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -703,6 +870,336 @@ mod tests {
         Ok(builder.finish())
     }
 
+    #[test]
+    fn test_primitive_json_equal() {
+        // Equal case: same values, with the null in the same position
+        let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]);
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                1, null, 2, 3
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.eq(&json_array));
+        assert!(json_array.eq(&arrow_array));
+
+        // Unequal case: a JSON number where the arrow array holds a null
+        let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]);
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                1, 1, 2, 3
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the JSON array is shorter than the arrow array
+        let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]);
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                1, 1
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the JSON value is an object rather than an array
+        let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]);
+        let json_array: Value = serde_json::from_str(
+            r#"
+            {
+               "a": 1
+            }
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+    }
+
+    #[test]
+    fn test_list_json_equal() {
+        // Equal case: matching lists, with a JSON null for the null list slot
+        let arrow_array = create_list_array(
+            &mut ListBuilder::new(Int32Builder::new(10)),
+            &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])],
+        )
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                [1, 2, 3],
+                null,
+                [4, 5, 6]
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.eq(&json_array));
+        assert!(json_array.eq(&arrow_array));
+
+        // Unequal case: a non-empty JSON list where the arrow array holds a null
+        let arrow_array = create_list_array(
+            &mut ListBuilder::new(Int32Builder::new(10)),
+            &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])],
+        )
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                [1, 2, 3],
+                [7, 8],
+                [4, 5, 6]
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the JSON value is an object rather than an array
+        let arrow_array = create_list_array(
+            &mut ListBuilder::new(Int32Builder::new(10)),
+            &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])],
+        )
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            {
+               "a": 1
+            }
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+    }
+
+    #[test]
+    fn test_binary_json_equal() {
+        // Equal case: same strings, with nulls in the same positions
+        let arrow_array = BinaryArray::try_from(vec![
+            Some("hello"),
+            None,
+            None,
+            Some("world"),
+            None,
+            None,
+        ])
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                "hello",
+                null,
+                null,
+                "world",
+                null,
+                null
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.eq(&json_array));
+        assert!(json_array.eq(&arrow_array));
+
+        // Unequal case: same length, but one string differs
+        let arrow_array = BinaryArray::try_from(vec![
+            Some("hello"),
+            None,
+            None,
+            Some("world"),
+            None,
+            None,
+        ])
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                "hello",
+                null,
+                null,
+                "arrow",
+                null,
+                null
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the arrow array is one element shorter than the JSON array
+        let arrow_array =
+            BinaryArray::try_from(vec![Some("hello"), None, None, Some("world"), None])
+                .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                "hello",
+                null,
+                null,
+                "arrow",
+                null,
+                null
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the JSON value is an object rather than an array
+        let arrow_array =
+            BinaryArray::try_from(vec![Some("hello"), None, None, Some("world"), None])
+                .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            {
+                "a": 1
+            }
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: a JSON number where the arrow array holds a string. Both
+        // sides have the same length so that the comparison reaches the per-element
+        // type check instead of stopping at the length check.
+        let arrow_array = BinaryArray::try_from(vec![
+            Some("hello"),
+            None,
+            None,
+            Some("world"),
+            None,
+            None,
+        ])
+        .unwrap();
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+                "hello",
+                null,
+                null,
+                1,
+                null,
+                null
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+    }
+
+    #[test]
+    fn test_struct_json_equal() {
+        // Equal case: a missing key and a null row both stand for null children
+        let string_builder = BinaryBuilder::new(5);
+        let int_builder = Int32Builder::new(5);
+
+        let mut fields = Vec::new();
+        let mut field_builders = Vec::new();
+        fields.push(Field::new("f1", DataType::Utf8, false));
+        field_builders.push(Box::new(string_builder) as Box<ArrayBuilder>);
+        fields.push(Field::new("f2", DataType::Int32, false));
+        field_builders.push(Box::new(int_builder) as Box<ArrayBuilder>);
+
+        let mut builder = StructBuilder::new(fields, field_builders);
+
+        let arrow_array = create_struct_array(
+            &mut builder,
+            &[Some("joe"), None, None, Some("mark"), Some("doe")],
+            &[Some(1), Some(2), None, Some(4), Some(5)],
+            &[true, true, false, true, true],
+        )
+        .unwrap();
+
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+              {
+                "f1": "joe",
+                "f2": 1
+              },
+              {
+                "f2": 2
+              },
+              null,
+              {
+                "f1": "mark",
+                "f2": 4
+              },
+              {
+                "f1": "doe",
+                "f2": 5
+              }
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.eq(&json_array));
+        assert!(json_array.eq(&arrow_array));
+
+        // Unequal case: the JSON array is one row shorter than the struct array
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+              {
+                "f1": "joe",
+                "f2": 1
+              },
+              {
+                "f2": 2
+              },
+              null,
+              {
+                "f1": "mark",
+                "f2": 4
+              }
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: the JSON value is a single object rather than an array
+        let json_array: Value = serde_json::from_str(
+            r#"
+              {
+                "f1": "joe",
+                "f2": 1
+              }
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+
+        // Unequal case: one row is a JSON number instead of an object or null. The
+        // JSON array has as many rows as the struct array so that the comparison
+        // reaches the object/null check instead of stopping at the length check.
+        let json_array: Value = serde_json::from_str(
+            r#"
+            [
+              {
+                "f1": "joe",
+                "f2": 1
+              },
+              2,
+              null,
+              {
+                "f1": "mark",
+                "f2": 4
+              },
+              {
+                "f1": "doe",
+                "f2": 5
+              }
+            ]
+        "#,
+        )
+        .unwrap();
+        assert!(arrow_array.ne(&json_array));
+        assert!(json_array.ne(&arrow_array));
+    }
+
     fn create_struct_array<
         'a,
         T: AsRef<[Option<&'a str>]>,
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index e0b6d70..f167e4e 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -29,7 +29,7 @@ use std::str::FromStr;
 
 use packed_simd::*;
 use serde_derive::{Deserialize, Serialize};
-use serde_json::{json, Value};
+use serde_json::{json, Number, Value, Value::Number as VNumber};
 
 use crate::error::{ArrowError, Result};
 
@@ -102,6 +102,7 @@ pub struct Field {
 pub trait ArrowNativeType:
     fmt::Debug + Send + Sync + Copy + PartialOrd + FromStr + 'static
 {
+    /// Converts this native value into a JSON value.
+    ///
+    /// Returns `None` when the value has no JSON representation.
+    fn into_json_value(self) -> Option<Value>;
 }
 
 /// Trait indicating a primitive fixed-width type (bool, ints and floats).
@@ -121,17 +122,71 @@ pub trait ArrowPrimitiveType: 'static {
     fn default_value() -> Self::Native;
 }
 
-impl ArrowNativeType for bool {}
-impl ArrowNativeType for i8 {}
-impl ArrowNativeType for i16 {}
-impl ArrowNativeType for i32 {}
-impl ArrowNativeType for i64 {}
-impl ArrowNativeType for u8 {}
-impl ArrowNativeType for u16 {}
-impl ArrowNativeType for u32 {}
-impl ArrowNativeType for u64 {}
-impl ArrowNativeType for f32 {}
-impl ArrowNativeType for f64 {}
+impl ArrowNativeType for bool {
+    /// Maps the boolean onto a JSON boolean; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::Bool(self))
+    }
+}
+
+impl ArrowNativeType for i8 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for i16 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for i32 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for i64 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for u8 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for u16 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for u32 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for u64 {
+    /// Wraps the integer in a JSON number; never returns `None`.
+    fn into_json_value(self) -> Option<Value> {
+        Some(Value::from(self))
+    }
+}
+
+impl ArrowNativeType for f32 {
+    /// Converts the float into a JSON number, or `None` for NaN and infinities.
+    ///
+    /// The value is widened through its shortest round-trip decimal form rather than
+    /// with `as f64`, so that `0.1f32` becomes the JSON number `0.1` instead of
+    /// `0.10000000149011612` and compares equal to the same literal parsed from JSON.
+    fn into_json_value(self) -> Option<Value> {
+        self.to_string()
+            .parse::<f64>()
+            .ok()
+            .and_then(Number::from_f64)
+            .map(VNumber)
+    }
+}
+
+impl ArrowNativeType for f64 {
+    /// Converts the float into a JSON number, or `None` when `Number::from_f64`
+    /// rejects it (NaN and infinities).
+    fn into_json_value(self) -> Option<Value> {
+        Number::from_f64(self).map(VNumber)
+    }
+}
 
 macro_rules! make_type {
     ($name:ident, $native_ty:ty, $data_ty:expr, $bit_width:expr, $default_val:expr) => {
@@ -770,6 +825,9 @@ impl fmt::Display for Schema {
 mod tests {
     use super::*;
     use serde_json;
+    use serde_json::Number;
+    use serde_json::Value::{Bool, Number as VNumber};
+    use std::f32::NAN;
 
     #[test]
     fn create_struct_type() {
@@ -1018,4 +1076,26 @@ mod tests {
         assert!(schema2 != schema4);
         assert!(schema3 != schema4);
     }
+
+    #[test]
+    fn test_arrow_native_type_to_json() {
+        assert_eq!(Some(Bool(true)), true.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value());
+        assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value());
+        // The `f32` suffix matters: an unsuffixed float literal falls back to `f64`
+        // and would leave the `f32` conversion untested. 0.5 is exactly representable
+        // in both widths, so widening it cannot change the expected JSON number.
+        assert_eq!(
+            Some(VNumber(Number::from_f64(0.5f64).unwrap())),
+            0.5f32.into_json_value()
+        );
+        assert_eq!(
+            Some(VNumber(Number::from_f64(0.01f64).unwrap())),
+            0.01f64.into_json_value()
+        );
+        // Non-finite floats have no JSON representation in either width.
+        assert_eq!(None, NAN.into_json_value());
+        assert_eq!(None, std::f64::NAN.into_json_value());
+    }
 }