Posted to commits@arrow.apache.org by ag...@apache.org on 2020/05/17 17:05:34 UTC

[arrow] branch master updated: ARROW-8833: [Rust] Implement VALIDATE mode in integration tests

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ce15eb2  ARROW-8833: [Rust] Implement VALIDATE mode in integration tests
ce15eb2 is described below

commit ce15eb24755bd63f1fcf2d8132d75fb2957a12ae
Author: Andy Grove <an...@gmail.com>
AuthorDate: Sun May 17 11:05:12 2020 -0600

    ARROW-8833: [Rust] Implement VALIDATE mode in integration tests
    
    This follows on from https://github.com/apache/arrow/pull/7206, which should be merged first.
    
    This PR partially implements the VALIDATE mode in arrow-json-integration-test.
    
    Closes #7208 from andygrove/ARROW-8833
    
    Authored-by: Andy Grove <an...@gmail.com>
    Signed-off-by: Andy Grove <an...@gmail.com>
---
 rust/integration-testing/README.md                 |  30 ++++++
 .../src/bin/arrow-json-integration-test.rs         | 103 ++++++++++++++-------
 2 files changed, 97 insertions(+), 36 deletions(-)
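
Beyond adding VALIDATE, the diff below also moves the helpers off
Result<(), String> and .unwrap() calls onto arrow's own error type, so
failures propagate with the ? operator. A minimal sketch of that idiom
(a standalone illustration, not code from this commit):

    use arrow::error::Result;
    use std::fs::File;

    // File::open yields a std::io::Result; the io::Error converts into
    // ArrowError, so `?` works in functions returning arrow's Result,
    // exactly as in the File::open/File::create calls in the diff below.
    fn open_input(path: &str) -> Result<File> {
        Ok(File::open(path)?)
    }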

diff --git a/rust/integration-testing/README.md b/rust/integration-testing/README.md
new file mode 100644
index 0000000..66248de
--- /dev/null
+++ b/rust/integration-testing/README.md
@@ -0,0 +1,30 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache Arrow Rust Integration Testing
+
+See [Integration.rst](../../docs/source/format/Integration.rst) for an overview of integration testing.
+
+This crate contains the following binaries, which are invoked by Archery during integration testing with other Arrow implementations.
+
+| Binary | Purpose |
+|--------|---------|
+| arrow-file-to-stream | Converts an Arrow file to an Arrow stream |
+| arrow-stream-to-file | Converts an Arrow stream to an Arrow file |
+| arrow-json-integration-test | Converts between Arrow and JSON formats |
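
The README's table maps to command-line entry points; a typical invocation
of the new mode would look roughly like the line below. The flag names are
an assumption based on the cross-implementation integration-test
convention, since the argument parsing in main() is not part of this diff:

    arrow-json-integration-test --arrow=data.arrow --json=data.json --mode=VALIDATE
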
diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
index d13cc21..5694de9 100644
--- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs
+++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
@@ -26,9 +26,11 @@ use arrow::array::{
     UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder,
 };
 use arrow::datatypes::{DataType, Schema};
+use arrow::error::{ArrowError, Result};
 use arrow::ipc::reader::FileReader;
 use arrow::ipc::writer::FileWriter;
 use arrow::record_batch::{RecordBatch, RecordBatchReader};
+
 use hex::decode;
 use std::env;
 use std::fs::File;
@@ -81,40 +83,23 @@ fn main() {
     }
 }
 
-fn json_to_arrow(
-    json_name: &str,
-    arrow_name: &str,
-    _verbose: bool,
-) -> Result<(), String> {
-    let json_file = File::open(json_name).unwrap();
-    let reader = BufReader::new(json_file);
-
-    let arrow_json: Value = serde_json::from_reader(reader).unwrap();
-
-    let schema = Arc::new(Schema::from(&arrow_json["schema"]).unwrap());
-
-    let mut batches = vec![];
-
-    for b in arrow_json["batches"].as_array().unwrap() {
-        let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap();
-        let batch = record_batch_from_json(schema.clone(), json_batch)?;
-        batches.push(batch);
-    }
+fn json_to_arrow(json_name: &str, arrow_name: &str, _verbose: bool) -> Result<()> {
+    let (schema, batches) = read_json_file(json_name)?;
 
-    let arrow_file = File::create(arrow_name).unwrap();
-    let mut writer = FileWriter::try_new(arrow_file, schema.as_ref()).unwrap();
+    let arrow_file = File::create(arrow_name)?;
+    let mut writer = FileWriter::try_new(arrow_file, &schema)?;
 
     for b in batches {
-        writer.write(&b).unwrap();
+        writer.write(&b)?;
     }
 
     Ok(())
 }
 
 fn record_batch_from_json(
-    schema: Arc<Schema>,
+    schema: &Schema,
     json_batch: ArrowJsonBatch,
-) -> Result<RecordBatch, String> {
+) -> Result<RecordBatch> {
     let mut columns = vec![];
 
     for (field, json_col) in schema.fields().iter().zip(json_batch.columns) {
@@ -312,21 +297,22 @@ fn record_batch_from_json(
                 }
                 Arc::new(b.finish())
             }
-            t => return Err(format!("data type {:?} not supported", t)),
+            t => {
+                return Err(ArrowError::JsonError(format!(
+                    "data type {:?} not supported",
+                    t
+                )))
+            }
         };
         columns.push(col);
     }
 
-    RecordBatch::try_new(schema, columns).map_err(|e| e.to_string())
+    RecordBatch::try_new(Arc::new(schema.clone()), columns)
 }
 
-fn arrow_to_json(
-    arrow_name: &str,
-    json_name: &str,
-    _verbose: bool,
-) -> Result<(), String> {
-    let arrow_file = File::open(arrow_name).unwrap();
-    let mut reader = FileReader::try_new(arrow_file).unwrap();
+fn arrow_to_json(arrow_name: &str, json_name: &str, _verbose: bool) -> Result<()> {
+    let arrow_file = File::open(arrow_name)?;
+    let mut reader = FileReader::try_new(arrow_file)?;
 
     let mut fields = vec![];
     for f in reader.schema().fields() {
@@ -345,12 +331,57 @@ fn arrow_to_json(
         dictionaries: None,
     };
 
-    let json_file = File::create(json_name).unwrap();
+    let json_file = File::create(json_name)?;
     serde_json::to_writer(&json_file, &arrow_json).unwrap();
 
     Ok(())
 }
 
-fn validate(_arrow_name: &str, _json_name: &str, _verbose: bool) -> Result<(), String> {
-    panic!("validate not implemented");
+fn validate(arrow_name: &str, json_name: &str, _verbose: bool) -> Result<()> {
+    // open JSON file
+    let (json_schema, json_batches) = read_json_file(json_name)?;
+
+    // open Arrow file
+    let arrow_file = File::open(arrow_name)?;
+    let mut arrow_reader = FileReader::try_new(arrow_file)?;
+    let arrow_schema = arrow_reader.schema().as_ref().to_owned();
+
+    // compare schemas
+    assert!(json_schema == arrow_schema);
+
+    for json_batch in &json_batches {
+        if let Some(arrow_batch) = arrow_reader.next_batch()? {
+            // compare batches
+            assert!(arrow_batch.num_columns() == json_batch.num_columns());
+            assert!(arrow_batch.num_rows() == json_batch.num_rows());
+
+        // TODO compare in more detail
+        } else {
+            return Err(ArrowError::ComputeError(
+                "no more arrow batches left".to_owned(),
+            ));
+        }
+    }
+
+    if let Some(_) = arrow_reader.next_batch()? {
+        return Err(ArrowError::ComputeError(
+            "no more json batches left".to_owned(),
+        ));
+    }
+
+    Ok(())
+}
+
+fn read_json_file(json_name: &str) -> Result<(Schema, Vec<RecordBatch>)> {
+    let json_file = File::open(json_name)?;
+    let reader = BufReader::new(json_file);
+    let arrow_json: Value = serde_json::from_reader(reader).unwrap();
+    let schema = Schema::from(&arrow_json["schema"])?;
+    let mut batches = vec![];
+    for b in arrow_json["batches"].as_array().unwrap() {
+        let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap();
+        let batch = record_batch_from_json(&schema, json_batch)?;
+        batches.push(batch);
+    }
+    Ok((schema, batches))
 }
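
As a usage note, the refactored helpers make it straightforward to exercise
the new mode end to end. A minimal round-trip check built on the functions
above might look like this (an illustrative sketch with placeholder file
paths, not a test shipped with this commit):

    #[test]
    fn round_trip_validate() -> Result<()> {
        // placeholder fixture paths for illustration only
        json_to_arrow("generated.json", "generated.arrow", false)?;
        // VALIDATE compares the schemas and per-batch row/column counts
        validate("generated.arrow", "generated.json", false)?;
        Ok(())
    }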