You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/04/25 17:34:36 UTC

[arrow] branch master updated: ARROW-5191: [Rust] Expose CSV and JSON reader schemas

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5f9275e  ARROW-5191: [Rust] Expose CSV and JSON reader schemas
5f9275e is described below

commit 5f9275e3cdc6159b53180cf164bcc15523e7db4d
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Thu Apr 25 10:34:26 2019 -0700

    ARROW-5191: [Rust] Expose CSV and JSON reader schemas
    
    This is useful for lazy evaluation, because it enables the user to infer the schema of a data source, then use that schema without reading its data.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #4181 from nevi-me/ARROW-5191 and squashes the following commits:
    
    257a98a1 <Neville Dipale> ARROW-5191:  Expose file reader schemas
---
 rust/arrow/src/csv/reader.rs  | 36 +++++++++++++++++++++++++++++++++---
 rust/arrow/src/json/reader.rs | 43 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index ffeffdd..cf28e38 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -217,6 +217,21 @@ impl<R: Read> Reader<R> {
         )
     }
 
+    /// Returns the reader's schema without reading any record batches. With a projection,
+    /// only the fields at the projected indices are kept, in projection order.
+    pub fn schema(&self) -> Arc<Schema> {
+        match &self.projection {
+            Some(projection) => {
+                let fields = self.schema.fields();
+                let projected_fields: Vec<Field> =
+                    projection.iter().map(|i| fields[*i].clone()).collect();
+
+                Arc::new(Schema::new(projected_fields))
+            }
+            None => self.schema.clone(),
+        }
+    }
+
     /// Create a new CsvReader from a `BufReader<R: Read>
     ///
     /// This constructor allows you more flexibility in what records are processed by the
@@ -536,7 +551,8 @@ mod tests {
 
         let file = File::open("test/data/uk_cities.csv").unwrap();
 
-        let mut csv = Reader::new(file, Arc::new(schema), false, 1024, None);
+        let mut csv = Reader::new(file, Arc::new(schema.clone()), false, 1024, None);
+        assert_eq!(Arc::new(schema), csv.schema());
         let batch = csv.next().unwrap().unwrap();
         assert_eq!(37, batch.num_rows());
         assert_eq!(3, batch.num_columns());
@@ -594,6 +610,12 @@ mod tests {
         let builder = ReaderBuilder::new().has_headers(true).infer_schema(None);
 
         let mut csv = builder.build(file).unwrap();
+        let expected_schema = Schema::new(vec![
+            Field::new("city", DataType::Utf8, false),
+            Field::new("lat", DataType::Float64, false),
+            Field::new("lng", DataType::Float64, false),
+        ]);
+        assert_eq!(Arc::new(expected_schema), csv.schema());
         let batch = csv.next().unwrap().unwrap();
         assert_eq!(37, batch.num_rows());
         assert_eq!(3, batch.num_columns());
@@ -625,14 +647,16 @@ mod tests {
         let builder = ReaderBuilder::new().infer_schema(None);
 
         let mut csv = builder.build(file).unwrap();
-        let batch = csv.next().unwrap().unwrap();
 
         // csv field names should be 'column_{number}'
-        let schema = batch.schema();
+        let schema = csv.schema();
         assert_eq!("column_1", schema.field(0).name());
         assert_eq!("column_2", schema.field(1).name());
         assert_eq!("column_3", schema.field(2).name());
+        let batch = csv.next().unwrap().unwrap();
+        let batch_schema = batch.schema();
 
+        assert_eq!(&schema, batch_schema);
         assert_eq!(37, batch.num_rows());
         assert_eq!(3, batch.num_columns());
 
@@ -667,7 +691,13 @@ mod tests {
         let file = File::open("test/data/uk_cities.csv").unwrap();
 
         let mut csv = Reader::new(file, Arc::new(schema), false, 1024, Some(vec![0, 1]));
+        let projected_schema = Arc::new(Schema::new(vec![
+            Field::new("city", DataType::Utf8, false),
+            Field::new("lat", DataType::Float64, false),
+        ]));
+        assert_eq!(projected_schema.clone(), csv.schema());
         let batch = csv.next().unwrap().unwrap();
+        assert_eq!(&projected_schema, batch.schema());
         assert_eq!(37, batch.num_rows());
         assert_eq!(2, batch.num_columns());
     }
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 8bdbf89..467a89a 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -345,6 +345,29 @@ impl<R: Read> Reader<R> {
         }
     }
 
+    /// Returns the reader's schema without reading any record batches. With a projection,
+    /// only fields whose names appear in it are kept, in the order of the full schema.
+    pub fn schema(&self) -> Arc<Schema> {
+        match &self.projection {
+            Some(projection) => {
+                let fields = self.schema.fields();
+                let projected_fields: Vec<Field> = fields
+                    .iter()
+                    .filter_map(|field| {
+                        if projection.contains(field.name()) {
+                            Some(field.clone())
+                        } else {
+                            None
+                        }
+                    })
+                    .collect();
+
+                Arc::new(Schema::new(projected_fields))
+            }
+            None => self.schema.clone(),
+        }
+    }
+
     /// Read the next batch of records
     pub fn next(&mut self) -> Result<Option<RecordBatch>> {
         let mut rows: Vec<Value> = Vec::with_capacity(self.batch_size);
@@ -742,7 +765,9 @@ mod tests {
         assert_eq!(4, batch.num_columns());
         assert_eq!(12, batch.num_rows());
 
-        let schema = batch.schema();
+        let schema = reader.schema();
+        let batch_schema = batch.schema();
+        assert_eq!(&schema, batch_schema);
 
         let a = schema.column_with_name("a").unwrap();
         assert_eq!(0, a.0);
@@ -798,7 +823,9 @@ mod tests {
         assert_eq!(4, batch.num_columns());
         assert_eq!(12, batch.num_rows());
 
-        let schema = batch.schema();
+        let schema = reader.schema();
+        let batch_schema = batch.schema();
+        assert_eq!(&schema, batch_schema);
 
         let a = schema.column_with_name("a").unwrap();
         assert_eq!(&DataType::Int64, a.1.data_type());
@@ -855,10 +882,12 @@ mod tests {
 
         let mut reader: Reader<File> = Reader::new(
             BufReader::new(File::open("test/data/basic.json").unwrap()),
-            Arc::new(schema),
+            Arc::new(schema.clone()),
             1024,
             None,
         );
+        let reader_schema = reader.schema();
+        assert_eq!(reader_schema, Arc::new(schema));
         let batch = reader.next().unwrap().unwrap();
 
         assert_eq!(4, batch.num_columns());
@@ -909,6 +938,13 @@ mod tests {
             1024,
             Some(vec!["a".to_string(), "c".to_string()]),
         );
+        let reader_schema = reader.schema();
+        let expected_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("c", DataType::Boolean, false),
+        ]));
+        assert_eq!(reader_schema.clone(), expected_schema);
+
         let batch = reader.next().unwrap().unwrap();
 
         assert_eq!(2, batch.num_columns());
@@ -916,6 +952,7 @@ mod tests {
         assert_eq!(12, batch.num_rows());
 
         let schema = batch.schema();
+        assert_eq!(&reader_schema, schema);
 
         let a = schema.column_with_name("a").unwrap();
         assert_eq!(0, a.0);