You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/07 12:27:21 UTC
[arrow] branch master updated: ARROW-4449: [Rust] Convert File to T: Read + Seek for schema inference

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5fc09d4  ARROW-4449: [Rust] Convert File to T: Read + Seek for schema inference
5fc09d4 is described below

commit 5fc09d4e5a2481c297f3d3956477c4a290b5ab81
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Thu Feb 7 13:27:08 2019 +0100

    ARROW-4449: [Rust] Convert File to T: Read + Seek for schema inference
    
    This removes the dependency on `::std::fs::File` when inferring schemas.
    I couldn't find a way of inferring schemas without
    `Seek`.
    
    The downside is that we can't use a `ReaderBuilder` on `::std::io::Cursor`.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #3541 from nevi-me/rust/arrow-4449 and squashes the following commits:
    
    e116cb5e <Neville Dipale> Arrow-4449:  Convert File to T: Read + Seek for schema inference
---
 rust/arrow/src/csv/reader.rs | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index b543011..7d3b309 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -43,7 +43,6 @@
 use lazy_static::lazy_static;
 use regex::{Regex, RegexBuilder};
 use std::collections::HashSet;
-use std::fs::File;
 use std::io::{BufReader, Read, Seek, SeekFrom};
 use std::sync::Arc;
 
@@ -89,15 +88,15 @@ fn infer_field_schema(string: &str) -> DataType {
 /// with `max_read_records` controlling the maximum number of records to read.
 ///
 /// If `max_read_records` is not set, the whole file is read to infer its schema.
-fn infer_file_schema(
-    mut file: File,
+fn infer_file_schema<R: Read + Seek>(
+    reader: &mut BufReader<R>,
     delimiter: u8,
     max_read_records: Option<usize>,
     has_headers: bool,
 ) -> Result<Schema> {
     let mut csv_reader = csv::ReaderBuilder::new()
         .delimiter(delimiter)
-        .from_reader(BufReader::new(file.try_clone()?));
+        .from_reader(reader);
 
     // get or create header names
     // when has_headers is false, creates default column names with column_ prefix
@@ -127,7 +126,7 @@ fn infer_file_schema(
     let mut fields = vec![];
 
     for result in csv_reader
-        .into_records()
+        .records()
         .take(max_read_records.unwrap_or(std::usize::MAX))
     {
         let record = result?;
@@ -176,8 +175,8 @@ fn infer_file_schema(
         }
     }
 
-    // return the file seek back to the start
-    file.seek(SeekFrom::Start(0))?;
+    // seek the reader back to the start
+    csv_reader.into_inner().seek(SeekFrom::Start(0))?;
 
     Ok(Schema::new(fields))
 }
@@ -197,8 +196,9 @@ pub struct Reader<R: Read> {
 impl<R: Read> Reader<R> {
     /// Create a new CsvReader from any value that implements the `Read` trait.
     ///
-    /// If reading a `File` you can customise the Reader, such as to enable schema
-    /// inference, use `ReaderBuilder`.
+    /// If reading a `File`, or any other input that implements `std::io::Read` and
+    /// `std::io::Seek`, you can customise the Reader (for example, to enable schema
+    /// inference) by using `ReaderBuilder`.
     pub fn new(
         reader: R,
         schema: Arc<Schema>,
@@ -465,13 +465,14 @@ impl ReaderBuilder {
     }
 
     /// Create a new `Reader` from the `ReaderBuilder`
-    pub fn build(self, file: File) -> Result<Reader<File>> {
+    pub fn build<R: Read + Seek>(self, reader: R) -> Result<Reader<R>> {
         // check if schema should be inferred
+        let mut buf_reader = BufReader::new(reader);
         let schema = match self.schema {
             Some(schema) => schema,
             None => {
                 let inferred_schema = infer_file_schema(
-                    file.try_clone().unwrap(),
+                    &mut buf_reader,
                     self.delimiter.unwrap_or(b','),
                     self.max_records,
                     self.has_headers,
@@ -483,7 +484,7 @@ impl ReaderBuilder {
         let csv_reader = csv::ReaderBuilder::new()
             .delimiter(self.delimiter.unwrap_or(b','))
             .has_headers(self.has_headers)
-            .from_reader(BufReader::new(file));
+            .from_reader(buf_reader);
         let record_iter = csv_reader.into_records();
         Ok(Reader {
             schema,
@@ -498,6 +499,7 @@ impl ReaderBuilder {
 mod tests {
     use super::*;
 
+    use std::fs::File;
     use std::io::Cursor;
 
     use crate::array::*;