You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/02/07 12:27:21 UTC
[arrow] branch master updated: ARROW-4449: [Rust] Convert File to
T: Read + Seek for schema inference
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5fc09d4 ARROW-4449: [Rust] Convert File to T: Read + Seek for schema inference
5fc09d4 is described below
commit 5fc09d4e5a2481c297f3d3956477c4a290b5ab81
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Thu Feb 7 13:27:08 2019 +0100
ARROW-4449: [Rust] Convert File to T: Read + Seek for schema inference
This removes the dependency on `::std::fs::File` when inferring schemas.
I couldn't find a way of inferring schemas without
`Seek`.
The benefit is that we can now use a `ReaderBuilder` on `::std::io::Cursor`.
Author: Neville Dipale <ne...@gmail.com>
Closes #3541 from nevi-me/rust/arrow-4449 and squashes the following commits:
e116cb5e <Neville Dipale> Arrow-4449: Convert File to T: Read + Seek for schema inference
---
rust/arrow/src/csv/reader.rs | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index b543011..7d3b309 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -43,7 +43,6 @@
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use std::collections::HashSet;
-use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::sync::Arc;
@@ -89,15 +88,15 @@ fn infer_field_schema(string: &str) -> DataType {
/// with `max_read_records` controlling the maximum number of records to read.
///
/// If `max_read_records` is not set, the whole file is read to infer its schema.
-fn infer_file_schema(
- mut file: File,
+fn infer_file_schema<R: Read + Seek>(
+ reader: &mut BufReader<R>,
delimiter: u8,
max_read_records: Option<usize>,
has_headers: bool,
) -> Result<Schema> {
let mut csv_reader = csv::ReaderBuilder::new()
.delimiter(delimiter)
- .from_reader(BufReader::new(file.try_clone()?));
+ .from_reader(reader);
// get or create header names
// when has_headers is false, creates default column names with column_ prefix
@@ -127,7 +126,7 @@ fn infer_file_schema(
let mut fields = vec![];
for result in csv_reader
- .into_records()
+ .records()
.take(max_read_records.unwrap_or(std::usize::MAX))
{
let record = result?;
@@ -176,8 +175,8 @@ fn infer_file_schema(
}
}
- // return the file seek back to the start
- file.seek(SeekFrom::Start(0))?;
+ // return the reader seek back to the start
+ csv_reader.into_inner().seek(SeekFrom::Start(0))?;
Ok(Schema::new(fields))
}
@@ -197,8 +196,9 @@ pub struct Reader<R: Read> {
impl<R: Read> Reader<R> {
/// Create a new CsvReader from any value that implements the `Read` trait.
///
- /// If reading a `File` you can customise the Reader, such as to enable schema
- /// inference, use `ReaderBuilder`.
+ /// If reading a `File` or an input that supports `std::io::Read` and `std::io::Seek`;
+ /// you can customise the Reader, such as to enable schema inference, use
+ /// `ReaderBuilder`.
pub fn new(
reader: R,
schema: Arc<Schema>,
@@ -465,13 +465,14 @@ impl ReaderBuilder {
}
/// Create a new `Reader` from the `ReaderBuilder`
- pub fn build(self, file: File) -> Result<Reader<File>> {
+ pub fn build<R: Read + Seek>(self, reader: R) -> Result<Reader<R>> {
// check if schema should be inferred
+ let mut buf_reader = BufReader::new(reader);
let schema = match self.schema {
Some(schema) => schema,
None => {
let inferred_schema = infer_file_schema(
- file.try_clone().unwrap(),
+ &mut buf_reader,
self.delimiter.unwrap_or(b','),
self.max_records,
self.has_headers,
@@ -483,7 +484,7 @@ impl ReaderBuilder {
let csv_reader = csv::ReaderBuilder::new()
.delimiter(self.delimiter.unwrap_or(b','))
.has_headers(self.has_headers)
- .from_reader(BufReader::new(file));
+ .from_reader(buf_reader);
let record_iter = csv_reader.into_records();
Ok(Reader {
schema,
@@ -498,6 +499,7 @@ impl ReaderBuilder {
mod tests {
use super::*;
+ use std::fs::File;
use std::io::Cursor;
use crate::array::*;