You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/28 21:17:42 UTC
[arrow-rs] branch master updated: Infer timestamps from CSV files (#3209)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 64b466e78 Infer timestamps from CSV files (#3209)
64b466e78 is described below
commit 64b466e7864f8b019d3396c45efec81342b46a7f
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Tue Nov 29 08:17:37 2022 +1100
Infer timestamps from CSV files (#3209)
* Infer timestamps from CSV files
* Fix regex patterns
---
arrow-csv/src/reader.rs | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs
index 6432fb1b8..f8f9f50a3 100644
--- a/arrow-csv/src/reader.rs
+++ b/arrow-csv/src/reader.rs
@@ -70,9 +70,11 @@ lazy_static! {
.case_insensitive(true)
.build()
.unwrap();
- static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
+ static ref DATE32_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
+ static ref DATE64_RE: Regex =
+ Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$").unwrap();
static ref DATETIME_RE: Regex =
- Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap();
+ Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}$").unwrap();
}
/// Infer the data type of a record
@@ -90,10 +92,12 @@ fn infer_field_schema(string: &str, datetime_re: Option<Regex>) -> DataType {
DataType::Float64
} else if INTEGER_RE.is_match(string) {
DataType::Int64
- } else if datetime_re.is_match(string) {
- DataType::Date64
- } else if DATE_RE.is_match(string) {
+ } else if DATE32_RE.is_match(string) {
DataType::Date32
+ } else if DATE64_RE.is_match(string) {
+ DataType::Date64
+ } else if datetime_re.is_match(string) {
+ DataType::Timestamp(TimeUnit::Nanosecond, None)
} else {
DataType::Utf8
}
@@ -1590,10 +1594,9 @@ mod tests {
infer_field_schema("2020-11-08T14:20:01", None),
DataType::Date64
);
- // to be inferred as a date64 this needs a custom datetime_re
assert_eq!(
infer_field_schema("2020-11-08 14:20:01", None),
- DataType::Utf8
+ DataType::Date64
);
let reg = Regex::new(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$").ok();
assert_eq!(
@@ -1602,6 +1605,14 @@ mod tests {
);
assert_eq!(infer_field_schema("-5.13", None), DataType::Float64);
assert_eq!(infer_field_schema("0.1300", None), DataType::Float64);
+ assert_eq!(
+ infer_field_schema("2021-12-19 13:12:30.921", None),
+ DataType::Timestamp(TimeUnit::Nanosecond, None)
+ );
+ assert_eq!(
+ infer_field_schema("2021-12-19T13:12:30.123456789", None),
+ DataType::Timestamp(TimeUnit::Nanosecond, None)
+ );
}
#[test]