You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/28 21:17:42 UTC

[arrow-rs] branch master updated: Infer timestamps from CSV files (#3209)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 64b466e78 Infer timestamps from CSV files (#3209)
64b466e78 is described below

commit 64b466e7864f8b019d3396c45efec81342b46a7f
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Tue Nov 29 08:17:37 2022 +1100

    Infer timestamps from CSV files (#3209)
    
    * Infer timestamps from CSV files
    
    * Fix regex patterns
---
 arrow-csv/src/reader.rs | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs
index 6432fb1b8..f8f9f50a3 100644
--- a/arrow-csv/src/reader.rs
+++ b/arrow-csv/src/reader.rs
@@ -70,9 +70,11 @@ lazy_static! {
         .case_insensitive(true)
         .build()
         .unwrap();
-    static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
+    static ref DATE32_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
+    static ref DATE64_RE: Regex =
+        Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$").unwrap();
     static ref DATETIME_RE: Regex =
-        Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap();
+        Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}$").unwrap();
 }
 
 /// Infer the data type of a record
@@ -90,10 +92,12 @@ fn infer_field_schema(string: &str, datetime_re: Option<Regex>) -> DataType {
         DataType::Float64
     } else if INTEGER_RE.is_match(string) {
         DataType::Int64
-    } else if datetime_re.is_match(string) {
-        DataType::Date64
-    } else if DATE_RE.is_match(string) {
+    } else if DATE32_RE.is_match(string) {
         DataType::Date32
+    } else if DATE64_RE.is_match(string) {
+        DataType::Date64
+    } else if datetime_re.is_match(string) {
+        DataType::Timestamp(TimeUnit::Nanosecond, None)
     } else {
         DataType::Utf8
     }
@@ -1590,10 +1594,9 @@ mod tests {
             infer_field_schema("2020-11-08T14:20:01", None),
             DataType::Date64
         );
-        // to be inferred as a date64 this needs a custom datetime_re
         assert_eq!(
             infer_field_schema("2020-11-08 14:20:01", None),
-            DataType::Utf8
+            DataType::Date64
         );
         let reg = Regex::new(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$").ok();
         assert_eq!(
@@ -1602,6 +1605,14 @@ mod tests {
         );
         assert_eq!(infer_field_schema("-5.13", None), DataType::Float64);
         assert_eq!(infer_field_schema("0.1300", None), DataType::Float64);
+        assert_eq!(
+            infer_field_schema("2021-12-19 13:12:30.921", None),
+            DataType::Timestamp(TimeUnit::Nanosecond, None)
+        );
+        assert_eq!(
+            infer_field_schema("2021-12-19T13:12:30.123456789", None),
+            DataType::Timestamp(TimeUnit::Nanosecond, None)
+        );
     }
 
     #[test]