You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/05/15 17:35:47 UTC

[arrow] branch master updated: ARROW-16504 [Go][CSV] Add arrow.TimestampType support to the reader

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9465466abb ARROW-16504 [Go][CSV] Add arrow.TimestampType support to the reader
9465466abb is described below

commit 9465466abb223b190478f3171e4291db5a14b1ee
Author: Mark Wolfe <ma...@wolfe.id.au>
AuthorDate: Sun May 15 13:35:30 2022 -0400

    ARROW-16504 [Go][CSV] Add arrow.TimestampType support to the reader
    
    There is already a helper to convert strings to arrow.Timestamp, so this change incorporates it into the CSV reader.
    
    The CSV files I am currently working with have RFC3339 timestamps, so I followed some of the JSON code and stuck with the millisecond default.
    
    It was really easy to add this using the existing functions and structure.
    
    Closes #13098 from wolfeidau/ARROW-16504-add-timestamp-support-to-reader
    
    Authored-by: Mark Wolfe <ma...@wolfe.id.au>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/arrow/csv/common.go           |  1 +
 go/arrow/csv/reader.go           | 23 ++++++++++++++++++++++-
 go/arrow/csv/reader_test.go      |  4 ++++
 go/arrow/csv/testdata/header.csv |  8 ++++----
 go/arrow/csv/testdata/types.csv  |  8 ++++----
 5 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go
index a43059b1b0..0f1b9c4bb2 100644
--- a/go/arrow/csv/common.go
+++ b/go/arrow/csv/common.go
@@ -167,6 +167,7 @@ func validate(schema *arrow.Schema) {
 		case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type:
 		case *arrow.Float32Type, *arrow.Float64Type:
 		case *arrow.StringType:
+		case *arrow.TimestampType:
 		default:
 			panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft))
 		}
diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go
index ef59cdd97d..9e270d625f 100644
--- a/go/arrow/csv/reader.go
+++ b/go/arrow/csv/reader.go
@@ -261,7 +261,7 @@ func (r *Reader) read(recs []string) {
 }
 
 func (r *Reader) initFieldConverter(field *arrow.Field) func(array.Builder, string) {
-	switch field.Type.(type) {
+	switch dt := field.Type.(type) {
 	case *arrow.BooleanType:
 		return func(field array.Builder, str string) {
 			r.parseBool(field, str)
@@ -321,6 +321,10 @@ func (r *Reader) initFieldConverter(field *arrow.Field) func(array.Builder, stri
 				field.(*array.StringBuilder).Append(str)
 			}
 		}
+	case *arrow.TimestampType:
+		return func(field array.Builder, str string) {
+			r.parseTimestamp(field, str, dt.Unit)
+		}
 
 	default:
 		panic(fmt.Errorf("arrow/csv: unhandled field type %T", field.Type))
@@ -507,6 +511,23 @@ func (r *Reader) parseFloat64(field array.Builder, str string) {
 	field.(*array.Float64Builder).Append(v)
 }
 
+// parseTimestamp appends str to field as a timestamp in the given unit via arrow.TimestampFromString; null markers and parse failures append null.
+func (r *Reader) parseTimestamp(field array.Builder, str string, unit arrow.TimeUnit) {
+	if r.isNull(str) {
+		field.AppendNull()
+		return
+	}
+	v, err := arrow.TimestampFromString(str, unit)
+	if err != nil {
+		if r.err == nil { // record only the first error seen by the reader
+			r.err = err
+		}
+		field.AppendNull() // never append a value that came back with an error
+		return
+	}
+	field.(*array.TimestampBuilder).Append(v)
+}
+
 // Retain increases the reference count by 1.
 // Retain may be called simultaneously from multiple goroutines.
 func (r *Reader) Retain() {
diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go
index d8a67b91eb..6f0316e35a 100644
--- a/go/arrow/csv/reader_test.go
+++ b/go/arrow/csv/reader_test.go
@@ -201,6 +201,7 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool) {
 			arrow.Field{Name: "f32", Type: arrow.PrimitiveTypes.Float32},
 			arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
 			arrow.Field{Name: "str", Type: arrow.BinaryTypes.String},
+			arrow.Field{Name: "ts", Type: arrow.FixedWidthTypes.Timestamp_ms},
 		},
 		nil,
 	)
@@ -246,6 +247,7 @@ rec[0]["u64"]: [1]
 rec[0]["f32"]: [1.1]
 rec[0]["f64"]: [1.1]
 rec[0]["str"]: ["str-1"]
+rec[0]["ts"]: [1652054461000]
 rec[1]["bool"]: [false]
 rec[1]["i8"]: [-2]
 rec[1]["i16"]: [-2]
@@ -258,6 +260,7 @@ rec[1]["u64"]: [2]
 rec[1]["f32"]: [2.2]
 rec[1]["f64"]: [2.2]
 rec[1]["str"]: ["str-2"]
+rec[1]["ts"]: [1652140799000]
 rec[2]["bool"]: [(null)]
 rec[2]["i8"]: [(null)]
 rec[2]["i16"]: [(null)]
@@ -270,6 +273,7 @@ rec[2]["u64"]: [(null)]
 rec[2]["f32"]: [(null)]
 rec[2]["f64"]: [(null)]
 rec[2]["str"]: [(null)]
+rec[2]["ts"]: [(null)]
 `
 
 	if got, want := out.String(), want; got != want {
diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv
index bbcd33f44f..a55f5ff062 100644
--- a/go/arrow/csv/testdata/header.csv
+++ b/go/arrow/csv/testdata/header.csv
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2
-null;null;null;null;null;null;null;null;null;null;null;null
+bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null
\ No newline at end of file
diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv
index 1c9c4afe95..14153e6f05 100644
--- a/go/arrow/csv/testdata/types.csv
+++ b/go/arrow/csv/testdata/types.csv
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2
-null;NULL;null;N/A;;null;null;null;null;null;null;null
+## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null
\ No newline at end of file