Posted to commits@arrow.apache.org by ze...@apache.org on 2023/05/22 14:52:06 UTC

[arrow] branch main updated: GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 584dc7bcde GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)
584dc7bcde is described below

commit 584dc7bcde7afa0c67cedf737bac6b4b7a221088
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon May 22 10:51:58 2023 -0400

    GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)
    
    ### Rationale for this change
    If a column being written to Parquet is a list array containing only nulls, its child array will be empty and may have nil buffers. This case isn't checked in the `writeDenseArrow` function, so we blindly attempt to grab the bytes from buffers that might be nil.
    
    ### What changes are included in this PR?
    Adding a check of `leafArr.Len()` at the top of `writeDenseArrow`: if the leaf array's length is greater than zero, we can be sure the data buffer is non-nil, so we simply bail out when the leaf array is empty since there is nothing to write. The diff also guards the individual value-buffer accesses with nil checks.
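    
    For illustration only (not part of this PR), the failing shape is easy to reproduce with a list builder; the import path below assumes the v13 Go module:
    
    ```go
    package main
    
    import (
    	"fmt"
    
    	"github.com/apache/arrow/go/v13/arrow"
    	"github.com/apache/arrow/go/v13/arrow/array"
    	"github.com/apache/arrow/go/v13/arrow/memory"
    )
    
    func main() {
    	// Build a list<date32> column containing only nulls.
    	bldr := array.NewListBuilder(memory.DefaultAllocator, arrow.FixedWidthTypes.Date32)
    	defer bldr.Release()
    	bldr.AppendNull()
    
    	lst := bldr.NewListArray()
    	defer lst.Release()
    
    	// The child (leaf) array never had a value appended, so it is empty
    	// and its value buffer may never have been allocated.
    	child := lst.ListValues()
    	fmt.Println(child.Len())                      // 0
    	fmt.Println(child.Data().Buffers()[1] == nil) // may print true
    }
    ```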
    
    ### Are these changes tested?
    Yes, a unit test is added.
    
    * Closes: #35684
    
    Authored-by: Matt Topol <zo...@gmail.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/parquet/pqarrow/encode_arrow.go      | 30 ++++++++++++++++++++----------
 go/parquet/pqarrow/encode_arrow_test.go | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/go/parquet/pqarrow/encode_arrow.go b/go/parquet/pqarrow/encode_arrow.go
index a7a1f113e9..dea412c8b9 100644
--- a/go/parquet/pqarrow/encode_arrow.go
+++ b/go/parquet/pqarrow/encode_arrow.go
@@ -287,12 +287,16 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
 		case arrow.INT32:
 			data = leafArr.(*array.Int32).Int32Values()
 		case arrow.DATE32, arrow.UINT32:
-			data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
-			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
-		case arrow.TIME32:
-			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
+			if leafArr.Data().Buffers()[1] != nil {
 				data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
 				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+			}
+		case arrow.TIME32:
+			if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
+				if leafArr.Data().Buffers()[1] != nil {
+					data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+				}
 			} else { // coerce time32 if necessary by multiplying by 1000
 				ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
 				data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
@@ -351,8 +355,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
 				// user explicitly requested coercion to specific unit
 				if tstype.Unit == ctx.props.coerceTimestampUnit {
 					// no conversion necessary
-					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
-					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+					if leafArr.Data().Buffers()[1] != nil {
+						data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+						data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+					}
 				} else {
 					ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
 					data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
@@ -380,8 +386,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
 				}
 			} else {
 				// no data conversion neccessary
-				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
-				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+				if leafArr.Data().Buffers()[1] != nil {
+					data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+					data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+				}
 			}
 		case arrow.UINT32:
 			ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
@@ -392,8 +400,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
 		case arrow.INT64:
 			data = leafArr.(*array.Int64).Int64Values()
 		case arrow.UINT64, arrow.TIME64, arrow.DATE64:
-			data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
-			data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+			if leafArr.Data().Buffers()[1] != nil {
+				data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+				data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+			}
 		default:
 			return fmt.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
 		}
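
A note for readers adapting similar writer code elsewhere: the guard repeated in the hunks above can be factored into a small helper. The sketch below is editorial and not part of the commit; the package and helper name are hypothetical, and the import path assumes the v13 Go module.

```go
package pqhelpers // hypothetical package, for illustration only

import "github.com/apache/arrow/go/v13/arrow"

// int32ValuesOrNil mirrors the guard added in the patch above: it returns the
// int32 values of a primitive array, or nil when the value buffer was never
// allocated (e.g. the empty child of an all-null list column).
func int32ValuesOrNil(arr arrow.Array) []int32 {
	buf := arr.Data().Buffers()[1]
	if buf == nil {
		return nil
	}
	vals := arrow.Int32Traits.CastFromBytes(buf.Bytes())
	off := arr.Data().Offset()
	return vals[off : off+arr.Len()]
}
```
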
diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go
index f9c1165a6f..877f584f20 100644
--- a/go/parquet/pqarrow/encode_arrow_test.go
+++ b/go/parquet/pqarrow/encode_arrow_test.go
@@ -359,6 +359,38 @@ func simpleRoundTrip(t *testing.T, tbl arrow.Table, rowGroupSize int64) {
 	}
 }
 
+func TestWriteEmptyLists(t *testing.T) {
+	sc := arrow.NewSchema([]arrow.Field{
+		{Name: "f1", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)},
+		{Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)},
+		{Name: "f3", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_us)},
+		{Name: "f4", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_ms)},
+		{Name: "f5", Type: arrow.ListOf(arrow.FixedWidthTypes.Time32ms)},
+		{Name: "f6", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64ns)},
+		{Name: "f7", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64us)},
+	}, nil)
+	bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc)
+	defer bldr.Release()
+	for _, b := range bldr.Fields() {
+		b.AppendNull()
+	}
+
+	rec := bldr.NewRecord()
+	defer rec.Release()
+
+	props := parquet.NewWriterProperties(
+		parquet.WithVersion(parquet.V1_0),
+	)
+	arrprops := pqarrow.DefaultWriterProps()
+	var buf bytes.Buffer
+	fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops)
+	require.NoError(t, err)
+	err = fw.Write(rec)
+	require.NoError(t, err)
+	err = fw.Close()
+	require.NoError(t, err)
+}
+
 func TestArrowReadWriteTableChunkedCols(t *testing.T) {
 	chunkSizes := []int{2, 4, 10, 2}
 	const totalLen = int64(18)