You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2023/05/22 14:52:06 UTC
[arrow] branch main updated: GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 584dc7bcde GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)
584dc7bcde is described below
commit 584dc7bcde7afa0c67cedf737bac6b4b7a221088
Author: Matt Topol <zo...@gmail.com>
AuthorDate: Mon May 22 10:51:58 2023 -0400
GH-35684: [Go][Parquet] Fix nil dereference with nil list array (#35690)
### Rationale for this change
If a column to be written to parquet is a list array with only nulls in it, then the child will be empty and potentially have nil buffers. This case isn't checked in the `writeDenseArrow` function, so we blindly attempt to grab the bytes from the buffers which might be nil.
### What changes are included in this PR?
Adding a check of `leafArr.Len()` at the top of `writeDenseArrow`. If the leaf array's length is > 0 we can be sure that the data buffer is non-nil. So we can just bail if the leaf array is empty, nothing to write.
### Are these changes tested?
Unit test is added.
* Closes: #35684
Authored-by: Matt Topol <zo...@gmail.com>
Signed-off-by: Matt Topol <zo...@gmail.com>
---
go/parquet/pqarrow/encode_arrow.go | 30 ++++++++++++++++++++----------
go/parquet/pqarrow/encode_arrow_test.go | 32 ++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+), 10 deletions(-)
diff --git a/go/parquet/pqarrow/encode_arrow.go b/go/parquet/pqarrow/encode_arrow.go
index a7a1f113e9..dea412c8b9 100644
--- a/go/parquet/pqarrow/encode_arrow.go
+++ b/go/parquet/pqarrow/encode_arrow.go
@@ -287,12 +287,16 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
case arrow.INT32:
data = leafArr.(*array.Int32).Int32Values()
case arrow.DATE32, arrow.UINT32:
- data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
- data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
- case arrow.TIME32:
- if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
+ if leafArr.Data().Buffers()[1] != nil {
data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ }
+ case arrow.TIME32:
+ if leafArr.DataType().(*arrow.Time32Type).Unit != arrow.Second {
+ if leafArr.Data().Buffers()[1] != nil {
+ data = arrow.Int32Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+ data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ }
} else { // coerce time32 if necessary by multiplying by 1000
ctx.dataBuffer.ResizeNoShrink(arrow.Int32Traits.BytesRequired(leafArr.Len()))
data = arrow.Int32Traits.CastFromBytes(ctx.dataBuffer.Bytes())
@@ -351,8 +355,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
// user explicitly requested coercion to specific unit
if tstype.Unit == ctx.props.coerceTimestampUnit {
// no conversion necessary
- data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
- data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ if leafArr.Data().Buffers()[1] != nil {
+ data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+ data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ }
} else {
ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
data = arrow.Int64Traits.CastFromBytes(ctx.dataBuffer.Bytes())
@@ -380,8 +386,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
}
} else {
// no data conversion neccessary
- data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
- data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ if leafArr.Data().Buffers()[1] != nil {
+ data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+ data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ }
}
case arrow.UINT32:
ctx.dataBuffer.ResizeNoShrink(arrow.Int64Traits.BytesRequired(leafArr.Len()))
@@ -392,8 +400,10 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
case arrow.INT64:
data = leafArr.(*array.Int64).Int64Values()
case arrow.UINT64, arrow.TIME64, arrow.DATE64:
- data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
- data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ if leafArr.Data().Buffers()[1] != nil {
+ data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes())
+ data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()]
+ }
default:
return fmt.Errorf("unimplemented arrow type to write to int64 column: %s", leafArr.DataType().Name())
}
diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go
index f9c1165a6f..877f584f20 100644
--- a/go/parquet/pqarrow/encode_arrow_test.go
+++ b/go/parquet/pqarrow/encode_arrow_test.go
@@ -359,6 +359,38 @@ func simpleRoundTrip(t *testing.T, tbl arrow.Table, rowGroupSize int64) {
}
}
+func TestWriteEmptyLists(t *testing.T) {
+ sc := arrow.NewSchema([]arrow.Field{
+ {Name: "f1", Type: arrow.ListOf(arrow.FixedWidthTypes.Date32)},
+ {Name: "f2", Type: arrow.ListOf(arrow.FixedWidthTypes.Date64)},
+ {Name: "f3", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_us)},
+ {Name: "f4", Type: arrow.ListOf(arrow.FixedWidthTypes.Timestamp_ms)},
+ {Name: "f5", Type: arrow.ListOf(arrow.FixedWidthTypes.Time32ms)},
+ {Name: "f6", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64ns)},
+ {Name: "f7", Type: arrow.ListOf(arrow.FixedWidthTypes.Time64us)},
+ }, nil)
+ bldr := array.NewRecordBuilder(memory.DefaultAllocator, sc)
+ defer bldr.Release()
+ for _, b := range bldr.Fields() {
+ b.AppendNull()
+ }
+
+ rec := bldr.NewRecord()
+ defer rec.Release()
+
+ props := parquet.NewWriterProperties(
+ parquet.WithVersion(parquet.V1_0),
+ )
+ arrprops := pqarrow.DefaultWriterProps()
+ var buf bytes.Buffer
+ fw, err := pqarrow.NewFileWriter(sc, &buf, props, arrprops)
+ require.NoError(t, err)
+ err = fw.Write(rec)
+ require.NoError(t, err)
+ err = fw.Close()
+ require.NoError(t, err)
+}
+
func TestArrowReadWriteTableChunkedCols(t *testing.T) {
chunkSizes := []int{2, 4, 10, 2}
const totalLen = int64(18)