You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2023/06/01 19:30:37 UTC
[arrow] branch main updated: GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 61692b69e4 GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)
61692b69e4 is described below
commit 61692b69e4954f50ced110bf46dd92041748776f
Author: Alex Shcherbakov <ca...@users.noreply.github.com>
AuthorDate: Thu Jun 1 22:30:30 2023 +0300
GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)
### Rationale for this change
When comparing `array.Struct` values with `array.ApproxEqual` the validity bitmap of the struct itself should take precedence:
> When reading the struct array the parent validity bitmap takes priority.
This follows a brief discussion from #35851.
### What changes are included in this PR?
`array.arrayApproxEqualStruct` will check the fields data only if the struct elem is valid.
### Are these changes tested?
`pqarrow` tests are updated accordingly (no special treatment for structs, just `array.ApproxEqual`
### Are there any user-facing changes?
`array.ApproxEqual` behavior changed to match the docs about validity bitmap precedence.
* Closes: #35871
Authored-by: candiduslynx <ca...@gmail.com>
Signed-off-by: Matt Topol <zo...@gmail.com>
---
go/arrow/array/compare.go | 20 +++++++---
go/arrow/array/struct.go | 2 +-
go/parquet/pqarrow/column_readers.go | 54 +++++++++++----------------
go/parquet/pqarrow/encode_arrow_test.go | 66 ++++++++++++++++++++-------------
4 files changed, 79 insertions(+), 63 deletions(-)
diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index 48b0df9c84..230c985929 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -22,6 +22,7 @@ import (
"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/float16"
+ "github.com/apache/arrow/go/v13/internal/bitutils"
)
// RecordEqual reports whether the two provided records are equal.
@@ -735,13 +736,22 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption)
}
func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool {
- for i, lf := range left.fields {
- rf := right.fields[i]
- if !arrayApproxEqual(lf, rf, opt) {
- return false
+ return bitutils.VisitSetBitRuns(
+ left.NullBitmapBytes(),
+ int64(left.Offset()), int64(left.Len()),
+ approxEqualStructRun(left, right, opt),
+ ) == nil
+}
+
+func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn {
+ return func(pos int64, length int64) error {
+ for i := range left.fields {
+ if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, opt) {
+ return arrow.ErrInvalid
+ }
}
+ return nil
}
- return true
}
// arrayApproxEqualMap doesn't care about the order of keys (in Go map traversal order is undefined)
diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go
index cb4fb5ec5e..9ebc794006 100644
--- a/go/arrow/array/struct.go
+++ b/go/arrow/array/struct.go
@@ -129,7 +129,7 @@ func (a *Struct) newStructFieldWithParentValidityMask(fieldIndex int) arrow.Arra
maskedNullBitmapBytes := make([]byte, len(nullBitmapBytes))
copy(maskedNullBitmapBytes, nullBitmapBytes)
for i := 0; i < field.Len(); i++ {
- if !a.IsValid(i) {
+ if a.IsNull(i) {
bitutil.ClearBit(maskedNullBitmapBytes, i)
}
}
diff --git a/go/parquet/pqarrow/column_readers.go b/go/parquet/pqarrow/column_readers.go
index df02b2c949..767131f1a8 100644
--- a/go/parquet/pqarrow/column_readers.go
+++ b/go/parquet/pqarrow/column_readers.go
@@ -168,16 +168,6 @@ func (sr *structReader) Release() {
}
func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) *ColumnReader {
- // there could be a mix of children some might be repeated and some might not be
- // if possible use one that isn't since that will be guaranteed to have the least
- // number of levels to reconstruct a nullable bitmap
- var result *ColumnReader
- for _, child := range children {
- if !child.IsOrHasRepeatedChild() {
- result = child
- }
- }
-
ret := &structReader{
rctx: rctx,
filtered: filtered,
@@ -186,10 +176,18 @@ func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.Leve
props: props,
refCount: 1,
}
- if result != nil {
- ret.defRepLevelChild = result
- ret.hasRepeatedChild = false
- } else {
+
+ // there could be a mix of children some might be repeated and some might not be
+ // if possible use one that isn't since that will be guaranteed to have the least
+ // number of levels to reconstruct a nullable bitmap
+ for _, child := range children {
+ if !child.IsOrHasRepeatedChild() {
+ ret.defRepLevelChild = child
+ break
+ }
+ }
+
+ if ret.defRepLevelChild == nil {
ret.defRepLevelChild = children[0]
ret.hasRepeatedChild = true
}
@@ -250,15 +248,16 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
var nullBitmap *memory.Buffer
- if lenBound > 0 {
+ if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) {
+ nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
+ nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
+ validityIO.ValidBits = nullBitmap.Bytes()
+ defLevels, err := sr.GetDefLevels()
+ if err != nil {
+ return nil, err
+ }
+
if sr.hasRepeatedChild {
- nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
- nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
- validityIO.ValidBits = nullBitmap.Bytes()
- defLevels, err := sr.GetDefLevels()
- if err != nil {
- return nil, err
- }
repLevels, err := sr.GetRepLevels()
if err != nil {
return nil, err
@@ -267,16 +266,7 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
return nil, err
}
-
- } else if sr.filtered.Nullable {
- nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
- nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
- validityIO.ValidBits = nullBitmap.Bytes()
- defLevels, err := sr.GetDefLevels()
- if err != nil {
- return nil, err
- }
-
+ } else {
file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
}
}
diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go
index 877f584f20..31f8a2b5ab 100644
--- a/go/parquet/pqarrow/encode_arrow_test.go
+++ b/go/parquet/pqarrow/encode_arrow_test.go
@@ -19,7 +19,6 @@ package pqarrow_test
import (
"bytes"
"context"
- "errors"
"fmt"
"math"
"strconv"
@@ -32,7 +31,6 @@ import (
"github.com/apache/arrow/go/v13/arrow/decimal128"
"github.com/apache/arrow/go/v13/arrow/ipc"
"github.com/apache/arrow/go/v13/arrow/memory"
- "github.com/apache/arrow/go/v13/internal/bitutils"
"github.com/apache/arrow/go/v13/internal/types"
"github.com/apache/arrow/go/v13/internal/utils"
"github.com/apache/arrow/go/v13/parquet"
@@ -931,6 +929,44 @@ func (ps *ParquetIOTestSuite) TestReadDecimals() {
ps.True(array.Equal(expected, chunked.Chunk(0)))
}
+func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(ps.T(), 0)
+
+ dt := arrow.StructOf(arrow.Field{
+ Name: "nested",
+ Type: arrow.StructOf(
+ arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
+ arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32},
+ arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64},
+ ),
+ })
+ field := arrow.Field{Name: "struct", Type: dt, Nullable: true}
+
+ builder := array.NewStructBuilder(mem, dt)
+ defer builder.Release()
+ nested := builder.FieldBuilder(0).(*array.StructBuilder)
+
+ builder.Append(true)
+ nested.Append(true)
+ nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
+ nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
+ nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
+ builder.AppendNull()
+
+ arr := builder.NewStructArray()
+ defer arr.Release()
+
+ expected := array.NewTable(
+ arrow.NewSchema([]arrow.Field{field}, nil),
+ []arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))},
+ -1,
+ )
+ defer arr.Release() // NewChunked
+ defer expected.Release()
+ ps.roundTripTable(mem, expected, true)
+}
+
func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
var buf bytes.Buffer
arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
@@ -1032,29 +1068,9 @@ func (ps *ParquetIOTestSuite) roundTripTable(_ memory.Allocator, expected arrow.
tblChunk := tbl.Column(0).Data()
ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
- if exChunk.DataType().ID() != arrow.STRUCT {
- exc := exChunk.Chunk(0)
- tbc := tblChunk.Chunk(0)
- ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
- } else {
- // current impl of ArrayEquals for structs doesn't correctly handle nulls in the parent
- // with a non-nullable child when comparing. Since after the round trip, the data in the
- // child will have the nulls, not the original data.
- ex := exChunk.Chunk(0)
- tb := tblChunk.Chunk(0)
- ps.Equal(ex.NullN(), tb.NullN())
- if ex.NullN() > 0 {
- ps.Equal(ex.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(ex.Len())))], tb.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(tb.Len())))])
- }
- ps.Equal(ex.Len(), tb.Len())
- // only compare the non-null values
- ps.NoErrorf(bitutils.VisitSetBitRuns(ex.NullBitmapBytes(), int64(ex.Data().Offset()), int64(ex.Len()), func(pos, length int64) error {
- if !ps.True(array.SliceEqual(ex, pos, pos+length, tb, pos, pos+length)) {
- return errors.New("failed")
- }
- return nil
- }), "expected: %s\ngot: %s", ex, tb)
- }
+ exc := exChunk.Chunk(0)
+ tbc := tblChunk.Chunk(0)
+ ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
}
func makeEmptyListsArray(size int) arrow.Array {