You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2023/06/01 19:30:37 UTC

[arrow] branch main updated: GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 61692b69e4 GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)
61692b69e4 is described below

commit 61692b69e4954f50ced110bf46dd92041748776f
Author: Alex Shcherbakov <ca...@users.noreply.github.com>
AuthorDate: Thu Jun 1 22:30:30 2023 +0300

    GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` (#35872)
    
    ### Rationale for this change
    
    When comparing `array.Struct` values with `array.ApproxEqual` the validity bitmap of the struct itself should take precedence:
    > When reading the struct array the parent validity bitmap takes priority.
    
    This follows a brief discussion from #35851.
    
    ### What changes are included in this PR?
    
    `array.arrayApproxEqualStruct` will check the fields data only if the struct elem is valid.
    
    ### Are these changes tested?
    
    `pqarrow` tests are updated accordingly (no special treatment for structs, just `array.ApproxEqual`
    
    ### Are there any user-facing changes?
    
    `array.ApproxEqual` behavior changed to match the docs about validity bitmap precedence.
    
    * Closes: #35871
    
    Authored-by: candiduslynx <ca...@gmail.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/arrow/array/compare.go               | 20 +++++++---
 go/arrow/array/struct.go                |  2 +-
 go/parquet/pqarrow/column_readers.go    | 54 +++++++++++----------------
 go/parquet/pqarrow/encode_arrow_test.go | 66 ++++++++++++++++++++-------------
 4 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index 48b0df9c84..230c985929 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -22,6 +22,7 @@ import (
 
 	"github.com/apache/arrow/go/v13/arrow"
 	"github.com/apache/arrow/go/v13/arrow/float16"
+	"github.com/apache/arrow/go/v13/internal/bitutils"
 )
 
 // RecordEqual reports whether the two provided records are equal.
@@ -735,13 +736,22 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption)
 }
 
 func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool {
-	for i, lf := range left.fields {
-		rf := right.fields[i]
-		if !arrayApproxEqual(lf, rf, opt) {
-			return false
+	return bitutils.VisitSetBitRuns(
+		left.NullBitmapBytes(),
+		int64(left.Offset()), int64(left.Len()),
+		approxEqualStructRun(left, right, opt),
+	) == nil
+}
+
+func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn {
+	return func(pos int64, length int64) error {
+		for i := range left.fields {
+			if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, opt) {
+				return arrow.ErrInvalid
+			}
 		}
+		return nil
 	}
-	return true
 }
 
 // arrayApproxEqualMap doesn't care about the order of keys (in Go map traversal order is undefined)
diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go
index cb4fb5ec5e..9ebc794006 100644
--- a/go/arrow/array/struct.go
+++ b/go/arrow/array/struct.go
@@ -129,7 +129,7 @@ func (a *Struct) newStructFieldWithParentValidityMask(fieldIndex int) arrow.Arra
 	maskedNullBitmapBytes := make([]byte, len(nullBitmapBytes))
 	copy(maskedNullBitmapBytes, nullBitmapBytes)
 	for i := 0; i < field.Len(); i++ {
-		if !a.IsValid(i) {
+		if a.IsNull(i) {
 			bitutil.ClearBit(maskedNullBitmapBytes, i)
 		}
 	}
diff --git a/go/parquet/pqarrow/column_readers.go b/go/parquet/pqarrow/column_readers.go
index df02b2c949..767131f1a8 100644
--- a/go/parquet/pqarrow/column_readers.go
+++ b/go/parquet/pqarrow/column_readers.go
@@ -168,16 +168,6 @@ func (sr *structReader) Release() {
 }
 
 func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) *ColumnReader {
-	// there could be a mix of children some might be repeated and some might not be
-	// if possible use one that isn't since that will be guaranteed to have the least
-	// number of levels to reconstruct a nullable bitmap
-	var result *ColumnReader
-	for _, child := range children {
-		if !child.IsOrHasRepeatedChild() {
-			result = child
-		}
-	}
-
 	ret := &structReader{
 		rctx:      rctx,
 		filtered:  filtered,
@@ -186,10 +176,18 @@ func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.Leve
 		props:     props,
 		refCount:  1,
 	}
-	if result != nil {
-		ret.defRepLevelChild = result
-		ret.hasRepeatedChild = false
-	} else {
+
+	// there could be a mix of children some might be repeated and some might not be
+	// if possible use one that isn't since that will be guaranteed to have the least
+	// number of levels to reconstruct a nullable bitmap
+	for _, child := range children {
+		if !child.IsOrHasRepeatedChild() {
+			ret.defRepLevelChild = child
+			break
+		}
+	}
+
+	if ret.defRepLevelChild == nil {
 		ret.defRepLevelChild = children[0]
 		ret.hasRepeatedChild = true
 	}
@@ -250,15 +248,16 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
 
 	var nullBitmap *memory.Buffer
 
-	if lenBound > 0 {
+	if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) {
+		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
+		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
+		validityIO.ValidBits = nullBitmap.Bytes()
+		defLevels, err := sr.GetDefLevels()
+		if err != nil {
+			return nil, err
+		}
+
 		if sr.hasRepeatedChild {
-			nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
-			nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
-			validityIO.ValidBits = nullBitmap.Bytes()
-			defLevels, err := sr.GetDefLevels()
-			if err != nil {
-				return nil, err
-			}
 			repLevels, err := sr.GetRepLevels()
 			if err != nil {
 				return nil, err
@@ -267,16 +266,7 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
 			if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
 				return nil, err
 			}
-
-		} else if sr.filtered.Nullable {
-			nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
-			nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
-			validityIO.ValidBits = nullBitmap.Bytes()
-			defLevels, err := sr.GetDefLevels()
-			if err != nil {
-				return nil, err
-			}
-
+		} else {
 			file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
 		}
 	}
diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go
index 877f584f20..31f8a2b5ab 100644
--- a/go/parquet/pqarrow/encode_arrow_test.go
+++ b/go/parquet/pqarrow/encode_arrow_test.go
@@ -19,7 +19,6 @@ package pqarrow_test
 import (
 	"bytes"
 	"context"
-	"errors"
 	"fmt"
 	"math"
 	"strconv"
@@ -32,7 +31,6 @@ import (
 	"github.com/apache/arrow/go/v13/arrow/decimal128"
 	"github.com/apache/arrow/go/v13/arrow/ipc"
 	"github.com/apache/arrow/go/v13/arrow/memory"
-	"github.com/apache/arrow/go/v13/internal/bitutils"
 	"github.com/apache/arrow/go/v13/internal/types"
 	"github.com/apache/arrow/go/v13/internal/utils"
 	"github.com/apache/arrow/go/v13/parquet"
@@ -931,6 +929,44 @@ func (ps *ParquetIOTestSuite) TestReadDecimals() {
 	ps.True(array.Equal(expected, chunked.Chunk(0)))
 }
 
+func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
+	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+	defer mem.AssertSize(ps.T(), 0)
+
+	dt := arrow.StructOf(arrow.Field{
+		Name: "nested",
+		Type: arrow.StructOf(
+			arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
+			arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32},
+			arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64},
+		),
+	})
+	field := arrow.Field{Name: "struct", Type: dt, Nullable: true}
+
+	builder := array.NewStructBuilder(mem, dt)
+	defer builder.Release()
+	nested := builder.FieldBuilder(0).(*array.StructBuilder)
+
+	builder.Append(true)
+	nested.Append(true)
+	nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
+	nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
+	nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
+	builder.AppendNull()
+
+	arr := builder.NewStructArray()
+	defer arr.Release()
+
+	expected := array.NewTable(
+		arrow.NewSchema([]arrow.Field{field}, nil),
+		[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))},
+		-1,
+	)
+	defer arr.Release() // NewChunked
+	defer expected.Release()
+	ps.roundTripTable(mem, expected, true)
+}
+
 func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
 	var buf bytes.Buffer
 	arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
@@ -1032,29 +1068,9 @@ func (ps *ParquetIOTestSuite) roundTripTable(_ memory.Allocator, expected arrow.
 	tblChunk := tbl.Column(0).Data()
 
 	ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
-	if exChunk.DataType().ID() != arrow.STRUCT {
-		exc := exChunk.Chunk(0)
-		tbc := tblChunk.Chunk(0)
-		ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
-	} else {
-		// current impl of ArrayEquals for structs doesn't correctly handle nulls in the parent
-		// with a non-nullable child when comparing. Since after the round trip, the data in the
-		// child will have the nulls, not the original data.
-		ex := exChunk.Chunk(0)
-		tb := tblChunk.Chunk(0)
-		ps.Equal(ex.NullN(), tb.NullN())
-		if ex.NullN() > 0 {
-			ps.Equal(ex.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(ex.Len())))], tb.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(tb.Len())))])
-		}
-		ps.Equal(ex.Len(), tb.Len())
-		// only compare the non-null values
-		ps.NoErrorf(bitutils.VisitSetBitRuns(ex.NullBitmapBytes(), int64(ex.Data().Offset()), int64(ex.Len()), func(pos, length int64) error {
-			if !ps.True(array.SliceEqual(ex, pos, pos+length, tb, pos, pos+length)) {
-				return errors.New("failed")
-			}
-			return nil
-		}), "expected: %s\ngot: %s", ex, tb)
-	}
+	exc := exChunk.Chunk(0)
+	tbc := tblChunk.Chunk(0)
+	ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
 }
 
 func makeEmptyListsArray(size int) arrow.Array {