You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/05/25 03:58:02 UTC
[arrow] branch master updated: ARROW-16638: [Go][Parquet] Fix boolean column skip
This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5994fd8825 ARROW-16638: [Go][Parquet] Fix boolean column skip
5994fd8825 is described below
commit 5994fd88259b8a6ee0efef7233d75352b7360188
Author: Matt DePero <de...@neeva.co>
AuthorDate: Tue May 24 23:57:52 2022 -0400
ARROW-16638: [Go][Parquet] Fix boolean column skip
Uses `nil` for `defLvls` and `repLvls` when skipping boolean values, since the scratch buffer allocated for n boolean values when skipping is not large enough to hold n def and rep levels, resulting in an [out of bounds panic](https://github.com/apache/arrow/blob/4c21fd12f93e4853c03c05919ffb22c6bb8f09b0/go/parquet/file/column_reader.go#L407) when skipping too many rows.
Closes #13221 from mdepero/go-boolskip
Authored-by: Matt DePero <de...@neeva.co>
Signed-off-by: Matthew Topol <mt...@factset.com>
---
go/parquet/file/column_reader_test.go | 87 +++++++++++++++++++++++++
go/parquet/file/column_reader_types.gen.go | 4 +-
go/parquet/file/column_reader_types.gen.go.tmpl | 6 +-
3 files changed, 93 insertions(+), 4 deletions(-)
diff --git a/go/parquet/file/column_reader_test.go b/go/parquet/file/column_reader_test.go
index f3a2fe7c49..b14db23c09 100755
--- a/go/parquet/file/column_reader_test.go
+++ b/go/parquet/file/column_reader_test.go
@@ -310,6 +310,93 @@ func (p *PrimitiveReaderSuite) TestBoolFlatOptional() {
p.testPlain(npages, levelsPerPage, d, reflect.TypeOf(true))
}
+func (p *PrimitiveReaderSuite) TestBoolFlatOptionalSkip() {
+ const (
+ levelsPerPage int = 100
+ npages int = 5
+ )
+
+ p.maxDefLvl = 4
+ p.maxRepLvl = 0
+ typ := schema.NewBooleanNode("a", parquet.Repetitions.Optional, -1)
+ d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
+ p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(true), parquet.Encodings.Plain)
+ p.initReader(d)
+
+ vresult := make([]bool, levelsPerPage/2)
+ dresult := make([]int16, levelsPerPage/2)
+ rresult := make([]int16, levelsPerPage/2)
+
+ rdr := p.reader.(*file.BooleanColumnChunkReader)
+
+ values := p.values.Interface().([]bool)
+ rIdx := int64(0)
+
+ p.Run("skip_size > page_size", func() {
+ // skip first 2 pages
+ skipped, _ := rdr.Skip(int64(2 * levelsPerPage))
+ // move test values forward
+ for i := int64(0); i < skipped; i++ {
+ if p.defLevels[rIdx] == p.maxDefLvl {
+ values = values[1:]
+ }
+ rIdx++
+ }
+ p.Equal(int64(2*levelsPerPage), skipped)
+
+ // Read half a page
+ rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+ subVals := values[0:valsRead]
+ p.Equal(subVals, vresult[:valsRead])
+ // move test values forward
+ rIdx += rowsRead
+ values = values[valsRead:]
+ })
+
+ p.Run("skip_size == page_size", func() {
+ // skip one page worth of values across page 2 and 3
+ skipped, _ := rdr.Skip(int64(levelsPerPage))
+ // move test values forward
+ for i := int64(0); i < skipped; i++ {
+ if p.defLevels[rIdx] == p.maxDefLvl {
+ values = values[1:]
+ }
+ rIdx++
+ }
+ p.Equal(int64(levelsPerPage), skipped)
+
+ // read half a page
+ rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+ subVals := values[0:valsRead]
+ p.Equal(subVals, vresult[:valsRead])
+ // move test values forward
+ rIdx += rowsRead
+ values = values[valsRead:]
+ })
+
+ p.Run("skip_size < page_size", func() {
+ // skip limited to a single page
+ // skip half a page
+ skipped, _ := rdr.Skip(int64(levelsPerPage / 2))
+ // move test values forward
+ for i := int64(0); i < skipped; i++ {
+ if p.defLevels[rIdx] == p.maxDefLvl {
+ values = values[1:] // move test values forward
+ }
+ rIdx++
+ }
+ p.Equal(int64(0.5*float32(levelsPerPage)), skipped)
+
+ // Read half a page
+ rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+ subVals := values[0:valsRead]
+ p.Equal(subVals, vresult[:valsRead])
+ // move test values forward
+ rIdx += rowsRead
+ values = values[valsRead:]
+ })
+}
+
func (p *PrimitiveReaderSuite) TestInt32FlatRequired() {
const (
levelsPerPage int = 100
diff --git a/go/parquet/file/column_reader_types.gen.go b/go/parquet/file/column_reader_types.gen.go
index 7850e7a19d..6163b35b7a 100644
--- a/go/parquet/file/column_reader_types.gen.go
+++ b/go/parquet/file/column_reader_types.gen.go
@@ -209,8 +209,8 @@ func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error) {
func(batch int64, buf []byte) (int64, error) {
vals, _, err := cr.ReadBatch(batch,
*(*[]bool)(unsafe.Pointer(&buf)),
- arrow.Int16Traits.CastFromBytes(buf),
- arrow.Int16Traits.CastFromBytes(buf))
+ nil,
+ nil)
return vals, err
})
}
diff --git a/go/parquet/file/column_reader_types.gen.go.tmpl b/go/parquet/file/column_reader_types.gen.go.tmpl
index 42e56dc88c..6a83d389c3 100644
--- a/go/parquet/file/column_reader_types.gen.go.tmpl
+++ b/go/parquet/file/column_reader_types.gen.go.tmpl
@@ -36,11 +36,13 @@ func (cr *{{.Name}}ColumnChunkReader) Skip(nvalues int64) (int64, error) {
vals, _, err := cr.ReadBatch(batch,
{{- if ne .Name "Boolean"}}
{{.prefix}}.{{.Name}}Traits.CastFromBytes(buf),
+ arrow.Int16Traits.CastFromBytes(buf),
+ arrow.Int16Traits.CastFromBytes(buf))
{{- else}}
*(*[]bool)(unsafe.Pointer(&buf)),
+ nil,
+ nil)
{{- end}}
- arrow.Int16Traits.CastFromBytes(buf),
- arrow.Int16Traits.CastFromBytes(buf))
return vals, err
})
}