You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2022/05/25 03:58:02 UTC

[arrow] branch master updated: ARROW-16638: [Go][Parquet] Fix boolean column skip

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5994fd8825 ARROW-16638: [Go][Parquet] Fix boolean column skip
5994fd8825 is described below

commit 5994fd88259b8a6ee0efef7233d75352b7360188
Author: Matt DePero <de...@neeva.co>
AuthorDate: Tue May 24 23:57:52 2022 -0400

    ARROW-16638: [Go][Parquet] Fix boolean column skip
    
    Uses `nil` for `defLvls` and `repLvls` when skipping boolean values, since the scratch buffer allocated for n boolean values when skipping is not large enough to hold n def and rep levels, resulting in an [out of bounds panic](https://github.com/apache/arrow/blob/4c21fd12f93e4853c03c05919ffb22c6bb8f09b0/go/parquet/file/column_reader.go#L407) when skipping too many rows.
    
    Closes #13221 from mdepero/go-boolskip
    
    Authored-by: Matt DePero <de...@neeva.co>
    Signed-off-by: Matthew Topol <mt...@factset.com>
---
 go/parquet/file/column_reader_test.go           | 87 +++++++++++++++++++++++++
 go/parquet/file/column_reader_types.gen.go      |  4 +-
 go/parquet/file/column_reader_types.gen.go.tmpl |  6 +-
 3 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/go/parquet/file/column_reader_test.go b/go/parquet/file/column_reader_test.go
index f3a2fe7c49..b14db23c09 100755
--- a/go/parquet/file/column_reader_test.go
+++ b/go/parquet/file/column_reader_test.go
@@ -310,6 +310,93 @@ func (p *PrimitiveReaderSuite) TestBoolFlatOptional() {
 	p.testPlain(npages, levelsPerPage, d, reflect.TypeOf(true))
 }
 
+func (p *PrimitiveReaderSuite) TestBoolFlatOptionalSkip() {
+	const (
+		levelsPerPage int = 100
+		npages        int = 5
+	)
+
+	p.maxDefLvl = 4
+	p.maxRepLvl = 0
+	typ := schema.NewBooleanNode("a", parquet.Repetitions.Optional, -1)
+	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
+	p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(true), parquet.Encodings.Plain)
+	p.initReader(d)
+
+	vresult := make([]bool, levelsPerPage/2)
+	dresult := make([]int16, levelsPerPage/2)
+	rresult := make([]int16, levelsPerPage/2)
+
+	rdr := p.reader.(*file.BooleanColumnChunkReader)
+
+	values := p.values.Interface().([]bool)
+	rIdx := int64(0)
+
+	p.Run("skip_size > page_size", func() {
+		// skip first 2 pages
+		skipped, _ := rdr.Skip(int64(2 * levelsPerPage))
+		// move test values forward
+		for i := int64(0); i < skipped; i++ {
+			if p.defLevels[rIdx] == p.maxDefLvl {
+				values = values[1:]
+			}
+			rIdx++
+		}
+		p.Equal(int64(2*levelsPerPage), skipped)
+
+		// Read half a page
+		rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+		subVals := values[0:valsRead]
+		p.Equal(subVals, vresult[:valsRead])
+		// move test values forward
+		rIdx += rowsRead
+		values = values[valsRead:]
+	})
+
+	p.Run("skip_size == page_size", func() {
+		// skip one page worth of values across page 2 and 3
+		skipped, _ := rdr.Skip(int64(levelsPerPage))
+		// move test values forward
+		for i := int64(0); i < skipped; i++ {
+			if p.defLevels[rIdx] == p.maxDefLvl {
+				values = values[1:]
+			}
+			rIdx++
+		}
+		p.Equal(int64(levelsPerPage), skipped)
+
+		// read half a page
+		rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+		subVals := values[0:valsRead]
+		p.Equal(subVals, vresult[:valsRead])
+		// move test values forward
+		rIdx += rowsRead
+		values = values[valsRead:]
+	})
+
+	p.Run("skip_size < page_size", func() {
+		// skip limited to a single page
+		// skip half a page
+		skipped, _ := rdr.Skip(int64(levelsPerPage / 2))
+		// move test values forward
+		for i := int64(0); i < skipped; i++ {
+			if p.defLevels[rIdx] == p.maxDefLvl {
+				values = values[1:] // move test values forward
+			}
+			rIdx++
+		}
+		p.Equal(int64(0.5*float32(levelsPerPage)), skipped)
+
+		// Read half a page
+		rowsRead, valsRead, _ := rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
+		subVals := values[0:valsRead]
+		p.Equal(subVals, vresult[:valsRead])
+		// move test values forward
+		rIdx += rowsRead
+		values = values[valsRead:]
+	})
+}
+
 func (p *PrimitiveReaderSuite) TestInt32FlatRequired() {
 	const (
 		levelsPerPage int = 100
diff --git a/go/parquet/file/column_reader_types.gen.go b/go/parquet/file/column_reader_types.gen.go
index 7850e7a19d..6163b35b7a 100644
--- a/go/parquet/file/column_reader_types.gen.go
+++ b/go/parquet/file/column_reader_types.gen.go
@@ -209,8 +209,8 @@ func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error) {
 		func(batch int64, buf []byte) (int64, error) {
 			vals, _, err := cr.ReadBatch(batch,
 				*(*[]bool)(unsafe.Pointer(&buf)),
-				arrow.Int16Traits.CastFromBytes(buf),
-				arrow.Int16Traits.CastFromBytes(buf))
+				nil,
+				nil)
 			return vals, err
 		})
 }
diff --git a/go/parquet/file/column_reader_types.gen.go.tmpl b/go/parquet/file/column_reader_types.gen.go.tmpl
index 42e56dc88c..6a83d389c3 100644
--- a/go/parquet/file/column_reader_types.gen.go.tmpl
+++ b/go/parquet/file/column_reader_types.gen.go.tmpl
@@ -36,11 +36,13 @@ func (cr *{{.Name}}ColumnChunkReader) Skip(nvalues int64) (int64, error) {
       vals, _, err := cr.ReadBatch(batch,
         {{- if ne .Name "Boolean"}}
         {{.prefix}}.{{.Name}}Traits.CastFromBytes(buf),
+        arrow.Int16Traits.CastFromBytes(buf),
+        arrow.Int16Traits.CastFromBytes(buf))
         {{- else}}
         *(*[]bool)(unsafe.Pointer(&buf)),
+        nil,
+        nil)
         {{- end}}
-        arrow.Int16Traits.CastFromBytes(buf),
-        arrow.Int16Traits.CastFromBytes(buf))
       return vals, err
     })
 }