You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2023/09/26 15:01:44 UTC
[arrow] branch main updated: GH-37845: [Go][Parquet] Check the number of logical fields instead of physical columns (#37846)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2895af4922 GH-37845: [Go][Parquet] Check the number of logical fields instead of physical columns (#37846)
2895af4922 is described below

commit 2895af492236e5dc42ac665406de8648a9aac6db
Author: Tim Schaub <ts...@users.noreply.github.com>
AuthorDate: Tue Sep 26 09:01:36 2023 -0600

    GH-37845: [Go][Parquet] Check the number of logical fields instead of physical columns (#37846)
    
    ### Rationale for this change
    
    This makes it so trying to read with a column chunk reader consistently returns an error if the index is outside the bounds of the logical fields (currently it panics in some cases and returns an error in others).
    
    ### What changes are included in this PR?
    
    This makes it so the column chunk reader checks the number of logical fields instead of the number of physical columns when checking if an index is out of range.
    
    ### Are these changes tested?
    
    The new test will panics without the accompanying code change.
    
    ### Are there any user-facing changes?
    
    Applications that used to panic will now have an error to handle instead.
    
    * Closes: #37845
    
    Authored-by: Tim Schaub <ti...@planet.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/parquet/pqarrow/file_reader.go      |  4 +-
 go/parquet/pqarrow/file_reader_test.go | 67 ++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/go/parquet/pqarrow/file_reader.go b/go/parquet/pqarrow/file_reader.go
index d54e365b55..d91010c62c 100755
--- a/go/parquet/pqarrow/file_reader.go
+++ b/go/parquet/pqarrow/file_reader.go
@@ -394,8 +394,8 @@ func (fr *FileReader) ReadRowGroups(ctx context.Context, indices, rowGroups []in
 }
 
 func (fr *FileReader) getColumnReader(ctx context.Context, i int, colFactory itrFactory) (*ColumnReader, error) {
-	if i < 0 || i >= fr.rdr.MetaData().Schema.NumColumns() {
-		return nil, fmt.Errorf("invalid column index chosen %d, there are only %d columns", i, fr.rdr.MetaData().Schema.NumColumns())
+	if i < 0 || i >= len(fr.Manifest.Fields) {
+		return nil, fmt.Errorf("invalid column index chosen %d, there are only %d columns", i, len(fr.Manifest.Fields))
 	}
 
 	ctx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{
diff --git a/go/parquet/pqarrow/file_reader_test.go b/go/parquet/pqarrow/file_reader_test.go
index 2b4aa8ab78..d1f3ae1c98 100644
--- a/go/parquet/pqarrow/file_reader_test.go
+++ b/go/parquet/pqarrow/file_reader_test.go
@@ -19,9 +19,11 @@ package pqarrow_test
 import (
 	"bytes"
 	"context"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 
 	"github.com/apache/arrow/go/v14/arrow"
@@ -216,3 +218,68 @@ func TestFileReaderWriterMetadata(t *testing.T) {
 	assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys())
 	assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values())
 }
+
+func TestFileReaderColumnChunkBoundsErrors(t *testing.T) {
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "zero", Type: arrow.PrimitiveTypes.Float64},
+		{Name: "g", Type: arrow.StructOf(
+			arrow.Field{Name: "one", Type: arrow.PrimitiveTypes.Float64},
+			arrow.Field{Name: "two", Type: arrow.PrimitiveTypes.Float64},
+			arrow.Field{Name: "three", Type: arrow.PrimitiveTypes.Float64},
+		)},
+	}, nil)
+
+	// generate Parquet data with four columns
+	// that are represented by two logical fields
+	data := `[
+		{
+			"zero": 1,
+			"g": {
+				"one": 1,
+				"two": 1,
+				"three": 1
+			}
+		},
+		{
+			"zero": 2,
+			"g": {
+				"one": 2,
+				"two": 2,
+				"three": 2
+			}
+		}
+	]`
+
+	record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data))
+	require.NoError(t, err)
+
+	output := &bytes.Buffer{}
+	writer, err := pqarrow.NewFileWriter(schema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
+	require.NoError(t, err)
+
+	require.NoError(t, writer.Write(record))
+	require.NoError(t, writer.Close())
+
+	fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes()))
+	require.NoError(t, err)
+
+	arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
+	require.NoError(t, err)
+
+	// assert that errors are returned for indexes outside the bounds of the logical fields (instead of the physical columns)
+	ctx := pqarrow.NewArrowWriteContext(context.Background(), nil)
+	assert.Greater(t, fileReader.NumRowGroups(), 0)
+	for rowGroupIndex := 0; rowGroupIndex < fileReader.NumRowGroups(); rowGroupIndex += 1 {
+		rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
+		for fieldNum := 0; fieldNum < schema.NumFields(); fieldNum += 1 {
+			_, err := rowGroupReader.Column(fieldNum).Read(ctx)
+			assert.NoError(t, err, "reading field num: %d", fieldNum)
+		}
+
+		_, subZeroErr := rowGroupReader.Column(-1).Read(ctx)
+		assert.Error(t, subZeroErr)
+
+		_, tooHighErr := rowGroupReader.Column(schema.NumFields()).Read(ctx)
+		assert.ErrorContains(t, tooHighErr, fmt.Sprintf("there are only %d columns", schema.NumFields()))
+	}
+}