You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "lidavidm (via GitHub)" <gi...@apache.org> on 2023/02/10 19:26:36 UTC

[GitHub] [arrow] lidavidm commented on a diff in pull request #34127: GH-34101: [Go][Parquet] NewSchemaManifest creates wrong schema field

lidavidm commented on code in PR #34127:
URL: https://github.com/apache/arrow/pull/34127#discussion_r1103151700


##########
go/parquet/pqarrow/schema_test.go:
##########
@@ -309,3 +310,63 @@ func TestConvertArrowStruct(t *testing.T) {
 		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
 	}
 }
+
+func TestListStructBackwardCompatible(t *testing.T) {
+	// Set up old construction for list of struct, not using
+	// the 3-level encoding. Schema looks like:
+	//
+	//     required group field_id=-1 root {
+	//       optional group field_id=-1 answers (List) {
+	//         repeated group field_id=-1 array {
+	//           optional byte_array field_id=-1 type (String);
+	//           optional byte_array field_id=-1 rdata (String);
+	//           optional byte_array field_id=-1 class (String);
+	//         }
+	//       }
+	//     }
+	//
+	// Instead of the proper 3-level encoding which would be:
+	//
+	//     repeated group field_id=-1 schema {
+	//       optional group field_id=-1 answers (List) {
+	//         repeated group field_id=-1 list {
+	//           optional group field_id=-1 element {
+	//             optional byte_array field_id=-1 type (String);
+	//             optional byte_array field_id=-1 rdata (String);
+	//             optional byte_array field_id=-1 class (String);
+	//           }
+	//         }
+	//       }
+	//     }
+	//
+	pqSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
+		schema.Must(schema.NewGroupNodeLogical("answers", parquet.Repetitions.Optional, schema.FieldList{
+			schema.Must(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
+				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type", parquet.Repetitions.Optional,
+					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
+				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata", parquet.Repetitions.Optional,
+					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
+				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class", parquet.Repetitions.Optional,
+					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
+			}, -1)),
+		}, schema.NewListLogicalType(), -1)),
+	}, -1)))
+
+	meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
+	// desired equivalent arrow schema would be list<item: struct<type: utf8, rdata: utf8, class: utf8>>
+	arrowSchema := arrow.NewSchema(
+		[]arrow.Field{
+			{Name: "answers", Type: arrow.ListOfField(arrow.Field{
+				Name: "array", Type: arrow.StructOf(
+					arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+					arrow.Field{Name: "rdata", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+					arrow.Field{Name: "class", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
+				), Nullable: true}), Nullable: true, Metadata: meta},
+		}, nil)
+
+	arrsc, err := pqarrow.FromParquet(pqSchema, nil, metadata.KeyValueMetadata{})
+	assert.NoError(t, err)
+	fmt.Println(arrowSchema)
+	fmt.Println(arrsc)

Review Comment:
   Did you mean to leave these in?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org