You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/08/05 20:14:08 UTC

[GitHub] [arrow] wolfeidau commented on a diff in pull request #13806: ARROW-17276: [Go][Integratoin] Implement IPC handling for union type

wolfeidau commented on code in PR #13806:
URL: https://github.com/apache/arrow/pull/13806#discussion_r939185208


##########
go/arrow/ipc/file_reader.go:
##########
@@ -617,6 +626,45 @@ func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData
 	return array.NewData(dt, int(field.Length()), buffers, subs, int(field.NullCount()), 0)
 }
 
+func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData {
+	nBuffers := 2

Review Comment:
   Maybe a small comment on what 2 and 3 represent here.



##########
go/arrow/ipc/file_reader.go:
##########
@@ -381,6 +382,7 @@ func (src *ipcSource) buffer(i int) *memory.Buffer {
 	if !src.meta.Buffers(&buf, i) {
 		panic("buffer index out of bound")

Review Comment:
   Again prefix this with "arrow: "?



##########
go/arrow/ipc/file_reader.go:
##########
@@ -617,6 +626,45 @@ func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData
 	return array.NewData(dt, int(field.Length()), buffers, subs, int(field.NullCount()), 0)
 }
 
+func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData {
+	nBuffers := 2
+	if dt.Mode() == arrow.DenseMode {
+		nBuffers = 3
+	}
+
+	field, buffers := ctx.loadCommon(dt.ID(), nBuffers)
+	if field.NullCount() != 0 && buffers[0] != nil {
+		panic("cannot read pre-1.0.0 union array with top-level validity bitmap")

Review Comment:
   Prefix this with "arrow: "?



##########
go/arrow/ipc/writer.go:
##########
@@ -574,6 +577,100 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
 		}
 		w.depth++
 
+	case *arrow.SparseUnionType:
+		offset, length := arr.Data().Offset(), arr.Len()
+		arr := arr.(*array.SparseUnion)
+		typeCodes := getTruncatedBuffer(int64(offset), int64(length), int32(unsafe.Sizeof(arrow.UnionTypeCode(0))), arr.TypeCodes())
+		p.body = append(p.body, typeCodes)
+
+		w.depth--
+		for i := 0; i < arr.NumFields(); i++ {
+			err := w.visit(p, arr.Field(i))
+			if err != nil {
+				return fmt.Errorf("could not visit field %d of sparse union array: %w", i, err)
+			}
+		}
+		w.depth++
+	case *arrow.DenseUnionType:
+		offset, length := arr.Data().Offset(), arr.Len()
+		arr := arr.(*array.DenseUnion)
+		typeCodes := getTruncatedBuffer(int64(offset), int64(length), int32(unsafe.Sizeof(arrow.UnionTypeCode(0))), arr.TypeCodes())
+		p.body = append(p.body, typeCodes)
+
+		w.depth--
+		dt := arr.UnionType()
+
+		// union type codes are not necessarily 0-indexed
+		maxCode := dt.MaxTypeCode()
+
+		// allocate an array of child offsets. Set all to -1 to indicate we
+		// haven't observed a first occurrence of a particular child yet
+		offsets := make([]int32, maxCode+1)
+		lengths := make([]int32, maxCode+1)
+		offsets[0], lengths[0] = -1, 0
+		for i := 1; i < len(offsets); i *= 2 {
+			copy(offsets[i:], offsets[:i])
+			copy(lengths[i:], lengths[:i])
+		}
+
+		var valueOffsets *memory.Buffer
+		if offset != 0 {
+			// this case sucks. Because the offsets are different for each
+			// child array, when we have a sliced array, we need to re-base
+			// the value offsets for each array! ew.
+

Review Comment:
   From this comment it almost feels like you want to make a smaller function for this so it can be tested in isolation.
   
   Feels like it could be super hard to track down in this case statement.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org