You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/08/04 16:45:28 UTC

[GitHub] [arrow] wolfeidau commented on a diff in pull request #13768: ARROW-3678: [Go] Implement Union Arrays

wolfeidau commented on code in PR #13768:
URL: https://github.com/apache/arrow/pull/13768#discussion_r938025833


##########
go/arrow/datatype_nested.go:
##########
@@ -384,6 +388,286 @@ func (t *MapType) Layout() DataTypeLayout {
 
 func (MapType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
 
+type (
+	// UnionTypeCode is an alias to int8 which is the type of the ids
+	// used for union arrays.
+	UnionTypeCode = int8
+	UnionMode     int8
+)
+
+const (
+	MaxUnionTypeCode    UnionTypeCode = 127
+	InvalidUnionChildID int           = -1
+
+	SparseMode UnionMode = iota
+	DenseMode
+)
+
+// UnionType is an interface to encompass both Dense and Sparse Union types.
+//
+// A UnionType is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids (typed as UnionTypeCode)
+// indicates which child a given logical value is to be taken from. This is
+// represented as the "child id" or "child index", which is the index into the
+// list of child fields for a given child.
+type UnionType interface {
+	NestedType
+	// Mode returns either SparseMode or DenseMode depending on the current
+	// concrete data type.
+	Mode() UnionMode
+	// ChildIDs returns a slice of ints to map UnionTypeCode values to
+	// the index in the Fields that represents the given Type. It is
+	// initialized with all values being InvalidUnionChildID (-1)
+	// before being populated based on the TypeCodes and fields of the type.
+	// The field for a given type can be retrieved by Fields()[ChildIDs()[typeCode]]
+	ChildIDs() []int
+	// TypeCodes returns the list of available type codes for this union type
+	// which will correspond to indexes into the ChildIDs slice to locate the
+	// appropriate child. A union Array contains a buffer of these type codes
+	// which indicate for a given index, which child has the value for that index.
+	TypeCodes() []UnionTypeCode
+	// MaxTypeCode returns the value of the largest TypeCode in the list of typecodes
+	// that are defined by this Union type
+	MaxTypeCode() UnionTypeCode
+}
+
+// UnionOf returns an appropriate union type for the given Mode (Sparse or Dense),
+// child fields, and type codes. len(fields) == len(typeCodes) must be true, or else
+// this will panic. len(fields) can be 0.
+func UnionOf(mode UnionMode, fields []Field, typeCodes []UnionTypeCode) UnionType {
+	switch mode {
+	case SparseMode:
+		return SparseUnionOf(fields, typeCodes)
+	case DenseMode:
+		return DenseUnionOf(fields, typeCodes)
+	default:
+		panic("arrow: invalid union mode")
+	}
+}
+
+type unionType struct {
+	children  []Field
+	typeCodes []UnionTypeCode
+	childIDs  [int(MaxUnionTypeCode) + 1]int
+}
+
+func (t *unionType) init(fields []Field, typeCodes []UnionTypeCode) {
+	// initialize all child IDs to -1
+	t.childIDs[0] = InvalidUnionChildID
+	for i := 1; i < len(t.childIDs); i *= 2 {
+		copy(t.childIDs[i:], t.childIDs[:i])
+	}
+
+	t.children = fields
+	t.typeCodes = typeCodes
+
+	for i, tc := range t.typeCodes {
+		t.childIDs[tc] = i
+	}
+}
+
+func (t unionType) Fields() []Field            { return t.children }
+func (t unionType) TypeCodes() []UnionTypeCode { return t.typeCodes }
+func (t unionType) ChildIDs() []int            { return t.childIDs[:] }
+
+func (t *unionType) validate(fields []Field, typeCodes []UnionTypeCode, _ UnionMode) error {
+	if len(fields) != len(typeCodes) {
+		return errors.New("arrow: union types should have the same number of fields as type codes")
+	}
+
+	for _, c := range typeCodes {
+		if c < 0 || c > MaxUnionTypeCode {
+			return errors.New("arrow: union type code out of bounds")
+		}
+	}
+	return nil
+}
+
+func (t *unionType) MaxTypeCode() (max UnionTypeCode) {
+	if len(t.typeCodes) == 0 {
+		return
+	}
+
+	max = t.typeCodes[0]
+	for _, c := range t.typeCodes[1:] {
+		if c > max {
+			max = c
+		}
+	}
+	return
+}
+
+func (t *unionType) String() string {
+	var b strings.Builder
+	b.WriteByte('<')
+	for i := range t.typeCodes {
+		if i != 0 {
+			b.WriteString(", ")
+		}
+		fmt.Fprintf(&b, "%s=%d", t.children[i], t.typeCodes[i])
+	}
+	b.WriteByte('>')
+	return b.String()
+}
+
+func (t *unionType) fingerprint() string {
+	var b strings.Builder
+	for _, c := range t.typeCodes {
+		fmt.Fprintf(&b, ":%d", c)
+	}
+	b.WriteString("]{")
+	for _, c := range t.children {
+		fingerprint := c.Fingerprint()
+		if len(fingerprint) == 0 {
+			return ""
+		}
+		b.WriteString(fingerprint)
+		b.WriteByte(';')
+	}
+	b.WriteByte('}')
+	return b.String()
+}
+
+func fieldsFromArrays(arrays []Array, names ...string) (ret []Field) {
+	ret = make([]Field, len(arrays))
+	if len(names) == 0 {
+		for i, c := range arrays {
+			ret[i] = Field{Name: strconv.Itoa(i), Type: c.DataType(), Nullable: true}
+		}
+	} else {
+		debug.Assert(len(names) == len(arrays), "mismatch of arrays and names")
+		for i, c := range arrays {
+			ret[i] = Field{Name: names[i], Type: c.DataType(), Nullable: true}
+		}
+	}
+	return
+}
+
+// SparseUnionType is the concrete type for Sparse union data.
+//
+// A sparse union is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids indicates which child
+// a given logical value is to be taken from.
+//
+// In a sparse union, each child array will have the same length as the
+// union array itself, regardless of the actual number of union values which
+// refer to it.
+//
+// Unlike most other types, unions do not have a top-level validity bitmap.
+type SparseUnionType struct {
+	unionType
+}
+
+// SparseUnionFromArrays enables creating a union type from a list of Arrays,
+// field names, and type codes. len(fields) should be either 0 or equal to len(children).
+// len(codes) should also be either 0, or equal to len(children).
+//
+// If len(fields) == 0, then the fields will be named numerically as "0", "1", "2"...
+// and so on. If len(codes) == 0, then the type codes will be constructed as
+// [0, 1, 2, ..., n].
+func SparseUnionFromArrays(children []Array, fields []string, codes []UnionTypeCode) *SparseUnionType {
+	if len(codes) == 0 {
+		codes = make([]UnionTypeCode, len(children))
+		for i := range children {
+			codes[i] = UnionTypeCode(i)
+		}
+	}
+	return SparseUnionOf(fieldsFromArrays(children, fields...), codes)
+}
+
+// SparseUnionOf is equivalent to UnionOf(arrow.SparseMode, fields, typeCodes),
+// constructing a SparseUnionType from a list of fields and type codes.
+//
+// If len(fields) != len(typeCodes) this will panic. They are allowed to be
+// of length 0.
+func SparseUnionOf(fields []Field, typeCodes []UnionTypeCode) *SparseUnionType {
+	ret := &SparseUnionType{}

Review Comment:
   Again for consistency use new(SparseUnionType )



##########
go/arrow/datatype_nested.go:
##########
@@ -384,6 +388,286 @@ func (t *MapType) Layout() DataTypeLayout {
 
 func (MapType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
 
+type (
+	// UnionTypeCode is an alias to int8 which is the type of the ids
+	// used for union arrays.
+	UnionTypeCode = int8
+	UnionMode     int8
+)
+
+const (
+	MaxUnionTypeCode    UnionTypeCode = 127
+	InvalidUnionChildID int           = -1
+
+	SparseMode UnionMode = iota
+	DenseMode
+)
+
+// UnionType is an interface to encompass both Dense and Sparse Union types.
+//
+// A UnionType is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids (typed as UnionTypeCode)
+// indicates which child a given logical value is to be taken from. This is
+// represented as the "child id" or "child index", which is the index into the
+// list of child fields for a given child.
+type UnionType interface {
+	NestedType
+	// Mode returns either SparseMode or DenseMode depending on the current
+	// concrete data type.
+	Mode() UnionMode
+	// ChildIDs returns a slice of ints to map UnionTypeCode values to
+	// the index in the Fields that represents the given Type. It is
+	// initialized with all values being InvalidUnionChildID (-1)
+	// before being populated based on the TypeCodes and fields of the type.
+	// The field for a given type can be retrieved by Fields()[ChildIDs()[typeCode]]
+	ChildIDs() []int
+	// TypeCodes returns the list of available type codes for this union type
+	// which will correspond to indexes into the ChildIDs slice to locate the
+	// appropriate child. A union Array contains a buffer of these type codes
+	// which indicate for a given index, which child has the value for that index.
+	TypeCodes() []UnionTypeCode
+	// MaxTypeCode returns the value of the largest TypeCode in the list of typecodes
+	// that are defined by this Union type
+	MaxTypeCode() UnionTypeCode
+}
+
+// UnionOf returns an appropriate union type for the given Mode (Sparse or Dense),
+// child fields, and type codes. len(fields) == len(typeCodes) must be true, or else
+// this will panic. len(fields) can be 0.
+func UnionOf(mode UnionMode, fields []Field, typeCodes []UnionTypeCode) UnionType {
+	switch mode {
+	case SparseMode:
+		return SparseUnionOf(fields, typeCodes)
+	case DenseMode:
+		return DenseUnionOf(fields, typeCodes)
+	default:
+		panic("arrow: invalid union mode")
+	}
+}
+
+type unionType struct {
+	children  []Field
+	typeCodes []UnionTypeCode
+	childIDs  [int(MaxUnionTypeCode) + 1]int
+}
+
+func (t *unionType) init(fields []Field, typeCodes []UnionTypeCode) {
+	// initialize all child IDs to -1
+	t.childIDs[0] = InvalidUnionChildID
+	for i := 1; i < len(t.childIDs); i *= 2 {
+		copy(t.childIDs[i:], t.childIDs[:i])
+	}
+
+	t.children = fields
+	t.typeCodes = typeCodes
+
+	for i, tc := range t.typeCodes {
+		t.childIDs[tc] = i
+	}
+}
+
+func (t unionType) Fields() []Field            { return t.children }
+func (t unionType) TypeCodes() []UnionTypeCode { return t.typeCodes }
+func (t unionType) ChildIDs() []int            { return t.childIDs[:] }
+
+func (t *unionType) validate(fields []Field, typeCodes []UnionTypeCode, _ UnionMode) error {
+	if len(fields) != len(typeCodes) {
+		return errors.New("arrow: union types should have the same number of fields as type codes")
+	}
+
+	for _, c := range typeCodes {
+		if c < 0 || c > MaxUnionTypeCode {
+			return errors.New("arrow: union type code out of bounds")
+		}
+	}
+	return nil
+}
+
+func (t *unionType) MaxTypeCode() (max UnionTypeCode) {
+	if len(t.typeCodes) == 0 {
+		return
+	}
+
+	max = t.typeCodes[0]
+	for _, c := range t.typeCodes[1:] {
+		if c > max {
+			max = c
+		}
+	}
+	return
+}
+
+func (t *unionType) String() string {
+	var b strings.Builder
+	b.WriteByte('<')
+	for i := range t.typeCodes {
+		if i != 0 {
+			b.WriteString(", ")
+		}
+		fmt.Fprintf(&b, "%s=%d", t.children[i], t.typeCodes[i])
+	}
+	b.WriteByte('>')
+	return b.String()
+}
+
+func (t *unionType) fingerprint() string {
+	var b strings.Builder
+	for _, c := range t.typeCodes {
+		fmt.Fprintf(&b, ":%d", c)
+	}
+	b.WriteString("]{")
+	for _, c := range t.children {
+		fingerprint := c.Fingerprint()
+		if len(fingerprint) == 0 {
+			return ""
+		}
+		b.WriteString(fingerprint)
+		b.WriteByte(';')
+	}
+	b.WriteByte('}')
+	return b.String()
+}
+
+func fieldsFromArrays(arrays []Array, names ...string) (ret []Field) {
+	ret = make([]Field, len(arrays))
+	if len(names) == 0 {
+		for i, c := range arrays {
+			ret[i] = Field{Name: strconv.Itoa(i), Type: c.DataType(), Nullable: true}
+		}
+	} else {
+		debug.Assert(len(names) == len(arrays), "mismatch of arrays and names")
+		for i, c := range arrays {
+			ret[i] = Field{Name: names[i], Type: c.DataType(), Nullable: true}
+		}
+	}
+	return
+}
+
+// SparseUnionType is the concrete type for Sparse union data.
+//
+// A sparse union is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids indicates which child
+// a given logical value is to be taken from.
+//
+// In a sparse union, each child array will have the same length as the
+// union array itself, regardless of the actual number of union values which
+// refer to it.
+//
+// Unlike most other types, unions do not have a top-level validity bitmap.
+type SparseUnionType struct {
+	unionType
+}
+
+// SparseUnionFromArrays enables creating a union type from a list of Arrays,
+// field names, and type codes. len(fields) should be either 0 or equal to len(children).
+// len(codes) should also be either 0, or equal to len(children).
+//
+// If len(fields) == 0, then the fields will be named numerically as "0", "1", "2"...
+// and so on. If len(codes) == 0, then the type codes will be constructed as
+// [0, 1, 2, ..., n].
+func SparseUnionFromArrays(children []Array, fields []string, codes []UnionTypeCode) *SparseUnionType {
+	if len(codes) == 0 {
+		codes = make([]UnionTypeCode, len(children))
+		for i := range children {
+			codes[i] = UnionTypeCode(i)
+		}
+	}
+	return SparseUnionOf(fieldsFromArrays(children, fields...), codes)
+}
+
+// SparseUnionOf is equivalent to UnionOf(arrow.SparseMode, fields, typeCodes),
+// constructing a SparseUnionType from a list of fields and type codes.
+//
+// If len(fields) != len(typeCodes) this will panic. They are allowed to be
+// of length 0.
+func SparseUnionOf(fields []Field, typeCodes []UnionTypeCode) *SparseUnionType {
+	ret := &SparseUnionType{}
+	if err := ret.validate(fields, typeCodes, ret.Mode()); err != nil {
+		panic(err)
+	}
+	ret.init(fields, typeCodes)
+	return ret
+}
+
+func (SparseUnionType) ID() Type        { return SPARSE_UNION }
+func (SparseUnionType) Name() string    { return "sparse_union" }
+func (SparseUnionType) Mode() UnionMode { return SparseMode }
+func (t *SparseUnionType) Fingerprint() string {
+	return typeFingerprint(t) + "[s" + t.fingerprint()
+}
+func (SparseUnionType) Layout() DataTypeLayout {
+	return DataTypeLayout{Buffers: []BufferSpec{SpecAlwaysNull(), SpecFixedWidth(Uint8SizeBytes)}}
+}
+func (t *SparseUnionType) String() string {
+	return t.Name() + t.unionType.String()
+}
+
+// DenseUnionType is the concrete type for dense union data.
+//
+// A dense union is a nested type where each logical value is taken from a
+// single child, at a specific offset. A buffer of 8-bit type ids (typed
+// as UnionTypeCode) indicates which child a given logical value is to be
+// taken from and a buffer of 32-bit offsets indicating which physical position
+// in the given child array has the logical value for that index.
+//
+// Unlike a sparse union, a dense union allows encoding only the child values
+// which are actually referred to by the union array. This is counterbalanced
+// by the additional footprint of the offsets buffer, and the additional
+// indirection cost when looking up values.
+//
+// Unlike most other types, unions don't have a top-level validity bitmap
+type DenseUnionType struct {
+	unionType
+}
+
+// DenseUnionFromArrays enables creating a union type from a list of Arrays,
+// field names, and type codes. len(fields) should be either 0 or equal to len(children).
+// len(codes) should also be either 0, or equal to len(children).
+//
+// If len(fields) == 0, then the fields will be named numerically as "0", "1", "2"...
+// and so on. If len(codes) == 0, then the type codes will be constructed as
+// [0, 1, 2, ..., n].
+func DenseUnionFromArrays(children []Array, fields []string, codes []UnionTypeCode) *DenseUnionType {
+	if len(codes) == 0 {
+		codes = make([]UnionTypeCode, len(children))
+		for i := range children {
+			codes[i] = UnionTypeCode(i)
+		}
+	}
+	return DenseUnionOf(fieldsFromArrays(children, fields...), codes)
+}
+
+// DenseUnionOf is equivalent to UnionOf(arrow.DenseMode, fields, typeCodes),
+// constructing a SparseUnionType from a list of fields and type codes.
+//
+// If len(fields) != len(typeCodes) this will panic. They are allowed to be
+// of length 0.
+func DenseUnionOf(fields []Field, typeCodes []UnionTypeCode) *DenseUnionType {
+	ret := &DenseUnionType{}

Review Comment:
   As above for consistency use new(SparseUnionType )



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org