You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "lquerel (via GitHub)" <gi...@apache.org> on 2023/06/20 22:32:50 UTC

[GitHub] [arrow] lquerel commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

lquerel commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1235818429


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {

Review Comment:
   Since there is no compiler guarantee in Go that the slice returned by this method will remain unchanged (whether intentionally or unintentionally) by users of this API, I would suggest adding a comment to this method. This comment should clearly specify that it's unsafe to make any kind of changes to the returned slice.



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   It would likely help users to have a comment describing the differences between this method and `ValueString`.



##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {

Review Comment:
   It would likely help users to have a comment describing the differences between this method and `AppendString`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org