You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "zeroshade (via GitHub)" <gi...@apache.org> on 2023/05/25 17:58:48 UTC

[GitHub] [arrow] zeroshade opened a new pull request, #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

zeroshade opened a new pull request, #35769:
URL: https://github.com/apache/arrow/pull/35769

   <!--
   Thanks for opening a pull request!
   If this is your first pull request you can find detailed information on how 
   to contribute here:
     * [New Contributor's Guide](https://arrow.apache.org/docs/dev/developers/guide/step_by_step/pr_lifecycle.html#reviews-and-merge-of-the-pull-request)
     * [Contributing Overview](https://arrow.apache.org/docs/dev/developers/overview.html)
   
   
   If this is not a [minor PR](https://github.com/apache/arrow/blob/main/CONTRIBUTING.md#Minor-Fixes). Could you open an issue for this pull request on GitHub? https://github.com/apache/arrow/issues/new/choose
   
   Opening GitHub issues ahead of time contributes to the [Openness](http://theapacheway.com/open/#:~:text=Openness%20allows%20new%20users%20the,must%20happen%20in%20the%20open.) of the Apache Arrow project.
   
   Then could you also rename the pull request title in the following format?
   
       GH-${GITHUB_ISSUE_ID}: [${COMPONENT}] ${SUMMARY}
   
   or
   
       MINOR: [${COMPONENT}] ${SUMMARY}
   
   In the case of PARQUET issues on JIRA the title also supports:
   
       PARQUET-${JIRA_ISSUE_ID}: [${COMPONENT}] ${SUMMARY}
   
   -->
   
   ### Rationale for this change
   See #35628 for the rationale and description of the StringView/BinaryView array types.
   
   This change is adding Go as a second implementation of it.
   
   <!--
    Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes.  
   -->
   
   ### What changes are included in this PR?
   
   Add Array Types for `StringView` and `BinaryView` along with `StringViewType` and `BinaryViewType` and necessary enums and builders. These arrays can be round tripped through JSON and IPC.
   
   <!--
   There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR.
   -->
   
   ### Are these changes tested?
   Yes, unit tests have been added and integration tests run (currently only Go vs Go but will run against C++ when #35628 is merged)
   
   <!--
   We typically require tests for all PRs in order to:
   1. Prevent the code from being accidentally broken by subsequent changes
   2. Serve as another way to document the expected behavior of the code
   
   If tests are not included in your PR, please explain why (for example, are they covered by existing tests)?
   -->
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238624571


##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) VariadicCounts(j int) int64 {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.GetInt64(a + flatbuffers.UOffsetT(j*8))
+	}
+	return 0

Review Comment:
   I'm not going to modify the generated code. Everything in this folder is generated by `flatc` from the `.fbs` files.



##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) VariadicCounts(j int) int64 {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.GetInt64(a + flatbuffers.UOffsetT(j*8))
+	}
+	return 0
+}
+
+func (rcv *RecordBatch) VariadicCountsLength() int {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		return rcv._tab.VectorLen(o)
+	}
+	return 0
+}
+
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) MutateVariadicCounts(j int, n int64) bool {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.MutateInt64(a+flatbuffers.UOffsetT(j*8), n)
+	}
+	return false

Review Comment:
   I'm not going to modify the generated code. Everything in this folder is generated by flatc from the .fbs files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376458006


##########
go/arrow/ipc/file_reader.go:
##########
@@ -476,6 +487,9 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData {
 	case *arrow.BinaryType, *arrow.StringType, *arrow.LargeStringType, *arrow.LargeBinaryType:
 		return ctx.loadBinary(dt)
 
+	case arrow.BinaryViewDataType:

Review Comment:
   `BinaryViewDataType` is an interface, not a concrete type. `StringViewDataType` implements the `BinaryViewDataType` interface. The concrete types are `BinaryViewType` and `StringViewType`. The types that end with `DataType` in the name are interfaces (like `FixedWidthDataType`). So we don't need a separate case for `StringViewType` because they are processed identically



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376288902


##########
go/arrow/ipc/metadata.go:
##########
@@ -1186,18 +1202,25 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size,
 	return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength)
 }
 
-func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT {
+func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT {
 	fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector)
 	metaFB := writeBuffers(b, meta, flatbuf.RecordBatchStartBuffersVector)
 	var bodyCompressFB flatbuffers.UOffsetT
 	if codec != -1 {
 		bodyCompressFB = writeBodyCompression(b, codec)
 	}
 
+	flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts))
+	for i := len(variadicCounts) - 1; i >= 0; i-- {
+		b.PrependInt64(variadicCounts[i])
+	}
+	vcFB := b.EndVector(len(variadicCounts))
+

Review Comment:
   I think it probably would be preferable to leave the variadic buffer counts absent rather than empty if none are necessary



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1391457998


##########
go/arrow/array/binary.go:
##########
@@ -24,6 +24,7 @@ import (
 	"unsafe"
 
 	"github.com/apache/arrow/go/v15/arrow"
+  "github.com/apache/arrow/go/v15/arrow/memory"

Review Comment:
   formatting issue?



##########
go/arrow/array/bufferbuilder.go:
##########
@@ -18,7 +18,9 @@ package array
 
 import (
 	"sync/atomic"
-
+	"unsafe"
+  
+  "github.com/apache/arrow/go/v15/arrow"

Review Comment:
   format?



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.ViewHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.ViewHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+int32(s.Len())]
+}
+
+// ValueString returns the value at index i as a string instead of
+// a byte slice, without copying the underlying data.
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+// ValueStr is paired with AppendValueFromString in that it returns
+// the value at index i as a string: Semantically this means that for
+// a null value it will return the string "(null)", otherwise it will
+// return the value as a base64 encoded string suitable for CSV/JSON.
+//
+// This is always going to be less performant than just using ValueString
+// and exists to fulfill the Array interface to provide a method which
+// can produce a human readable string for a given index.
+func (a *BinaryView) ValueStr(i int) string {
+	if a.IsNull(i) {
+		return NullValueStr
+	}
+	return base64.StdEncoding.EncodeToString(a.Value(i))
+}
+
+func (a *BinaryView) GetOneForMarshal(i int) interface{} {
+	if a.IsNull(i) {
+		return nil
+	}
+	return a.Value(i)
+}
+
+func (a *BinaryView) MarshalJSON() ([]byte, error) {

Review Comment:
   please add tests for `MarshalJSON` <-> `Unmarshal`



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v15/arrow/endian"
+	"github.com/apache/arrow/go/v15/arrow/internal/debug"
+	"github.com/apache/arrow/go/v15/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12

Review Comment:
   ```suggestion
   	ViewPrefixLen  = 4
   	viewInlineSize = 12
   ```



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.ViewHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.ViewHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+int32(s.Len())]
+}
+
+// ValueString returns the value at index i as a string instead of
+// a byte slice, without copying the underlying data.
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+// ValueStr is paired with AppendValueFromString in that it returns
+// the value at index i as a string: Semantically this means that for
+// a null value it will return the string "(null)", otherwise it will
+// return the value as a base64 encoded string suitable for CSV/JSON.
+//
+// This is always going to be less performant than just using ValueString
+// and exists to fulfill the Array interface to provide a method which
+// can produce a human readable string for a given index.
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   please add tests for `ValueStr` <-> `AppendValueFromString`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1283616566


##########
dev/archery/archery/integration/datagen.py:
##########
@@ -743,6 +763,72 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
     pass
 
 
+class BinaryViewColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return frombytes(binascii.hexlify(x).upper())
+
+    def _get_buffers(self):
+        char_buffers = []
+        DEFAULT_BUFFER_SIZE = 32  # ¯\_(ツ)_/¯
+        INLINE_SIZE = 12
+
+        data = []
+        for i, v in enumerate(self.values):
+            if not self.is_valid[i]:
+                v = b''
+            assert isinstance(v, bytes)
+
+            if len(v) > INLINE_SIZE:
+                offset = 0
+                if len(v) > DEFAULT_BUFFER_SIZE:
+                    char_buffers.append(v)
+                else:
+                    if len(char_buffers) == 0:
+                        char_buffers.append(v)
+                    elif len(char_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
+                        char_buffers.append(v)
+                    else:
+                        offset = len(char_buffers[-1])
+                        char_buffers[-1] += v
+                        assert len(char_buffers[-1]) <= DEFAULT_BUFFER_SIZE

Review Comment:
   This was intended to be didactic/a sanity check. It could probably be replaced with comments
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1224446874


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {

Review Comment:
   You're correct that the alignment only really matters if Go is communicating with C++ through shared memory or the C-ABI, so getting it up to 8 is currently speculative generality at this point. That said, adding string view to the C-ABI is a pretty natural follow up to this work so I think it'd be worthwhile to be prepared. I won't insist for now



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238725740


##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendNulls(n int) {
+	b.Reserve(n)
+	for i := 0; i < n; i++ {
+		b.UnsafeAppendBoolToBitmap(false)
+	}
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValues(n int) {
+	b.Reserve(n)
+	b.unsafeAppendBoolsToBitmap(nil, n)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+// AppendValueFromString is paired with ValueStr for fulfilling the
+// base Builder interface. This is intended to read in a human-readable
+// string such as from CSV or JSON and append it to the array.
+//
+// For Binary values are expected to be base64 encoded (and will be
+// decoded as such before being appended).
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
+	if s == NullValueStr {
+		b.AppendNull()
+		return nil
+	}
+
+	if b.dtype.IsUtf8() {
+		b.Append([]byte(s))
+		return nil
+	}
+
+	decodedVal, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return fmt.Errorf("could not decode base64 string: %w", err)
+	}
+	b.Append(decodedVal)
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error {
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	switch v := t.(type) {
+	case string:
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		b.Append(data)
+	case []byte:
+		b.Append(v)
+	case nil:
+		b.AppendNull()
+	default:
+		return &json.UnmarshalTypeError{
+			Value:  fmt.Sprint(t),
+			Type:   reflect.TypeOf([]byte{}),
+			Offset: dec.InputOffset(),
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error {
+	for dec.More() {
+		if err := b.UnmarshalOne(dec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error {
+	dec := json.NewDecoder(bytes.NewReader(data))
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	if delim, ok := t.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("binary view builder must unpack from json array, found %s", delim)
+	}
+
+	return b.Unmarshal(dec)
+}
+
+func (b *BinaryViewBuilder) newData() (data *Data) {
+	bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length)
+	if bytesRequired > 0 && bytesRequired < b.data.Len() {
+		// trim buffers
+		b.data.Resize(bytesRequired)
+	}
+
+	dataBuffers := b.blockBuilder.Finish()
+	data = NewData(b.dtype, b.length, append([]*memory.Buffer{
+		b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0)
+	b.reset()
+
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil

Review Comment:
   That is handled by the call to `b.reset()` above this



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1391688480


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.ViewHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.ViewHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+int32(s.Len())]
+}
+
+// ValueString returns the value at index i as a string instead of
+// a byte slice, without copying the underlying data.
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+// ValueStr is paired with AppendValueFromString in that it returns
+// the value at index i as a string: Semantically this means that for
+// a null value it will return the string "(null)", otherwise it will
+// return the value as a base64 encoded string suitable for CSV/JSON.
+//
+// This is always going to be less performant than just using ValueString
+// and exists to fulfill the Array interface to provide a method which
+// can produce a human readable string for a given index.
+func (a *BinaryView) ValueStr(i int) string {
+	if a.IsNull(i) {
+		return NullValueStr
+	}
+	return base64.StdEncoding.EncodeToString(a.Value(i))
+}
+
+func (a *BinaryView) GetOneForMarshal(i int) interface{} {
+	if a.IsNull(i) {
+		return nil
+	}
+	return a.Value(i)
+}
+
+func (a *BinaryView) MarshalJSON() ([]byte, error) {

Review Comment:
   The tests in `util_test.go` for `TestArrRecordsJSONRoundTrip` cover this as we have String/BinaryView records in the test data which get run as round trip using `RecordFromJSON` and `json.Encode`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1224415868


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {

Review Comment:
   That's what I was thinking and is why i didn't do that in the first place. I mean at that rate why not just make the entire thing `[16]byte` right? lol



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238745855


##########
go/arrow/ipc/file_reader.go:
##########
@@ -430,13 +430,18 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode {
 	return &node
 }
 
+func (src *ipcSource) variadicCount(i int) int64 {
+	return src.meta.VariadicCounts(i)
+}
+
 type arrayLoaderContext struct {
-	src     ipcSource
-	ifield  int
-	ibuffer int
-	max     int
-	memo    *dictutils.Memo
-	version MetadataVersion
+	src       ipcSource
+	ifield    int
+	ibuffer   int
+	ivariadic int

Review Comment:
   since they are only used in the `arrayLoaderContext` i'm not sure what the benefit would be to doing so? 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lquerel commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "lquerel (via GitHub)" <gi...@apache.org>.
lquerel commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1235818429


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {

Review Comment:
   Since there is no compiler guarantee in Go that the slice returned by this method will remain unchanged (whether intentionally or unintentionally) by users of this API, I would suggest adding a comment to this method. This comment should clearly specify that it's unsafe to make any kind of changes to the returned slice.



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   It would likely help users to have a comment describing the differences between this method and `ValueString`.



##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {

Review Comment:
   It would likely help users to have a comment describing the differences between this method and `AppendString`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237270576


##########
go/arrow/type_traits_string_view.go:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+)
+
+var StringHeaderTraits stringHeaderTraits

Review Comment:
   They use the same internal struct which is called `StringHeader`. The only difference between them is semantics (just like String/Binary arrays which are physically structured identically and it's only a semantic difference where Strings are expected to be entirely valid UTF8 and Binary can be any binary data.)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1374605138


##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,330 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB

Review Comment:
   FWIW, the default in Velox (which I've adopted in the c++ impl) is 32KB
   ```suggestion
   	dfltBlockSize             = 32 << 10 // 32 KB
   ```



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= uint32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() uint32 {

Review Comment:
   Buffer index and offset were both changed to signed 32 bit integers in https://lists.apache.org/thread/gl2hf6v0ct5xqpjf832vhmdgkqyxghlg
   ```suggestion
   func (sh *ViewHeader) BufferIndex() int {
   ```



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size uint32

Review Comment:
   `size` is signed too now
   ```suggestion
   	size int
   ```



##########
go/arrow/datatype_viewheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+)
+
+func (sh *ViewHeader) InlineData() (data string) {

Review Comment:
   The utility of these over `InlineBytes` isn't clear to me



##########
go/arrow/array/bufferbuilder.go:
##########
@@ -151,3 +153,112 @@ func (b *bufferBuilder) unsafeAppend(data []byte) {
 	copy(b.bytes[b.length:], data)
 	b.length += len(data)
 }
+
+type multiBufferBuilder struct {
+	refCount  int64
+	blockSize int

Review Comment:
   This should be settable so that users (and tests) can configure their own block size
   ```suggestion
   	BlockSize int
   ```



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,330 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.ViewHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) != 0 {
+		return
+	}
+
+	if b.nullBitmap != nil {
+		b.nullBitmap.Release()
+		b.nullBitmap = nil
+	}
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+		return
+	}
+
+	b.builder.resize(nbuild, b.init)
+	b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsViewInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary

Review Comment:
   ```suggestion
   // appended unmodified. AppendValueFromString expects base64 encoded binary
   ```



##########
go/arrow/internal/testing/gen/random_array_gen.go:
##########
@@ -351,6 +351,40 @@ func (r *RandomArrayGenerator) LargeString(size int64, minLength, maxLength int6
 	return bldr.NewArray()
 }
 
+func (r *RandomArrayGenerator) StringView(size int64, minLength, maxLength int64, nullProb float64) arrow.Array {

Review Comment:
   specifying max buffer size might also be useful here



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= uint32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen:])
+}
+
+func (sh *ViewHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen+4:])
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   At the risk of offering an opinion well out of my expertise, would it be preferable to return nil here instead?
   ```suggestion
     if !sh.IsInline() {
       return nil
     }
   ```



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,330 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.ViewHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) != 0 {
+		return
+	}
+
+	if b.nullBitmap != nil {
+		b.nullBitmap.Release()
+		b.nullBitmap = nil
+	}
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+		return
+	}
+
+	b.builder.resize(nbuild, b.init)
+	b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",

Review Comment:
   After adopting signed sizes, this limit is slightly reduced
   ```suggestion
   		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB",
   ```



##########
go/arrow/internal/arrdata/arrdata.go:
##########
@@ -1155,6 +1156,65 @@ func makeRunEndEncodedRecords() []arrow.Record {
 	return recs
 }
 
+func makeStringViewRecords() []arrow.Record {

Review Comment:
   It might be worthwhile to ensure at least one chunk has multiple data buffers



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236488243


##########
format/Schema.fbs:
##########
@@ -171,6 +172,16 @@ table LargeUtf8 {
 table LargeBinary {
 }
 
+/// Same as Utf8, but string characters are delimited with a packed
+/// length/pointer instead of offsets.
+table Utf8View {
+}
+
+/// Same as Binary, but string characters are delimited with a packed
+/// length/pointeBinary of offsets.

Review Comment:
   This needs rephrasing



##########
go/arrow/datatype.go:
##########
@@ -152,6 +152,13 @@ const (
 
 	RUN_END_ENCODED
 
+	// String (UTF8) view type with 4-byte prefix and inline
+	// small string optimizations
+	STRING_VIEW
+
+	// Bytes view with 4-byte prefix and inline small string optimization

Review Comment:
   ```suggestion
   	// Bytes view with 4-byte prefix and inline small byte arrays optimization
   ```



##########
dev/archery/archery/integration/datagen.py:
##########
@@ -743,6 +763,72 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
     pass
 
 
+class BinaryViewColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return frombytes(binascii.hexlify(x).upper())
+
+    def _get_buffers(self):
+        char_buffers = []
+        DEFAULT_BUFFER_SIZE = 32  # ¯\_(ツ)_/¯
+        INLINE_SIZE = 12
+
+        data = []
+        for i, v in enumerate(self.values):
+            if not self.is_valid[i]:
+                v = b''
+            assert isinstance(v, bytes)
+
+            if len(v) > INLINE_SIZE:
+                offset = 0
+                if len(v) > DEFAULT_BUFFER_SIZE:
+                    char_buffers.append(v)
+                else:
+                    if len(char_buffers) == 0:
+                        char_buffers.append(v)
+                    elif len(char_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
+                        char_buffers.append(v)
+                    else:
+                        offset = len(char_buffers[-1])
+                        char_buffers[-1] += v
+                        assert len(char_buffers[-1]) <= DEFAULT_BUFFER_SIZE

Review Comment:
   isn't it already checked above for the  case of `len(char_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE`?



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}
+func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*BinaryViewType) Layout() DataTypeLayout {
+	variadic := SpecVariableWidth()
+	return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
+		SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic}
+}
+
+type StringViewType struct{}

Review Comment:
   please add compile-time assertions for `BinaryViewType` conformation



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}
+func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*BinaryViewType) Layout() DataTypeLayout {
+	variadic := SpecVariableWidth()
+	return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
+		SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic}

Review Comment:
   ```suggestion
   	return DataTypeLayout{
   		Buffers:      []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)},
   		VariadicSpec: &variadic,
   	}
   ```



##########
go/arrow/internal/flatbuf/BinaryView.go:
##########
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   why are those in pascal case instead of snake case names?



##########
go/arrow/type_traits_string_view.go:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+)
+
+var StringHeaderTraits stringHeaderTraits

Review Comment:
   should there be one for binary view as well?



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   `ValueStr` is paired with `AppendValueFromString` and has a contract of being able to reconstruct the same data with `b.AppendValueFromString(a.ValueStr(i))`



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1

Review Comment:
   ```suggestion
   	a := &BinaryView{refCount: 1}
   ```



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}
+func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*BinaryViewType) Layout() DataTypeLayout {
+	variadic := SpecVariableWidth()
+	return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
+		SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic}
+}
+
+type StringViewType struct{}
+
+func (*StringViewType) ID() Type              { return STRING_VIEW }
+func (*StringViewType) Name() string          { return "string_view" }
+func (*StringViewType) String() string        { return "string_view" }
+func (*StringViewType) IsUtf8() bool          { return true }
+func (*StringViewType) binary()               {}
+func (*StringViewType) view()                 {}
+func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*StringViewType) Layout() DataTypeLayout {
+	variadic := SpecVariableWidth()
+	return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
+		SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic}

Review Comment:
   ```suggestion
   	return DataTypeLayout{
   		Buffers:      []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)},
   		VariadicSpec: &variadic,
   	}
   ```



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}

Review Comment:
   please add compile-time assertions for `BinaryViewType` conformation



##########
go/arrow/datatype_stringheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+)
+
+func (sh *StringHeader) InlineData() (data string) {
+	debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader")
+
+	h := (*reflect.StringHeader)(unsafe.Pointer(&data))

Review Comment:
   any diff with go1.19 implementation?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237262436


##########
go/arrow/datatype_stringheader.go:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+	"github.com/apache/arrow/go/v13/arrow/memory"

Review Comment:
   Yea, the version bumping scripts already do find and replace for all the files. Ideally we'd be able to update our release process to do more minor releases and fewer major releases (leaving major releases *only* for breaking changes) but that's an entirely different conversation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238623711


##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.

Review Comment:
   You'd have to ask the maintainers of `flatc` lol



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238625042


##########
go/arrow/internal/testing/gen/random_array_gen.go:
##########
@@ -350,6 +350,40 @@ func (r *RandomArrayGenerator) LargeString(size int64, minLength, maxLength int6
 	return bldr.NewArray()
 }
 
+func (r *RandomArrayGenerator) StringView(size int64, minLength, maxLength int64, nullProb float64) arrow.Array {
+	return r.generateBinaryView(arrow.BinaryTypes.StringView, size, minLength, maxLength, nullProb)
+}
+
+func (r *RandomArrayGenerator) generateBinaryView(dt arrow.DataType, size int64, minLength, maxLength int64, nullProb float64) arrow.Array {
+	lengths := r.Int32(size, int32(minLength), int32(maxLength), nullProb).(*array.Int32)
+	defer lengths.Release()
+
+	bldr := array.NewBuilder(r.mem, dt).(array.StringLikeBuilder)
+	defer bldr.Release()
+
+	r.extra++
+	dist := rand.New(rand.NewSource(r.seed + r.extra))
+
+	buf := make([]byte, 0, maxLength)
+	gen := func(n int32) string {
+		out := buf[:n]
+		for i := range out {
+			out[i] = uint8(dist.Int31n(int32('z')-int32('A')+1) + int32('A'))
+		}
+		return string(out)
+	}
+
+	for i := 0; i < lengths.Len(); i++ {
+		if lengths.IsValid(i) {
+			bldr.Append(gen(lengths.Value(i)))
+		} else {
+			bldr.AppendNull()
+		}

Review Comment:
   I'm not going to modify the generated code. Everything in this folder is generated by flatc from the .fbs files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1205985596


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {

Review Comment:
   I'm not familiar enough with Go to know whether you can attach alignment requirements to a struct. Reading this it seems that it'd be only aligned to a 4 byte boundary, which would cause problems for the c++ impl where we expect them to be aligned to 8 bytes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376452131


##########
go/arrow/ipc/metadata.go:
##########
@@ -1186,18 +1202,25 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size,
 	return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength)
 }
 
-func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT {
+func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT {
 	fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector)
 	metaFB := writeBuffers(b, meta, flatbuf.RecordBatchStartBuffersVector)
 	var bodyCompressFB flatbuffers.UOffsetT
 	if codec != -1 {
 		bodyCompressFB = writeBodyCompression(b, codec)
 	}
 
+	flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts))
+	for i := len(variadicCounts) - 1; i >= 0; i-- {
+		b.PrependInt64(variadicCounts[i])
+	}
+	vcFB := b.EndVector(len(variadicCounts))
+

Review Comment:
   good point, i'll make that update



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-38718: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "conbench-apache-arrow[bot] (via GitHub)" <gi...@apache.org>.
conbench-apache-arrow[bot] commented on PR #35769:
URL: https://github.com/apache/arrow/pull/35769#issuecomment-1811509118

   After merging your PR, Conbench analyzed the 3 benchmarking runs that have been run so far on merge-commit 26149d9fab0360e6d4d9a295f934100470c4bc37.
   
   There were no benchmark performance regressions. 🎉
   
   The [full Conbench report](https://github.com/apache/arrow/runs/18685204416) has more details.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238003117


##########
go/arrow/datatype_stringheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+)
+
+func (sh *StringHeader) InlineData() (data string) {
+	debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader")
+
+	h := (*reflect.StringHeader)(unsafe.Pointer(&data))

Review Comment:
   how sad, but now I see why it's warranted.
   However, a comment to note the diff for those implementations would be nice to have (not required, but nice to have nonetheless).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1239824231


##########
go/arrow/ipc/file_reader.go:
##########
@@ -430,13 +430,18 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode {
 	return &node
 }
 
+func (src *ipcSource) variadicCount(i int) int64 {
+	return src.meta.VariadicCounts(i)
+}
+
 type arrayLoaderContext struct {
-	src     ipcSource
-	ifield  int
-	ibuffer int
-	max     int
-	memo    *dictutils.Memo
-	version MetadataVersion
+	src       ipcSource
+	ifield    int
+	ibuffer   int
+	ivariadic int

Review Comment:
   a nit, so probably none



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237268730


##########
go/arrow/internal/flatbuf/BinaryView.go:
##########
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   this is auto generated via calling `flatc`, so the pascal case is what the Flatbuffers compiler is generating.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238202529


##########
go/arrow/ipc/writer.go:
##########
@@ -605,6 +606,33 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
 		p.body = append(p.body, voffsets)
 		p.body = append(p.body, values)
 
+	case arrow.BinaryViewDataType:
+		data := arr.Data()
+		values := data.Buffers()[1]
+		arrLen := int64(arr.Len())
+		typeWidth := int64(arrow.StringHeaderSizeBytes)
+		minLength := paddedLength(arrLen*typeWidth, kArrowAlignment)
+
+		switch {
+		case needTruncate(int64(data.Offset()), values, minLength):
+			// non-zero offset: slice the buffer
+			offset := int64(data.Offset()) * typeWidth
+			// send padding if available
+			len := minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len())-offset)
+			values = memory.NewBufferBytes(values.Bytes()[offset : offset+len])

Review Comment:
   shouldn't `memory.SliceBuffer` be used instead (for `value.Retain()` underneath?



##########
go/arrow/ipc/writer.go:
##########
@@ -605,6 +606,33 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
 		p.body = append(p.body, voffsets)
 		p.body = append(p.body, values)
 
+	case arrow.BinaryViewDataType:
+		data := arr.Data()
+		values := data.Buffers()[1]
+		arrLen := int64(arr.Len())
+		typeWidth := int64(arrow.StringHeaderSizeBytes)
+		minLength := paddedLength(arrLen*typeWidth, kArrowAlignment)
+
+		switch {
+		case needTruncate(int64(data.Offset()), values, minLength):
+			// non-zero offset: slice the buffer
+			offset := int64(data.Offset()) * typeWidth
+			// send padding if available
+			len := minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len())-offset)
+			values = memory.NewBufferBytes(values.Bytes()[offset : offset+len])

Review Comment:
   shouldn't `memory.SliceBuffer` be used instead (for `value.Retain()` underneath)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1205999970


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {

Review Comment:
   The only way to modify the alignment requirements of a struct in go is to modify the types of its fields. It is going to pick the alignment for the struct based on the largest alignment needed for a given member. In this case using `unsafe.Alignof(arrow.StringHeader{})` reports an alignment of 4 bytes. That said, as long as the size of 16 bytes for the struct is maintained, I don't think there'll be an issue? In addition I believe we explicitly handle the alignment for IPC input/output.
   
   I guess there could be an issue with the C-Data interface if the buffer ends up not 8-byte aligned? but then again, I don't think you added a proposal for the new format string to indicate this across the C-Data interface yet :smile:



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1207042419


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringHeaderInlineSize]byte
+}
+
+func (sh *StringHeader) IsInline() bool {
+	return sh.size <= uint32(stringHeaderInlineSize)
+}
+
+func (sh *StringHeader) Len() int { return int(sh.size) }
+func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *StringHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:])
+}
+
+func (sh *StringHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:])
+}
+
+func (sh *StringHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   fair point, i'll add those



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1375768862


##########
go/arrow/type_traits_string_view.go:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+)
+
+var StringHeaderTraits stringHeaderTraits

Review Comment:
   Here we see usage of `ViewHeader` for `StringHeaderSizeBytes`, so I suggest renaming this to `ViewHeaderSizeBytes` & rename file as well.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "github-actions[bot] (via GitHub)" <gi...@apache.org>.
github-actions[bot] commented on PR #35769:
URL: https://github.com/apache/arrow/pull/35769#issuecomment-1563297077

   * Closes: #35627


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] felipecrv commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236179917


##########
go/arrow/datatype_stringheader.go:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+	"github.com/apache/arrow/go/v13/arrow/memory"

Review Comment:
   v13 here needs to be updated on every release?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237592557


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}
+func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*BinaryViewType) Layout() DataTypeLayout {
+	variadic := SpecVariableWidth()
+	return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
+		SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic}
+}
+
+type StringViewType struct{}

Review Comment:
   done



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +83,50 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+type BinaryViewType struct{}

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on PR #35769:
URL: https://github.com/apache/arrow/pull/35769#issuecomment-1593730753

   CC @felipecrv @benibus


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376449715


##########
go/arrow/type_traits_string_view.go:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+)
+
+var StringHeaderTraits stringHeaderTraits

Review Comment:
   bah i missed a spot when doing the renaming! Thanks for catching this!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1375778085


##########
go/arrow/ipc/metadata.go:
##########
@@ -1186,18 +1202,25 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size,
 	return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength)
 }
 
-func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT {
+func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT {
 	fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector)
 	metaFB := writeBuffers(b, meta, flatbuf.RecordBatchStartBuffersVector)
 	var bodyCompressFB flatbuffers.UOffsetT
 	if codec != -1 {
 		bodyCompressFB = writeBodyCompression(b, codec)
 	}
 
+	flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts))
+	for i := len(variadicCounts) - 1; i >= 0; i-- {
+		b.PrependInt64(variadicCounts[i])
+	}
+	vcFB := b.EndVector(len(variadicCounts))
+

Review Comment:
   Should this be performed conditionally on `len(variadicCounts)`?
   Like so:
   ```suggestion
   	var vcFB *flatbuffers.UOffsetT
   	if len(variadicCounts) > 0 {
   		flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts))
   		for i := len(variadicCounts) - 1; i >= 0; i-- {
   			b.PrependInt64(variadicCounts[i])
   		}
   		vcFBVal := b.EndVector(len(variadicCounts))
   		vcFB = &vcFBVal
   	}
   ```
   
   & then we perform `flatbuf.RecordBatchAddVariadicBufferCounts` conditionally:
   ```go
   if vcFB != nil {
   	flatbuf.RecordBatchAddVariadicBufferCounts(b, *vcFB)
   }
   ```



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -818,6 +826,7 @@ type Array struct {
 	Offset   interface{}           `json:"OFFSET,omitempty"`
 	Size     interface{}           `json:"SIZE,omitempty"`
 	Children []Array               `json:"children,omitempty"`
+	Variadic []string              `json:"VARIADIC_BUFFERS,omitempty"`

Review Comment:
   Why are these JSON tags in different cases? `VARIADIC_BUFFERS` vs `children`
   



##########
go/arrow/array/string_test.go:
##########
@@ -619,3 +619,176 @@ func TestStringValueLen(t *testing.T) {
 		assert.Equal(t, len(v), slice.ValueLen(i))
 	}
 }
+func TestStringViewArray(t *testing.T) {

Review Comment:
   Also requires `AppendValueFromString`/`ValueStr` contract test.



##########
go/arrow/array/bufferbuilder.go:
##########
@@ -151,3 +153,112 @@ func (b *bufferBuilder) unsafeAppend(data []byte) {
 	copy(b.bytes[b.length:], data)
 	b.length += len(data)
 }
+
+type multiBufferBuilder struct {
+	refCount  int64
+	blockSize int
+
+	mem              memory.Allocator
+	blocks           []*memory.Buffer
+	currentOutBuffer int
+}
+
+// Retain increases the reference count by 1.
+// Retain may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Retain() {
+	atomic.AddInt64(&b.refCount, 1)
+}
+
+// Release decreases the reference count by 1.
+// When the reference count goes to zero, the memory is freed.
+// Release may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		b.Reset()
+	}
+}
+
+func (b *multiBufferBuilder) Reserve(nbytes int) {
+	if len(b.blocks) == 0 {
+		out := memory.NewResizableBuffer(b.mem)
+		if nbytes < b.blockSize {
+			nbytes = b.blockSize
+		}
+		out.Reserve(nbytes)
+		b.currentOutBuffer = 0
+		b.blocks = []*memory.Buffer{out}
+		return
+	}
+
+	curBuf := b.blocks[b.currentOutBuffer]
+	remain := curBuf.Cap() - curBuf.Len()
+	if nbytes <= remain {
+		return
+	}
+
+	// search for underfull block that has enough bytes
+	for i, block := range b.blocks {
+		remaining := block.Cap() - block.Len()
+		if nbytes <= remaining {
+			b.currentOutBuffer = i
+			return
+		}
+	}
+
+	// current buffer doesn't have enough space, no underfull buffers
+	// make new buffer and set that as our current.
+	newBuf := memory.NewResizableBuffer(b.mem)
+	if nbytes < b.blockSize {
+		nbytes = b.blockSize
+	}
+
+	newBuf.Reserve(nbytes)
+	b.currentOutBuffer = len(b.blocks)
+	b.blocks = append(b.blocks, newBuf)
+}
+
+func (b *multiBufferBuilder) RemainingBytes() int {
+	if len(b.blocks) == 0 {
+		return 0
+	}
+
+	buf := b.blocks[b.currentOutBuffer]
+	return buf.Cap() - buf.Len()
+}
+
+func (b *multiBufferBuilder) Reset() {
+	b.currentOutBuffer = 0
+	for i, block := range b.blocks {
+		block.Release()
+		b.blocks[i] = nil
+	}
+	b.blocks = nil

Review Comment:
   Is setting `b.blocks[i] = nil` necessary?
   Could also be simplifies with `b.Finish()`:
   ```suggestion
   	out := b.Finish()
   	for i, block := range out {
   		block.Release()
   	}
   ```



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= uint32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen:])
+}
+
+func (sh *ViewHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen+4:])
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   I think it's a common practice in Go implementation to add these asserts. This way it will produce a human-readable reason for panic when used with proper build tags (e.g., for tests).



##########
go/arrow/datatype.go:
##########
@@ -272,6 +277,10 @@ func (b BufferSpec) Equals(other BufferSpec) bool {
 type DataTypeLayout struct {
 	Buffers []BufferSpec
 	HasDict bool
+	// if this is non-nil, the number of buffers expected is only
+	// lower-bounded by len(buffers). Buffers beyond this lower bound
+	// are expected to conform to this variadic spec.

Review Comment:
   ```suggestion
   	// VariadicSpec is what the buffers beyond len(Buffers) are expected to conform to.
   ```



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,334 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize            = 32 << 10 // 32 KB
+	viewValueSizeLimit int32 = math.MaxInt32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.ViewHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) SetBlockSize(sz uint) {
+	b.blockBuilder.blockSize = int(sz)
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) != 0 {
+		return
+	}
+
+	if b.nullBitmap != nil {
+		b.nullBitmap.Release()
+		b.nullBitmap = nil
+	}
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+		return
+	}
+
+	b.builder.resize(nbuild, b.init)
+	b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if int32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if int32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsViewInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended unmodified. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData

Review Comment:
   You could make the `go1.20` & `!go1.20` files for different implementations given you already did this for view headers.



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size int32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= int32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen:]))
+}
+
+func (sh *ViewHeader) BufferOffset() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen+4:]))
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")
+	return sh.data[:sh.size]
+}
+
+func (sh *ViewHeader) SetBytes(data []byte) int {
+	sh.size = int32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *ViewHeader) SetString(data string) int {
+	sh.size = int32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) {
+	endian.Native.PutUint32(sh.data[StringViewPrefixLen:], uint32(bufferIndex))
+	endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], uint32(offset))
+}
+
+func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool {
+	if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() {
+		return false
+	}
+
+	if sh.IsInline() {
+		return sh.inlinedAsInt64() == other.inlinedAsInt64()
+	}
+
+	data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size]
+	otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size]
+	return bytes.Equal(data, otherData)
+}
+
+func (sh *ViewHeader) inlinedAsInt64() int64 {
+	s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2)
+	return s[1]
+}
+
+func (sh *ViewHeader) sizeAndPrefixAsInt() int64 {

Review Comment:
   ```suggestion
   func (sh *ViewHeader) sizeAndPrefixAsInt64() int64 {
   ```



##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size int32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= int32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen:]))
+}
+
+func (sh *ViewHeader) BufferOffset() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen+4:]))
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")
+	return sh.data[:sh.size]
+}
+
+func (sh *ViewHeader) SetBytes(data []byte) int {
+	sh.size = int32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *ViewHeader) SetString(data string) int {
+	sh.size = int32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) {
+	endian.Native.PutUint32(sh.data[StringViewPrefixLen:], uint32(bufferIndex))
+	endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], uint32(offset))
+}
+
+func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool {
+	if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() {
+		return false
+	}
+
+	if sh.IsInline() {
+		return sh.inlinedAsInt64() == other.inlinedAsInt64()
+	}
+
+	data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size]

Review Comment:
   You could add a helper func (maybe with asserts?):
   ```go
   func (sh *ViewHeader) variadicData(buffers []*memory.Buffer) []byte {
   	offset := sh.BufferOffset()
   	return buffers[sh.BufferIndex()].Bytes()[offset : offset+sh.size]
   }
   ```
   & then use it as `return bytes.Equal(sh.variadicData(buffers), other.variadicData(otherBuffers))`



##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {

Review Comment:
   Also requires `AppendValueFromString`/`ValueStr` contract test for binary view.



##########
go/arrow/array/bufferbuilder.go:
##########
@@ -151,3 +153,112 @@ func (b *bufferBuilder) unsafeAppend(data []byte) {
 	copy(b.bytes[b.length:], data)
 	b.length += len(data)
 }
+
+type multiBufferBuilder struct {
+	refCount  int64
+	blockSize int
+
+	mem              memory.Allocator
+	blocks           []*memory.Buffer
+	currentOutBuffer int
+}
+
+// Retain increases the reference count by 1.
+// Retain may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Retain() {
+	atomic.AddInt64(&b.refCount, 1)
+}
+
+// Release decreases the reference count by 1.
+// When the reference count goes to zero, the memory is freed.
+// Release may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		b.Reset()
+	}
+}
+
+func (b *multiBufferBuilder) Reserve(nbytes int) {
+	if len(b.blocks) == 0 {
+		out := memory.NewResizableBuffer(b.mem)
+		if nbytes < b.blockSize {
+			nbytes = b.blockSize
+		}
+		out.Reserve(nbytes)
+		b.currentOutBuffer = 0
+		b.blocks = []*memory.Buffer{out}
+		return
+	}
+
+	curBuf := b.blocks[b.currentOutBuffer]
+	remain := curBuf.Cap() - curBuf.Len()
+	if nbytes <= remain {
+		return
+	}
+
+	// search for underfull block that has enough bytes
+	for i, block := range b.blocks {
+		remaining := block.Cap() - block.Len()
+		if nbytes <= remaining {
+			b.currentOutBuffer = i
+			return
+		}
+	}
+
+	// current buffer doesn't have enough space, no underfull buffers
+	// make new buffer and set that as our current.
+	newBuf := memory.NewResizableBuffer(b.mem)
+	if nbytes < b.blockSize {
+		nbytes = b.blockSize
+	}
+
+	newBuf.Reserve(nbytes)
+	b.currentOutBuffer = len(b.blocks)
+	b.blocks = append(b.blocks, newBuf)
+}
+
+func (b *multiBufferBuilder) RemainingBytes() int {
+	if len(b.blocks) == 0 {
+		return 0
+	}
+
+	buf := b.blocks[b.currentOutBuffer]
+	return buf.Cap() - buf.Len()
+}
+
+func (b *multiBufferBuilder) Reset() {
+	b.currentOutBuffer = 0
+	for i, block := range b.blocks {
+		block.Release()
+		b.blocks[i] = nil
+	}
+	b.blocks = nil
+}
+
+func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.ViewHeader, val []byte) {
+	buf := b.blocks[b.currentOutBuffer]
+	idx, offset := b.currentOutBuffer, buf.Len()
+	hdr.SetIndexOffset(int32(idx), int32(offset))
+
+	n := copy(buf.Buf()[offset:], val)
+	buf.ResizeNoShrink(offset + n)
+}
+
+func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.ViewHeader, val string) {
+	// create a byte slice with zero-copies
+	// in go1.20 this would be equivalent to unsafe.StringData
+	v := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{val, len(val)}))
+	b.UnsafeAppend(hdr, v)
+}
+
+func (b *multiBufferBuilder) Finish() (out []*memory.Buffer) {
+	b.currentOutBuffer = 0
+	out = b.blocks
+	b.blocks = nil

Review Comment:
   ```suggestion
   	out, b.blocks = b.blocks, nil
   ```



##########
go/arrow/ipc/file_reader.go:
##########
@@ -476,6 +487,9 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData {
 	case *arrow.BinaryType, *arrow.StringType, *arrow.LargeStringType, *arrow.LargeBinaryType:
 		return ctx.loadBinary(dt)
 
+	case arrow.BinaryViewDataType:

Review Comment:
   Shouldn't there also be a case for `arrow.StringViewDataType`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1207041723


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringHeaderInlineSize]byte
+}
+
+func (sh *StringHeader) IsInline() bool {
+	return sh.size <= uint32(stringHeaderInlineSize)
+}
+
+func (sh *StringHeader) Len() int { return int(sh.size) }
+func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *StringHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:])
+}
+
+func (sh *StringHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:])
+}
+
+func (sh *StringHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")
+	return sh.data[:sh.size]
+}
+
+func (sh *StringHeader) InlineData() (data string) {
+	debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader")
+	h := (*reflect.StringHeader)(unsafe.Pointer(&data))
+	h.Data = uintptr(unsafe.Pointer(&sh.data))
+	h.Len = int(sh.size)
+	return
+}
+
+func (sh *StringHeader) SetBytes(data []byte) int {
+	sh.size = uint32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *StringHeader) SetString(data string) int {
+	sh.size = uint32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) {
+	endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex)
+	endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset)
+}
+
+func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool {
+	if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() {
+		return false
+	}
+
+	if sh.IsInline() {
+		return sh.inlinedAsInt64() == other.inlinedAsInt64()
+	}
+
+	data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size]
+	otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size]
+	return bytes.Equal(data, otherData)
+}
+
+func (sh *StringHeader) inlinedAsInt64() int64 {
+	s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2)
+	return s[1]
+}
+
+func (sh *StringHeader) sizeAndPrefixAsInt() int64 {
+	s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2)
+	return s[0]
+}
+
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}

Review Comment:
   convenience interfaces. in `datatype.go` there's `BinaryDataType` which defines an interface adding `IsUtf8() bool` which requires a `binary()` method so that you can easily write code that works for *all* of these binary interfaces via type switches or interfaces.
   
   Then I added this `view()` method for a `BinaryViewDataType` so you can write code targeting both BinaryView and StringView without needing to duplicate code by just using the interface in the type switch.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238213844


##########
go/arrow/internal/testing/gen/random_array_gen.go:
##########
@@ -350,6 +350,40 @@ func (r *RandomArrayGenerator) LargeString(size int64, minLength, maxLength int6
 	return bldr.NewArray()
 }
 
+func (r *RandomArrayGenerator) StringView(size int64, minLength, maxLength int64, nullProb float64) arrow.Array {
+	return r.generateBinaryView(arrow.BinaryTypes.StringView, size, minLength, maxLength, nullProb)
+}
+
+func (r *RandomArrayGenerator) generateBinaryView(dt arrow.DataType, size int64, minLength, maxLength int64, nullProb float64) arrow.Array {
+	lengths := r.Int32(size, int32(minLength), int32(maxLength), nullProb).(*array.Int32)
+	defer lengths.Release()
+
+	bldr := array.NewBuilder(r.mem, dt).(array.StringLikeBuilder)
+	defer bldr.Release()
+
+	r.extra++
+	dist := rand.New(rand.NewSource(r.seed + r.extra))
+
+	buf := make([]byte, 0, maxLength)
+	gen := func(n int32) string {
+		out := buf[:n]
+		for i := range out {
+			out[i] = uint8(dist.Int31n(int32('z')-int32('A')+1) + int32('A'))
+		}
+		return string(out)
+	}
+
+	for i := 0; i < lengths.Len(); i++ {
+		if lengths.IsValid(i) {
+			bldr.Append(gen(lengths.Value(i)))
+		} else {
+			bldr.AppendNull()
+		}

Review Comment:
   ```suggestion
   		if lengths.IsNull(i) {
   			bldr.AppendNull()
   			continue
   		}
   		bldr.Append(gen(lengths.Value(i)))
   ```
   
   exit-early can be utilized here as well, nit



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendNulls(n int) {
+	b.Reserve(n)
+	for i := 0; i < n; i++ {
+		b.UnsafeAppendBoolToBitmap(false)
+	}
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValues(n int) {
+	b.Reserve(n)
+	b.unsafeAppendBoolsToBitmap(nil, n)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+// AppendValueFromString is paired with ValueStr for fulfilling the
+// base Builder interface. This is intended to read in a human-readable
+// string such as from CSV or JSON and append it to the array.
+//
+// For Binary values are expected to be base64 encoded (and will be
+// decoded as such before being appended).
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
+	if s == NullValueStr {
+		b.AppendNull()
+		return nil
+	}
+
+	if b.dtype.IsUtf8() {
+		b.Append([]byte(s))
+		return nil
+	}
+
+	decodedVal, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return fmt.Errorf("could not decode base64 string: %w", err)
+	}
+	b.Append(decodedVal)
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error {
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	switch v := t.(type) {
+	case string:
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		b.Append(data)
+	case []byte:
+		b.Append(v)
+	case nil:
+		b.AppendNull()
+	default:
+		return &json.UnmarshalTypeError{
+			Value:  fmt.Sprint(t),
+			Type:   reflect.TypeOf([]byte{}),
+			Offset: dec.InputOffset(),
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error {
+	for dec.More() {
+		if err := b.UnmarshalOne(dec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error {
+	dec := json.NewDecoder(bytes.NewReader(data))
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	if delim, ok := t.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("binary view builder must unpack from json array, found %s", delim)
+	}
+
+	return b.Unmarshal(dec)
+}
+
+func (b *BinaryViewBuilder) newData() (data *Data) {
+	bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length)
+	if bytesRequired > 0 && bytesRequired < b.data.Len() {
+		// trim buffers
+		b.data.Resize(bytesRequired)
+	}
+
+	dataBuffers := b.blockBuilder.Finish()
+	data = NewData(b.dtype, b.length, append([]*memory.Buffer{
+		b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0)
+	b.reset()
+
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil

Review Comment:
   should `b.nullBitmap` be also released somewhere around here?



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -1024,6 +1033,30 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr
 		bldr.AppendValues(data, valids)
 		return returnNewArrayData(bldr)
 
+	case *arrow.BinaryViewType:

Review Comment:
   can you use the interface instead? the 2 cases are identical



##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) VariadicCounts(j int) int64 {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.GetInt64(a + flatbuffers.UOffsetT(j*8))
+	}
+	return 0

Review Comment:
   better to use exit-early:
   ```suggestion
   	if o == 0 {
   		return 0
   	}
   	return rcv._tab.GetInt64(rcv._tab.Vector(o) + flatbuffers.UOffsetT(j*8))
   ```



##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) VariadicCounts(j int) int64 {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.GetInt64(a + flatbuffers.UOffsetT(j*8))
+	}
+	return 0
+}
+
+func (rcv *RecordBatch) VariadicCountsLength() int {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		return rcv._tab.VectorLen(o)
+	}
+	return 0
+}
+
+/// Some types such as Utf8View are represented using a variable number of buffers.
+/// For each such Field in the pre-ordered flattened logical schema, there will be
+/// an entry in variadicCounts to indicate the number of extra buffers which belong
+/// to that Field.
+func (rcv *RecordBatch) MutateVariadicCounts(j int, n int64) bool {
+	o := flatbuffers.UOffsetT(rcv._tab.Offset(12))
+	if o != 0 {
+		a := rcv._tab.Vector(o)
+		return rcv._tab.MutateInt64(a+flatbuffers.UOffsetT(j*8), n)
+	}
+	return false

Review Comment:
   exit early
   ```suggestion
   	if o == 0 {
   		return false
   	}
   	return rcv._tab.MutateInt64(rcv._tab.Vector(o)+flatbuffers.UOffsetT(j*8), n)
   ```



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))
+			}
+			o[i] = StringHeader{
+				Size:    hdr.Len(),
+				Inlined: &data,
+			}
+		} else {
+			idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset())
+			prefix := hdr.Prefix()
+			encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:]))

Review Comment:
   is `ToUpper` necessary?



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -1394,6 +1427,24 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array {
 			Offset: strOffsets,
 		}
 
+	case *array.StringView:
+		variadic := variadicBuffersToJSON(arr.Data().Buffers()[2:])

Review Comment:
   same here (use interface, as these are the same cases)



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))
+			}
+			o[i] = StringHeader{
+				Size:    hdr.Len(),
+				Inlined: &data,
+			}
+		} else {

Review Comment:
   can use `continue` here & ditch `else` nesting



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))

Review Comment:
   is `ToUpper` necessary?



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}

Review Comment:
   I do feel like `exit early` should be utilized heavily in the `Release()` implementations, like so:
   ```suggestion
   	if atomic.AddInt64(&b.refCount, -1) != 0 {
   		return
   	}
   
   	if b.nullBitmap != nil {
   		b.nullBitmap.Release()
   		b.nullBitmap = nil
   	}
   	if b.data != nil {
   		b.data.Release()
   		b.data = nil
   		b.rawData = nil
   	}
   ```



##########
go/arrow/array/bufferbuilder.go:
##########
@@ -151,3 +153,115 @@ func (b *bufferBuilder) unsafeAppend(data []byte) {
 	copy(b.bytes[b.length:], data)
 	b.length += len(data)
 }
+
+type multiBufferBuilder struct {
+	refCount  int64
+	blockSize int
+
+	mem              memory.Allocator
+	blocks           []*memory.Buffer
+	currentOutBuffer int
+}
+
+// Retain increases the reference count by 1.
+// Retain may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Retain() {
+	atomic.AddInt64(&b.refCount, 1)
+}
+
+// Release decreases the reference count by 1.
+// When the reference count goes to zero, the memory is freed.
+// Release may be called simultaneously from multiple goroutines.
+func (b *multiBufferBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		for i, buf := range b.blocks {
+			buf.Release()
+			b.blocks[i] = nil
+		}
+	}

Review Comment:
   ```suggestion
   	if atomic.AddInt64(&b.refCount, -1) == 0 {
   		b.Reset()
   	}
   ```



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}

Review Comment:
   ```suggestion
   	if b.capacity == 0 {
   		b.init(n)
   		return
   	}
   	
   	b.builder.resize(nbuild, b.init)
   	b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
   	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
   ```



##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendNulls(n int) {
+	b.Reserve(n)
+	for i := 0; i < n; i++ {
+		b.UnsafeAppendBoolToBitmap(false)
+	}
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValues(n int) {
+	b.Reserve(n)
+	b.unsafeAppendBoolsToBitmap(nil, n)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+// AppendValueFromString is paired with ValueStr for fulfilling the
+// base Builder interface. This is intended to read in a human-readable
+// string such as from CSV or JSON and append it to the array.
+//
+// For Binary values are expected to be base64 encoded (and will be
+// decoded as such before being appended).
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
+	if s == NullValueStr {
+		b.AppendNull()
+		return nil
+	}
+
+	if b.dtype.IsUtf8() {
+		b.Append([]byte(s))
+		return nil
+	}
+
+	decodedVal, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return fmt.Errorf("could not decode base64 string: %w", err)
+	}
+	b.Append(decodedVal)
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error {
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	switch v := t.(type) {
+	case string:
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		b.Append(data)
+	case []byte:
+		b.Append(v)
+	case nil:
+		b.AppendNull()
+	default:
+		return &json.UnmarshalTypeError{
+			Value:  fmt.Sprint(t),
+			Type:   reflect.TypeOf([]byte{}),
+			Offset: dec.InputOffset(),
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error {
+	for dec.More() {
+		if err := b.UnmarshalOne(dec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error {
+	dec := json.NewDecoder(bytes.NewReader(data))
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	if delim, ok := t.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("binary view builder must unpack from json array, found %s", delim)
+	}
+
+	return b.Unmarshal(dec)
+}
+
+func (b *BinaryViewBuilder) newData() (data *Data) {
+	bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length)
+	if bytesRequired > 0 && bytesRequired < b.data.Len() {
+		// trim buffers
+		b.data.Resize(bytesRequired)
+	}
+
+	dataBuffers := b.blockBuilder.Finish()
+	data = NewData(b.dtype, b.length, append([]*memory.Buffer{
+		b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0)
+	b.reset()
+
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+		for _, buf := range dataBuffers {
+			buf.Release()
+		}
+	}
+	return
+}
+
+func (b *BinaryViewBuilder) NewBinaryViewArray() (a *BinaryView) {
+	data := b.newData()
+	a = NewBinaryViewData(data)
+	data.Release()
+	return

Review Comment:
   ```suggestion
   	data := b.newData()
   	defer data.Release()
   	return NewBinaryViewData(data)
   ```
   ?



##########
go/arrow/datatype_stringheader.go:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/endian"
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+	"github.com/apache/arrow/go/v13/arrow/memory"
+)
+
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by

Review Comment:
   ```suggestion
   // or fewer). This is similar to Go's standard string but limited by
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lquerel commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "lquerel (via GitHub)" <gi...@apache.org>.
lquerel commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236024831


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {

Review Comment:
   This is probably a reflex from my Rust experience where this slice will be immutable. In the context of Go, I agree with you that sporadically adding a comment might be counterproductive if we do not apply this globally.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] felipecrv commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236014819


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {

Review Comment:
   This is a concern present everywhere in Arrow though, so a comment here could be understood as implying that places without a comment like this allow buffers to be mutated.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1283613027


##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))

Review Comment:
   I think relaxing the spec is not a trivial change; it'd require all implementers to update how they consume binary data. That said, although it'd be a lot of work updating the integration JSON (perhaps even including a real JSON schema :scream:) is definitely worth a follow up.
   
   See also:
   - https://github.com/apache/arrow/pull/7110
   - https://github.com/apache/arrow/issues/34898



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1374779003


##########
go/arrow/datatype_viewheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+)
+
+func (sh *ViewHeader) InlineData() (data string) {

Review Comment:
   Provides the value as a `string` instead of `[]byte` for consumers without performing a copy. so rather than consumers having to work out / implement the zero-copy conversion themselves we provide it for them.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376284554


##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= uint32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen:])
+}
+
+func (sh *ViewHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringViewPrefixLen+4:])
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   It just seemed that since `/Inline(Bytes|String)/` is always used inside `if IsInline {}`, it might be preferable to write `if inline := sh.InlineBytes(); inline != nil {}`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "lidavidm (via GitHub)" <gi...@apache.org>.
lidavidm commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1206920219


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringHeaderInlineSize]byte
+}
+
+func (sh *StringHeader) IsInline() bool {
+	return sh.size <= uint32(stringHeaderInlineSize)
+}
+
+func (sh *StringHeader) Len() int { return int(sh.size) }
+func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *StringHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:])
+}
+
+func (sh *StringHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:])
+}
+
+func (sh *StringHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   should we assert `!IsInline` for `BufferIndex` and `BufferOffset`?



##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {
+	size uint32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringHeaderInlineSize]byte
+}
+
+func (sh *StringHeader) IsInline() bool {
+	return sh.size <= uint32(stringHeaderInlineSize)
+}
+
+func (sh *StringHeader) Len() int { return int(sh.size) }
+func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *StringHeader) BufferIndex() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:])
+}
+
+func (sh *StringHeader) BufferOffset() uint32 {
+	return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:])
+}
+
+func (sh *StringHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")
+	return sh.data[:sh.size]
+}
+
+func (sh *StringHeader) InlineData() (data string) {
+	debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader")
+	h := (*reflect.StringHeader)(unsafe.Pointer(&data))
+	h.Data = uintptr(unsafe.Pointer(&sh.data))
+	h.Len = int(sh.size)
+	return
+}
+
+func (sh *StringHeader) SetBytes(data []byte) int {
+	sh.size = uint32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *StringHeader) SetString(data string) int {
+	sh.size = uint32(len(data))
+	if sh.IsInline() {
+		return copy(sh.data[:], data)
+	}
+	return copy(sh.data[:4], data)
+}
+
+func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) {
+	endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex)
+	endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset)
+}
+
+func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool {
+	if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() {
+		return false
+	}
+
+	if sh.IsInline() {
+		return sh.inlinedAsInt64() == other.inlinedAsInt64()
+	}
+
+	data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size]
+	otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size]
+	return bytes.Equal(data, otherData)
+}
+
+func (sh *StringHeader) inlinedAsInt64() int64 {
+	s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2)
+	return s[1]
+}
+
+func (sh *StringHeader) sizeAndPrefixAsInt() int64 {
+	s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2)
+	return s[0]
+}
+
+type BinaryViewType struct{}
+
+func (*BinaryViewType) ID() Type              { return BINARY_VIEW }
+func (*BinaryViewType) Name() string          { return "binary_view" }
+func (*BinaryViewType) String() string        { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool          { return false }
+func (*BinaryViewType) binary()               {}
+func (*BinaryViewType) view()                 {}

Review Comment:
   What are these for?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237590506


##########
go/arrow/datatype_stringheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v13/arrow/internal/debug"
+)
+
+func (sh *StringHeader) InlineData() (data string) {
+	debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader")
+
+	h := (*reflect.StringHeader)(unsafe.Pointer(&data))

Review Comment:
   yea. The `Len` in the `StringHeader` for tinygo is a `uintptr` instead of an `int` which is what go1.19 has. Compare line 33 of each file.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238190324


##########
go/arrow/ipc/file_reader.go:
##########
@@ -430,13 +430,18 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode {
 	return &node
 }
 
+func (src *ipcSource) variadicCount(i int) int64 {
+	return src.meta.VariadicCounts(i)
+}
+
 type arrayLoaderContext struct {
-	src     ipcSource
-	ifield  int
-	ibuffer int
-	max     int
-	memo    *dictutils.Memo
-	version MetadataVersion
+	src       ipcSource
+	ifield    int
+	ibuffer   int
+	ivariadic int

Review Comment:
   maybe these should be grouped together into an embedded struct? (`iStruct` :chuckle:)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1224267500


##########
go/arrow/datatype_binary.go:
##########
@@ -83,16 +93,168 @@ func (t *LargeStringType) Layout() DataTypeLayout {
 func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits }
 func (LargeStringType) IsUtf8() bool                      { return true }
 
+const (
+	StringHeaderPrefixLen  = 4
+	stringHeaderInlineSize = 12
+)
+
+func IsStringHeaderInline(length int) bool {
+	return length < stringHeaderInlineSize
+}
+
+// StringHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string  but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//	Short string   |----|----|--------|
+//	                ^    ^    ^
+//	                |    |    |
+//	              size prefix remaining in-line portion, zero padded
+//
+//	IO Long String |----|----|----|----|
+//	                ^    ^     ^     ^
+//	                |    |     |     |
+//	              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type StringHeader struct {

Review Comment:
   I suppose you could force the alignment by using `[2]uint64` for storage. Would that pessimize the struct too much?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1374786544


##########
go/arrow/array/bufferbuilder.go:
##########
@@ -151,3 +153,112 @@ func (b *bufferBuilder) unsafeAppend(data []byte) {
 	copy(b.bytes[b.length:], data)
 	b.length += len(data)
 }
+
+type multiBufferBuilder struct {
+	refCount  int64
+	blockSize int

Review Comment:
   the `multiBufferBuilder` isn't exported itself, so a user wouldn't even have access to the buffer builder in order to set the blocksize. I can add a method to the `BinaryViewBuilder` that would allow setting the blocksize



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1377087159


##########
go/arrow/datatype_viewheader.go:
##########
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arrow
+
+import (
+	"bytes"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/endian"
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+)
+
+const (
+	StringViewPrefixLen  = 4
+	stringViewInlineSize = 12
+)
+
+func IsViewInline(length int) bool {
+	return length < stringViewInlineSize
+}
+
+// ViewHeader is a variable length string (utf8) or byte slice with
+// a 4 byte prefix and inline optimization for small values (12 bytes
+// or fewer). This is similar to Go's standard string but limited by
+// a length of Uint32Max and up to the first four bytes of the string
+// are copied into the struct. This prefix allows failing comparisons
+// early and can reduce CPU cache working set when dealing with short
+// strings.
+//
+// There are two situations:
+//
+//		Entirely inlined string data
+//	                |----|------------|
+//		                ^    ^
+//		                |    |
+//		              size  inline string data, zero padded
+//
+//		Reference into buffer
+//	                |----|----|----|----|
+//		                ^    ^     ^     ^
+//		                |    |     |     |
+//		              size prefix buffer index and offset to out-of-line portion
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+type ViewHeader struct {
+	size int32
+	// the first 4 bytes of this are the prefix for the string
+	// if size <= StringHeaderInlineSize, then the entire string
+	// is in the data array and is zero padded.
+	// if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32
+	// values which are the buffer index and offset in that buffer
+	// containing the full string.
+	data [stringViewInlineSize]byte
+}
+
+func (sh *ViewHeader) IsInline() bool {
+	return sh.size <= int32(stringViewInlineSize)
+}
+
+func (sh *ViewHeader) Len() int { return int(sh.size) }
+func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte {
+	return *(*[4]byte)(unsafe.Pointer(&sh.data))
+}
+
+func (sh *ViewHeader) BufferIndex() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen:]))
+}
+
+func (sh *ViewHeader) BufferOffset() int32 {
+	return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen+4:]))
+}
+
+func (sh *ViewHeader) InlineBytes() (data []byte) {
+	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader")

Review Comment:
   ```suggestion
   	debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline ViewHeader")
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on PR #35769:
URL: https://github.com/apache/arrow/pull/35769#issuecomment-1604357018

   Thanks for the reviews @lquerel and @candiduslynx!!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238732612


##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendNulls(n int) {
+	b.Reserve(n)
+	for i := 0; i < n; i++ {
+		b.UnsafeAppendBoolToBitmap(false)
+	}
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValues(n int) {
+	b.Reserve(n)
+	b.unsafeAppendBoolsToBitmap(nil, n)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+// AppendValueFromString is paired with ValueStr for fulfilling the
+// base Builder interface. This is intended to read in a human-readable
+// string such as from CSV or JSON and append it to the array.
+//
+// For Binary values are expected to be base64 encoded (and will be
+// decoded as such before being appended).
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
+	if s == NullValueStr {
+		b.AppendNull()
+		return nil
+	}
+
+	if b.dtype.IsUtf8() {
+		b.Append([]byte(s))
+		return nil
+	}
+
+	decodedVal, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return fmt.Errorf("could not decode base64 string: %w", err)
+	}
+	b.Append(decodedVal)
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error {
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	switch v := t.(type) {
+	case string:
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		b.Append(data)
+	case []byte:
+		b.Append(v)
+	case nil:
+		b.AppendNull()
+	default:
+		return &json.UnmarshalTypeError{
+			Value:  fmt.Sprint(t),
+			Type:   reflect.TypeOf([]byte{}),
+			Offset: dec.InputOffset(),
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error {
+	for dec.More() {
+		if err := b.UnmarshalOne(dec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error {
+	dec := json.NewDecoder(bytes.NewReader(data))
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	if delim, ok := t.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("binary view builder must unpack from json array, found %s", delim)
+	}
+
+	return b.Unmarshal(dec)
+}
+
+func (b *BinaryViewBuilder) newData() (data *Data) {
+	bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length)
+	if bytesRequired > 0 && bytesRequired < b.data.Len() {
+		// trim buffers
+		b.data.Resize(bytesRequired)
+	}
+
+	dataBuffers := b.blockBuilder.Finish()
+	data = NewData(b.dtype, b.length, append([]*memory.Buffer{
+		b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0)
+	b.reset()
+
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+		for _, buf := range dataBuffers {
+			buf.Release()
+		}
+	}
+	return
+}
+
+func (b *BinaryViewBuilder) NewBinaryViewArray() (a *BinaryView) {
+	data := b.newData()
+	a = NewBinaryViewData(data)
+	data.Release()
+	return

Review Comment:
   where possible I actually prefer the manual release here as it is *technically* ever so slightly more performant to avoid the `defer` when it is not needed. If there were more lines between the call to `newData` and the return I'd do the defer, but since it's only one line I prefer to follow the same pattern that the rest of the `New*Array` methods are using. What do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238871069


##########
go/arrow/array/binarybuilder.go:
##########
@@ -370,6 +371,327 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+// AppendString is identical to Append, only accepting a string instead
+// of a byte slice, avoiding the extra copy that would occur if you simply
+// did []byte(v).
+//
+// This is different than AppendValueFromString which exists for the
+// Builder interface, in that this expects raw binary data which is
+// appended as such. AppendValueFromString expects base64 encoded binary
+// data instead.
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendNulls(n int) {
+	b.Reserve(n)
+	for i := 0; i < n; i++ {
+		b.UnsafeAppendBoolToBitmap(false)
+	}
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValues(n int) {
+	b.Reserve(n)
+	b.unsafeAppendBoolsToBitmap(nil, n)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+// AppendValueFromString is paired with ValueStr for fulfilling the
+// base Builder interface. This is intended to read in a human-readable
+// string such as from CSV or JSON and append it to the array.
+//
+// For Binary values are expected to be base64 encoded (and will be
+// decoded as such before being appended).
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
+	if s == NullValueStr {
+		b.AppendNull()
+		return nil
+	}
+
+	if b.dtype.IsUtf8() {
+		b.Append([]byte(s))
+		return nil
+	}
+
+	decodedVal, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return fmt.Errorf("could not decode base64 string: %w", err)
+	}
+	b.Append(decodedVal)
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error {
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	switch v := t.(type) {
+	case string:
+		data, err := base64.StdEncoding.DecodeString(v)
+		if err != nil {
+			return err
+		}
+		b.Append(data)
+	case []byte:
+		b.Append(v)
+	case nil:
+		b.AppendNull()
+	default:
+		return &json.UnmarshalTypeError{
+			Value:  fmt.Sprint(t),
+			Type:   reflect.TypeOf([]byte{}),
+			Offset: dec.InputOffset(),
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error {
+	for dec.More() {
+		if err := b.UnmarshalOne(dec); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error {
+	dec := json.NewDecoder(bytes.NewReader(data))
+	t, err := dec.Token()
+	if err != nil {
+		return err
+	}
+
+	if delim, ok := t.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("binary view builder must unpack from json array, found %s", delim)
+	}
+
+	return b.Unmarshal(dec)
+}
+
+func (b *BinaryViewBuilder) newData() (data *Data) {
+	bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length)
+	if bytesRequired > 0 && bytesRequired < b.data.Len() {
+		// trim buffers
+		b.data.Resize(bytesRequired)
+	}
+
+	dataBuffers := b.blockBuilder.Finish()
+	data = NewData(b.dtype, b.length, append([]*memory.Buffer{
+		b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0)
+	b.reset()
+
+	if b.data != nil {
+		b.data.Release()
+		b.data = nil
+		b.rawData = nil
+		for _, buf := range dataBuffers {
+			buf.Release()
+		}
+	}
+	return
+}
+
+func (b *BinaryViewBuilder) NewBinaryViewArray() (a *BinaryView) {
+	data := b.newData()
+	a = NewBinaryViewData(data)
+	data.Release()
+	return

Review Comment:
   a nit, so I'm OK either way



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237575843


##########
dev/archery/archery/integration/datagen.py:
##########
@@ -743,6 +763,72 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
     pass
 
 
+class BinaryViewColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return frombytes(binascii.hexlify(x).upper())
+
+    def _get_buffers(self):
+        char_buffers = []
+        DEFAULT_BUFFER_SIZE = 32  # ¯\_(ツ)_/¯
+        INLINE_SIZE = 12
+
+        data = []
+        for i, v in enumerate(self.values):
+            if not self.is_valid[i]:
+                v = b''
+            assert isinstance(v, bytes)
+
+            if len(v) > INLINE_SIZE:
+                offset = 0
+                if len(v) > DEFAULT_BUFFER_SIZE:
+                    char_buffers.append(v)
+                else:
+                    if len(char_buffers) == 0:
+                        char_buffers.append(v)
+                    elif len(char_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
+                        char_buffers.append(v)
+                    else:
+                        offset = len(char_buffers[-1])
+                        char_buffers[-1] += v
+                        assert len(char_buffers[-1]) <= DEFAULT_BUFFER_SIZE

Review Comment:
   @bkietz Is there a reason for this assert to exist given the if/elif clauses above it? I pretty much just took this from your C++ PR for the integration tests.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236476842


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {
+	if a.IsNull(i) {
+		return NullValueStr
+	}
+	return base64.StdEncoding.EncodeToString(a.Value(i))
+}
+
+func (a *BinaryView) GetOneForMarshal(i int) interface{} {
+	if a.IsNull(i) {
+		return nil
+	}
+	return a.Value(i)
+}
+
+func (a *BinaryView) MarshalJSON() ([]byte, error) {
+	vals := make([]interface{}, a.Len())
+	for i := 0; i < a.Len(); i++ {
+		vals[i] = a.GetOneForMarshal(i)
+	}
+	// golang marshal standard says that []byte will be marshalled
+	// as a base64-encoded string
+	return json.Marshal(vals)
+}
+
+func arrayEqualBinaryView(left, right *BinaryView) bool {
+	leftBufs, rightBufs := left.dataBuffers, right.dataBuffers
+	for i := 0; i < left.Len(); i++ {
+		if left.IsNull(i) {

Review Comment:
   this is already checked in the `baseArrayEqual` func (~ len, data type, nulls are same)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237592099


##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
 	return b.Unmarshal(dec)
 }
 
+const (
+	dfltBlockSize             = 1 << 20 // 1 MB
+	viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+	builder
+	dtype arrow.BinaryDataType
+
+	data    *memory.Buffer
+	rawData []arrow.StringHeader
+
+	blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+	return &BinaryViewBuilder{
+		dtype: arrow.BinaryTypes.BinaryView,
+		builder: builder{
+			refCount: 1,
+			mem:      mem,
+		},
+		blockBuilder: multiBufferBuilder{
+			refCount:  1,
+			blockSize: dfltBlockSize,
+			mem:       mem,
+		},
+	}
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+	debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+	if atomic.AddInt64(&b.refCount, -1) == 0 {
+		if b.nullBitmap != nil {
+			b.nullBitmap.Release()
+			b.nullBitmap = nil
+		}
+		if b.data != nil {
+			b.data.Release()
+			b.data = nil
+			b.rawData = nil
+		}
+	}
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+	b.builder.init(capacity)
+	b.data = memory.NewResizableBuffer(b.mem)
+	bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+	b.data.Resize(bytesN)
+	b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+	nbuild := n
+	if n < minBuilderCapacity {
+		n = minBuilderCapacity
+	}
+
+	if b.capacity == 0 {
+		b.init(n)
+	} else {
+		b.builder.resize(nbuild, b.init)
+		b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+		b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+	}
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+	if uint32(length) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+			arrow.ErrInvalid))
+	}
+	b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+	b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+	if uint32(len(v)) > viewValueSizeLimit {
+		panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+	}
+
+	if !arrow.IsStringHeaderInline(len(v)) {
+		b.ReserveData(len(v))
+	}
+
+	b.Reserve(1)
+	b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+	// create a []byte without copying the bytes
+	// in go1.20 this would be unsafe.StringData
+	val := *(*[]byte)(unsafe.Pointer(&struct {
+		string
+		int
+	}{v, len(v)}))
+	b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+	b.Reserve(1)
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+	hdr := &b.rawData[b.length]
+	hdr.SetBytes(v)
+	if !hdr.IsInline() {
+		b.blockBuilder.UnsafeAppend(hdr, v)
+	}
+	b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetBytes(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppend(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+	if len(v) != len(valid) && len(valid) != 0 {
+		panic("len(v) != len(valid) && len(valid) != 0")
+	}
+
+	if len(v) == 0 {
+		return
+	}
+
+	b.Reserve(len(v))
+	outOfLineTotal := 0
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			if !arrow.IsStringHeaderInline(len(vv)) {
+				outOfLineTotal += len(vv)
+			}
+		}
+	}
+
+	b.ReserveData(outOfLineTotal)
+	for i, vv := range v {
+		if len(valid) == 0 || valid[i] {
+			hdr := &b.rawData[b.length+i]
+			hdr.SetString(vv)
+			if !hdr.IsInline() {
+				b.blockBuilder.UnsafeAppendString(hdr, vv)
+			}
+		}
+	}
+
+	b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {

Review Comment:
   done, let me know if you think it is sufficient or needs some more information



##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   done, let me know if you think it is sufficient or needs some more information



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] candiduslynx commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "candiduslynx (via GitHub)" <gi...@apache.org>.
candiduslynx commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238205061


##########
go/arrow/internal/flatbuf/RecordBatch.go:
##########
@@ -128,8 +128,42 @@ func (rcv *RecordBatch) Compression(obj *BodyCompression) *BodyCompression {
 }
 
 /// Optional compression of the message body
+/// Some types such as Utf8View are represented using a variable number of buffers.

Review Comment:
   I wonder why `///` is used instead of `//`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376288902


##########
go/arrow/ipc/metadata.go:
##########
@@ -1186,18 +1202,25 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size,
 	return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength)
 }
 
-func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT {
+func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT {
 	fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector)
 	metaFB := writeBuffers(b, meta, flatbuf.RecordBatchStartBuffersVector)
 	var bodyCompressFB flatbuffers.UOffsetT
 	if codec != -1 {
 		bodyCompressFB = writeBodyCompression(b, codec)
 	}
 
+	flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts))
+	for i := len(variadicCounts) - 1; i >= 0; i-- {
+		b.PrependInt64(variadicCounts[i])
+	}
+	vcFB := b.EndVector(len(variadicCounts))
+

Review Comment:
   +1; I agree it probably would be preferable to leave the variadic buffer counts absent rather than empty if none are necessary



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-38718: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz merged PR #35769:
URL: https://github.com/apache/arrow/pull/35769


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] felipecrv commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1236027661


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {
+	if a.IsNull(i) {
+		return NullValueStr
+	}
+	return base64.StdEncoding.EncodeToString(a.Value(i))
+}
+
+func (a *BinaryView) GetOneForMarshal(i int) interface{} {
+	if a.IsNull(i) {
+		return nil
+	}
+	return a.Value(i)
+}
+
+func (a *BinaryView) MarshalJSON() ([]byte, error) {
+	vals := make([]interface{}, a.Len())
+	for i := 0; i < a.Len(); i++ {
+		vals[i] = a.GetOneForMarshal(i)
+	}
+	// golang marshal standard says that []byte will be marshalled
+	// as a base64-encoded string
+	return json.Marshal(vals)
+}
+
+func arrayEqualBinaryView(left, right *BinaryView) bool {
+	leftBufs, rightBufs := left.dataBuffers, right.dataBuffers
+	for i := 0; i < left.Len(); i++ {
+		if left.IsNull(i) {

Review Comment:
   Shouldn't `right` also be Null here for the slot to be skipped?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238701161


##########
go/arrow/ipc/writer.go:
##########
@@ -605,6 +606,33 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
 		p.body = append(p.body, voffsets)
 		p.body = append(p.body, values)
 
+	case arrow.BinaryViewDataType:
+		data := arr.Data()
+		values := data.Buffers()[1]
+		arrLen := int64(arr.Len())
+		typeWidth := int64(arrow.StringHeaderSizeBytes)
+		minLength := paddedLength(arrLen*typeWidth, kArrowAlignment)
+
+		switch {
+		case needTruncate(int64(data.Offset()), values, minLength):
+			// non-zero offset: slice the buffer
+			offset := int64(data.Offset()) * typeWidth
+			// send padding if available
+			len := minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len())-offset)
+			values = memory.NewBufferBytes(values.Bytes()[offset : offset+len])

Review Comment:
   good point! nice catch



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1238716504


##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))

Review Comment:
   The spec for the integration test is to specify that the binary data should be upper-cased hex characters. @bkietz can comment on whether or not the ToUpper is necessary and if we want to relax the spec.



##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -2179,3 +2230,113 @@ func durationToJSON(arr *array.Duration) []interface{} {
 	}
 	return o
 }
+
+func variadicBuffersFromJSON(bufs []string) []*memory.Buffer {
+	out := make([]*memory.Buffer, len(bufs))
+	for i, data := range bufs {
+		rawData, err := hex.DecodeString(data)
+		if err != nil {
+			panic(err)
+		}
+
+		out[i] = memory.NewBufferBytes(rawData)
+	}
+	return out
+}
+
+func variadicBuffersToJSON(bufs []*memory.Buffer) []string {
+	out := make([]string, len(bufs))
+	for i, data := range bufs {
+		out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes()))
+	}
+	return out
+}
+
+func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer {
+	buf := memory.NewResizableBuffer(mem)
+	buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data)))
+
+	values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes())
+
+	for i, d := range data {
+		switch v := d.(type) {
+		case nil:
+			continue
+		case map[string]interface{}:
+			if inlined, ok := v["INLINED"]; ok {
+				if isBinary {
+					val, err := hex.DecodeString(inlined.(string))
+					if err != nil {
+						panic(fmt.Errorf("could not decode %v: %v", inlined, err))
+					}
+					values[i].SetBytes(val)
+				} else {
+					values[i].SetString(inlined.(string))
+				}
+				continue
+			}
+
+			idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number)
+			bufIdx, err := idx.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			bufOffset, err := offset.Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset))
+			prefix, err := hex.DecodeString(v["PREFIX"].(string))
+			if err != nil {
+				panic(err)
+			}
+			sz, err := v["SIZE"].(json.Number).Int64()
+			if err != nil {
+				panic(err)
+			}
+
+			rawData := make([]byte, sz)
+			copy(rawData, prefix)
+			values[i].SetBytes(rawData)
+		}
+	}
+	return buf
+}
+
+func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} {
+	type StringHeader struct {
+		Size      int     `json:"SIZE"`
+		Prefix    *string `json:"PREFIX,omitempty"`
+		BufferIdx *int    `json:"BUFFER_INDEX,omitempty"`
+		BufferOff *int    `json:"OFFSET,omitempty"`
+		Inlined   *string `json:"INLINED,omitempty"`
+	}
+
+	o := make([]interface{}, arr.Len())
+	for i := range o {
+		hdr := arr.ValueHeader(i)
+		if hdr.IsInline() {
+			data := hdr.InlineData()
+			if isBinary {
+				data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes()))
+			}
+			o[i] = StringHeader{
+				Size:    hdr.Len(),
+				Inlined: &data,
+			}
+		} else {
+			idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset())
+			prefix := hdr.Prefix()
+			encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:]))

Review Comment:
   Let's keep this discussion to https://github.com/apache/arrow/pull/35769#discussion_r1238716504



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237271570


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   I'll add the clarifying comments



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] zeroshade commented on a diff in pull request #35769: GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237260043


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.StringHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {

Review Comment:
   At the top level, there do already exist comments that state that it is intended that all Arrow Arrays be immutable. I agree with the concern that adding a comment here specifically could be counterproductive. If you can think of a good place to put such a comment that would be more universal, I'd be more than happy to do so.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1374844861


##########
go/arrow/datatype_viewheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+)
+
+func (sh *ViewHeader) InlineData() (data string) {

Review Comment:
   Ah, then maybe 
   ```suggestion
   func (sh *ViewHeader) InlineString() (data string) {
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1375005262


##########
go/arrow/datatype_viewheader_inline_tinygo.go:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && tinygo
+
+package arrow
+
+import (
+	"reflect"
+	"unsafe"
+
+	"github.com/apache/arrow/go/v14/arrow/internal/debug"
+)
+
+func (sh *ViewHeader) InlineData() (data string) {

Review Comment:
   that's fair. renamed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1376461078


##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -818,6 +826,7 @@ type Array struct {
 	Offset   interface{}           `json:"OFFSET,omitempty"`
 	Size     interface{}           `json:"SIZE,omitempty"`
 	Children []Array               `json:"children,omitempty"`
+	Variadic []string              `json:"VARIADIC_BUFFERS,omitempty"`

Review Comment:
   The integration testing JSON format has a defined format which uses caps for the array data values themselves and lower case for structural data like children, metadata, and data types.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-35627: [Go][Format][Integration] Add StringView/BinaryView to Go implementation [arrow]

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1391703078


##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
 	return true
 }
 
+type ViewLike interface {
+	arrow.Array
+	ValueHeader(int) *arrow.ViewHeader
+}
+
+type BinaryView struct {
+	array
+	values      []arrow.ViewHeader
+	dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+	a := &BinaryView{}
+	a.refCount = 1
+	a.setData(data.(*Data))
+	return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+	if len(data.buffers) < 2 {
+		panic("len(data.buffers) < 2")
+	}
+	a.array.setData(data)
+
+	if valueData := data.buffers[1]; valueData != nil {
+		a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes())
+	}
+
+	a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
+	if i < 0 || i >= a.array.data.length {
+		panic("arrow/array: index out of range")
+	}
+	return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+	s := a.ValueHeader(i)
+	if s.IsInline() {
+		return s.InlineBytes()
+	}
+	start := s.BufferOffset()
+	buf := a.dataBuffers[s.BufferIndex()]
+	return buf.Bytes()[start : start+int32(s.Len())]
+}
+
+// ValueString returns the value at index i as a string instead of
+// a byte slice, without copying the underlying data.
+func (a *BinaryView) ValueString(i int) string {
+	b := a.Value(i)
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+	var o strings.Builder
+	o.WriteString("[")
+	for i := 0; i < a.Len(); i++ {
+		if i > 0 {
+			o.WriteString(" ")
+		}
+		switch {
+		case a.IsNull(i):
+			o.WriteString(NullValueStr)
+		default:
+			fmt.Fprintf(&o, "%q", a.ValueString(i))
+		}
+	}
+	o.WriteString("]")
+	return o.String()
+}
+
+// ValueStr is paired with AppendValueFromString in that it returns
+// the value at index i as a string: Semantically this means that for
+// a null value it will return the string "(null)", otherwise it will
+// return the value as a base64 encoded string suitable for CSV/JSON.
+//
+// This is always going to be less performant than just using ValueString
+// and exists to fulfill the Array interface to provide a method which
+// can produce a human readable string for a given index.
+func (a *BinaryView) ValueStr(i int) string {

Review Comment:
   added



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org