You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ze...@apache.org on 2023/04/10 18:51:01 UTC

[arrow] branch main updated: GH-34853: [Go] Add TotalRecordSize, TotalArraySize (#34854)

This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2fe17338e2 GH-34853: [Go] Add TotalRecordSize, TotalArraySize (#34854)
2fe17338e2 is described below

commit 2fe17338e2d1f85d0c2685d31d2dd51f138b6b80
Author: Yevgeny Pats <ye...@gmail.com>
AuthorDate: Mon Apr 10 14:50:55 2023 -0400

    GH-34853: [Go] Add TotalRecordSize, TotalArraySize (#34854)
    
    Closes https://github.com/apache/arrow/issues/34853
    * Closes: #34853
    
    Authored-by: Yevgeny Pats <16...@users.noreply.github.com>
    Signed-off-by: Matt Topol <zo...@gmail.com>
---
 go/arrow/util/byte_size.go      |  79 +++++++++++++++++++++++++++++
 go/arrow/util/byte_size_test.go | 110 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)

diff --git a/go/arrow/util/byte_size.go b/go/arrow/util/byte_size.go
new file mode 100644
index 0000000000..db08e8d27b
--- /dev/null
+++ b/go/arrow/util/byte_size.go
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package util
+
+import (
+	"github.com/apache/arrow/go/v12/arrow"
+	"github.com/apache/arrow/go/v12/arrow/array"
+	"github.com/apache/arrow/go/v12/arrow/memory"
+)
+
+func isArrayDataNil(arrayData arrow.ArrayData) bool {
+	if arrayData == nil {
+		return true
+	}
+	if v, ok := arrayData.(*array.Data); ok {
+		return v == nil
+	}
+	panic("unknown ArrayData type")
+}
+
+// totalArrayDataSize sums the lengths of the buffers reachable from arrayData:
+// its own buffers, then its children, then its dictionary. Nil buffers and
+// buffers already present in seenBuffers are skipped.
+func totalArrayDataSize(arrayData arrow.ArrayData, seenBuffers map[*memory.Buffer]struct{}) int64 {
+	var total int64
+	for _, b := range arrayData.Buffers() {
+		if _, seen := seenBuffers[b]; b == nil || seen {
+			continue
+		}
+		// Record the buffer so that a later reference to it is not counted again.
+		seenBuffers[b] = struct{}{}
+		total += int64(b.Len())
+	}
+	for _, c := range arrayData.Children() {
+		total += totalArrayDataSize(c, seenBuffers)
+	}
+	if d := arrayData.Dictionary(); !isArrayDataNil(d) {
+		total += totalArrayDataSize(d, seenBuffers)
+	}
+	return total
+}
+
+func totalArraySize(arr arrow.Array, seenBuffers map[*memory.Buffer]struct{}) int64 {
+	return totalArrayDataSize(arr.Data(), seenBuffers) // delegate to the ArrayData walk, sharing seenBuffers
+}
+
+func totalRecordSize(record arrow.Record, seenBuffers map[*memory.Buffer]struct{}) int64 {
+	var total int64
+	for _, col := range record.Columns() {
+		total += totalArraySize(col, seenBuffers) // seenBuffers carries over from one column to the next
+	}
+	return total
+}
+
+// TotalArraySize returns the number of bytes held by the buffers reachable from arr,
+// including those of its children and dictionary. A shared buffer is counted once.
+func TotalArraySize(arr arrow.Array) int64 {
+	return totalArraySize(arr, map[*memory.Buffer]struct{}{})
+}
+
+// TotalRecordSize returns the number of bytes held by the buffers reachable from the
+// columns of record. A buffer shared between columns is counted once.
+func TotalRecordSize(record arrow.Record) int64 {
+	return totalRecordSize(record, map[*memory.Buffer]struct{}{})
+}
diff --git a/go/arrow/util/byte_size_test.go b/go/arrow/util/byte_size_test.go
new file mode 100644
index 0000000000..794aaf1953
--- /dev/null
+++ b/go/arrow/util/byte_size_test.go
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package util_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/apache/arrow/go/v12/arrow"
+	"github.com/apache/arrow/go/v12/arrow/array"
+	"github.com/apache/arrow/go/v12/arrow/memory"
+	"github.com/apache/arrow/go/v12/arrow/util"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestTotalArrayReusedBuffers checks that a buffer referenced by more than one
+// column is only counted once by TotalRecordSize.
+func TestTotalArrayReusedBuffers(t *testing.T) {
+	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer alloc.AssertSize(t, 0)
+	builder := array.NewBooleanBuilder(alloc)
+	defer builder.Release()
+	builder.Append(true)
+	col := builder.NewArray()
+	defer col.Release()
+
+	boolType := arrow.FixedWidthTypes.Boolean
+	fieldA := arrow.Field{Name: "a", Type: boolType}
+	fieldB := arrow.Field{Name: "b", Type: boolType}
+
+	// The same array backs both columns here.
+	twice := array.NewRecord(arrow.NewSchema([]arrow.Field{fieldA, fieldB}, nil), []arrow.Array{col, col}, 1)
+	defer twice.Release()
+	assert.Equal(t, int64(5), util.TotalRecordSize(twice))
+
+	// A single column over that array must report the same total.
+	once := array.NewRecord(arrow.NewSchema([]arrow.Field{fieldA}, nil), []arrow.Array{col}, 1)
+	defer once.Release()
+	assert.Equal(t, int64(5), util.TotalRecordSize(once))
+}
+
+// TestTotalArraySizeBasic checks TotalArraySize on flat arrays: int16 values
+// without nulls, int16 values containing a null, and an empty boolean array.
+func TestTotalArraySizeBasic(t *testing.T) {
+	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer mem.AssertSize(t, 0)
+	// parseInt16 aborts the test on a parse error rather than handing back an unusable array.
+	parseInt16 := func(js string) arrow.Array {
+		arr, _, err := array.FromJSON(mem, arrow.PrimitiveTypes.Int16, strings.NewReader(js))
+		if !assert.NoError(t, err) {
+			t.FailNow()
+		}
+		return arr
+	}
+
+	noNulls := parseInt16("[1, 2, 3]")
+	defer noNulls.Release()
+	assert.Equal(t, int64(10), util.TotalArraySize(noNulls))
+	withNulls := parseInt16("[1, 2, 3, 4, null, 6, 7, 8, 9]")
+	defer withNulls.Release()
+	assert.Equal(t, int64(22), util.TotalArraySize(withNulls))
+	bldr := array.NewBooleanBuilder(mem)
+	defer bldr.Release()
+	empty := bldr.NewArray()
+	defer empty.Release()
+	assert.Equal(t, int64(0), util.TotalArraySize(empty))
+}
+
+// TestTotalArraySizeNested checks TotalArraySize on a nested type, a list of int64 values.
+func TestTotalArraySizeNested(t *testing.T) {
+	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer mem.AssertSize(t, 0)
+	list, _, err := array.FromJSON(mem, arrow.ListOf(arrow.PrimitiveTypes.Int64), strings.NewReader("[[0, 1, 2, 3, 4], [5], null]"))
+	if !assert.NoError(t, err) {
+		t.FailNow() // list cannot be used after a parse error, so stop here
+	}
+	defer list.Release()
+	assert.Equal(t, int64(72), util.TotalArraySize(list))
+}
+
+// TestTotalArraySizeRecord checks TotalRecordSize on a record with an int32 and an int64 column.
+func TestTotalArraySizeRecord(t *testing.T) {
+	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
+	defer alloc.AssertSize(t, 0)
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "a", Type: arrow.PrimitiveTypes.Int32},
+		{Name: "b", Type: arrow.PrimitiveTypes.Int64},
+	}, nil)
+	builder := array.NewRecordBuilder(alloc, schema)
+	defer builder.Release()
+	builder.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3}, nil)
+	builder.Field(1).(*array.Int64Builder).AppendValues([]int64{4, 5, 6}, nil)
+	rec := builder.NewRecord()
+	defer rec.Release()
+	assert.Equal(t, int64(44), util.TotalRecordSize(rec))
+}