You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "lidavidm (via GitHub)" <gi...@apache.org> on 2023/02/08 20:06:28 UTC

[GitHub] [arrow] lidavidm commented on a diff in pull request #34079: GH-34077: [Go] Implement RunEndEncoded Scalar

lidavidm commented on code in PR #34079:
URL: https://github.com/apache/arrow/pull/34079#discussion_r1100617379


##########
go/arrow/scalar/nested.go:
##########
@@ -744,3 +745,72 @@ func (s *DenseUnion) ChildValue() Scalar { return s.Value }
 func NewDenseUnionScalar(v Scalar, code arrow.UnionTypeCode, dt *arrow.DenseUnionType) *DenseUnion {
 	return &DenseUnion{scalar: scalar{dt, v.IsValid()}, TypeCode: code, Value: v}
 }
+
+type RunEndEncoded struct {
+	scalar
+
+	RunLength int64
+	Value     Scalar
+}
+
+func NewRunEndEncodedScalar(v Scalar, runLength int64, dt *arrow.RunEndEncodedType) *RunEndEncoded {
+	return &RunEndEncoded{scalar: scalar{dt, true}, RunLength: runLength, Value: v}
+}
+
+func (s *RunEndEncoded) Release() {
+	if r, ok := s.Value.(Releasable); ok {
+		r.Release()
+	}
+}
+
+func (s *RunEndEncoded) value() interface{} { return s.Value.value() }
+
+func (s *RunEndEncoded) Validate() (err error) {
+	if err = s.Value.Validate(); err != nil {
+		return
+	}
+
+	if err = validateOptional(&s.scalar, s.value(), "value"); err != nil {
+		return
+	}
+
+	if !s.Valid {
+		return
+	}
+
+	if s.Type.ID() != arrow.RUN_END_ENCODED {
+		err = fmt.Errorf("%w: run-end-encoded scalar should not have type %s",
+			arrow.ErrInvalid, s.Type)
+	}
+	return
+}
+
+func (s *RunEndEncoded) ValidateFull() error { return s.Validate() }
+
+func (s *RunEndEncoded) equals(rhs Scalar) bool {
+	other := rhs.(*RunEndEncoded)
+	return s.RunLength == other.RunLength && Equals(s.Value, other.Value)
+}
+
+func (s *RunEndEncoded) String() string {
+	return fmt.Sprintf("{run_length=%d, value=%v}",
+		s.RunLength, s.Value)
+}
+
+func (s *RunEndEncoded) CastTo(to arrow.DataType) (Scalar, error) {
+	if !s.Valid {

Review Comment:
   that said it's at least consistent



##########
go/arrow/scalar/nested.go:
##########
@@ -744,3 +745,72 @@ func (s *DenseUnion) ChildValue() Scalar { return s.Value }
 func NewDenseUnionScalar(v Scalar, code arrow.UnionTypeCode, dt *arrow.DenseUnionType) *DenseUnion {
 	return &DenseUnion{scalar: scalar{dt, v.IsValid()}, TypeCode: code, Value: v}
 }
+
+type RunEndEncoded struct {
+	scalar
+
+	RunLength int64
+	Value     Scalar
+}
+
+func NewRunEndEncodedScalar(v Scalar, runLength int64, dt *arrow.RunEndEncodedType) *RunEndEncoded {
+	return &RunEndEncoded{scalar: scalar{dt, true}, RunLength: runLength, Value: v}
+}
+
+func (s *RunEndEncoded) Release() {
+	if r, ok := s.Value.(Releasable); ok {
+		r.Release()
+	}
+}
+
+func (s *RunEndEncoded) value() interface{} { return s.Value.value() }
+
+func (s *RunEndEncoded) Validate() (err error) {
+	if err = s.Value.Validate(); err != nil {
+		return
+	}
+
+	if err = validateOptional(&s.scalar, s.value(), "value"); err != nil {
+		return
+	}
+
+	if !s.Valid {
+		return
+	}
+
+	if s.Type.ID() != arrow.RUN_END_ENCODED {
+		err = fmt.Errorf("%w: run-end-encoded scalar should not have type %s",
+			arrow.ErrInvalid, s.Type)
+	}
+	return
+}
+
+func (s *RunEndEncoded) ValidateFull() error { return s.Validate() }
+
+func (s *RunEndEncoded) equals(rhs Scalar) bool {
+	other := rhs.(*RunEndEncoded)
+	return s.RunLength == other.RunLength && Equals(s.Value, other.Value)
+}
+
+func (s *RunEndEncoded) String() string {
+	return fmt.Sprintf("{run_length=%d, value=%v}",
+		s.RunLength, s.Value)
+}
+
+func (s *RunEndEncoded) CastTo(to arrow.DataType) (Scalar, error) {
+	if !s.Valid {

Review Comment:
   semantically, it feels weird to be able to cast a null scalar to any type



##########
go/arrow/scalar/scalar.go:
##########
@@ -664,6 +671,28 @@ func GetScalar(arr arrow.Array, idx int) (Scalar, error) {
 		return NewTime64Scalar(arr.Value(idx), arr.DataType()), nil
 	case *array.Timestamp:
 		return NewTimestampScalar(arr.Value(idx), arr.DataType()), nil
+	case *array.RunEndEncoded:
+		physicalIndex := encoded.FindPhysicalIndex(arr.Data(), arr.Offset()+idx)
+		value, err := GetScalar(arr.Values(), physicalIndex)
+		if err != nil {
+			return nil, err
+		}
+
+		var runLength int64
+
+		runEnds := arr.RunEndsArr()
+		switch ends := runEnds.(type) {
+		case *array.Int16:
+			runLength = int64(ends.Value(physicalIndex))
+		case *array.Int32:
+			runLength = int64(ends.Value(physicalIndex))
+		case *array.Int64:
+			runLength = int64(ends.Value(physicalIndex))
+		}

Review Comment:
   I suppose at least here, we do adjust the runLength for the logical offset



##########
go/arrow/scalar/scalar_test.go:
##########
@@ -1416,3 +1427,56 @@ func TestUnionScalars(t *testing.T) {
 	suite.Run(t, new(SparseUnionSuite))
 	suite.Run(t, new(DenseUnionSuite))
 }
+
+func TestRunEndEncodedGetScalar(t *testing.T) {
+	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+	defer mem.AssertSize(t, 0)
+
+	runEnds, _, _ := array.FromJSON(mem, arrow.PrimitiveTypes.Int32, strings.NewReader(`[100, 200, 300, 400, 500]`))
+	defer runEnds.Release()
+
+	values, _, _ := array.FromJSON(mem, arrow.BinaryTypes.String, strings.NewReader(`["Hello", "beautiful", "world", "of", "RLE"]`))
+	defer values.Release()
+
+	reeArray := array.NewRunEndEncodedArray(runEnds, values, 500, 0)
+	defer reeArray.Release()

Review Comment:
   Is there a plan to implement FromJSON for REE? (Possibly using an analogous encoding to unions like `[run_length, value]`?)



##########
go/arrow/scalar/scalar.go:
##########
@@ -664,6 +671,28 @@ func GetScalar(arr arrow.Array, idx int) (Scalar, error) {
 		return NewTime64Scalar(arr.Value(idx), arr.DataType()), nil
 	case *array.Timestamp:
 		return NewTimestampScalar(arr.Value(idx), arr.DataType()), nil
+	case *array.RunEndEncoded:
+		physicalIndex := encoded.FindPhysicalIndex(arr.Data(), arr.Offset()+idx)
+		value, err := GetScalar(arr.Values(), physicalIndex)
+		if err != nil {
+			return nil, err
+		}
+
+		var runLength int64
+
+		runEnds := arr.RunEndsArr()
+		switch ends := runEnds.(type) {
+		case *array.Int16:
+			runLength = int64(ends.Value(physicalIndex))
+		case *array.Int32:
+			runLength = int64(ends.Value(physicalIndex))
+		case *array.Int64:
+			runLength = int64(ends.Value(physicalIndex))
+		}

Review Comment:
   Apologies if this discussion was already hashed out, but does it make sense to have a run-end-encoded scalar? Wouldn't 'scalar at logical index N' just be the scalar of the underlying type?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org