You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/05 22:26:48 UTC

[GitHub] [arrow] eerhardt commented on a change in pull request #8348: ARROW-6972: [C#] C# support for StructArrays

eerhardt commented on a change in pull request #8348:
URL: https://github.com/apache/arrow/pull/8348#discussion_r499882330



##########
File path: csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs
##########
@@ -73,7 +73,6 @@ public void StringArrayBuilderHandlesNullsAndEmptyStrings()
             Assert.Equal(string.Empty, stringArray.GetString(3));
         }
 
-

Review comment:
       Can you revert this change, since it is the only one in this file?

##########
File path: csharp/src/Apache.Arrow/Arrays/StructArray.cs
##########
@@ -21,13 +21,13 @@ namespace Apache.Arrow
 {
     public class StructArray : Array
     {
-        private readonly List<Array> _fields;
+        private readonly IEnumerable<IArrowArray> _fields;

Review comment:
       A couple of questions here:
   
   1. Why the move from `Array` to `IArrowArray`? I kind of think we should remove `IArrowArray`, since the interface doesn't provide any benefit over the abstract class. See https://issues.apache.org/jira/browse/ARROW-5546
   2. How does this `_fields` property ever contain anything? AFAICT - it will always be an empty List.

##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
##########
@@ -180,9 +193,15 @@ internal static ByteBuffer CreateByteBuffer(ReadOnlyMemory<byte> buffer)
         {
 
             ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
-            recordBatchEnumerator.MoveNextBuffer();
+            if (!recordBatchEnumerator.MoveNextBuffer())

Review comment:
       👍 

##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
##########
@@ -165,8 +165,21 @@ internal static ByteBuffer CreateByteBuffer(ReadOnlyMemory<byte> buffer)
                 throw new InvalidDataException("Null count length must be >= 0"); // TODO:Localize exception message
             }
 
-            ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, valueArrowBuffer };
-            ArrayData[] children = GetChildren(ref recordBatchEnumerator, field, bodyData);
+            ArrowBuffer[] arrowBuff = null;
+            ArrayData[] children = null;
+            if (field.DataType.TypeId == ArrowTypeId.Struct)
+            {
+                arrowBuff = new[] { nullArrowBuffer};
+                children = GetChildren(ref recordBatchEnumerator, field, bodyData);
+            }
+            else
+            {
+                ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer);
+                recordBatchEnumerator.MoveNextBuffer();
+
+                arrowBuff = new[] { nullArrowBuffer, valueArrowBuffer };
+                children = GetChildren(ref recordBatchEnumerator, field, bodyData);

Review comment:
       it looks like the `children` line can be refactored out of the `if`. It looks the same for both cases.

##########
File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
##########
@@ -57,16 +58,30 @@ internal static Schema GetSchema(Flatbuf.Schema schema)
             for (int i = 0; i < schema.FieldsLength; i++)
             {
                 Flatbuf.Field field = schema.Fields(i).GetValueOrDefault();
-
-                schemaBuilder.Field(
-                    new Field(field.Name, GetFieldArrowType(field), field.Nullable));
+                Field arrowField = FieldFromFlatbuffer(field);
+                schemaBuilder.Field(arrowField);
             }
 
             return schemaBuilder.Build();
         }
 
+        private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField)
+        {
+            System.Collections.Generic.List<Field> childFields = null;
+            if (flatbufField.ChildrenLength > 0)
+            {
+                childFields = new System.Collections.Generic.List<Field>();

Review comment:
       ```suggestion
                   childFields = new System.Collections.Generic.List<Field>(flatbufField.ChildrenLength);
   ```
   
   Since we know up front how many we will need.
   
   Actually, do we even need a `List` here? Could it simply be an array since we know how many we will need?

##########
File path: csharp/src/Apache.Arrow/Types/StructType.cs
##########
@@ -28,7 +28,7 @@ public sealed class StructType : ArrowType
 
         public IEnumerable<Field> Fields => _fields;

Review comment:
       Since we are deriving from `NestedType` now, why do we need the `Fields` property? `NestedType` already has a `Children` property, which will be the same thing.

##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
##########
@@ -165,8 +165,21 @@ internal static ByteBuffer CreateByteBuffer(ReadOnlyMemory<byte> buffer)
                 throw new InvalidDataException("Null count length must be >= 0"); // TODO:Localize exception message
             }
 
-            ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, valueArrowBuffer };
-            ArrayData[] children = GetChildren(ref recordBatchEnumerator, field, bodyData);
+            ArrowBuffer[] arrowBuff = null;

Review comment:
       No need to initialize these 2 variables to `null` since they will always be set in the `if-else`.
   
   1. It is a minor perf improvement.
   2. It allows the compiler to warn you in case you didn't set them.

##########
File path: csharp/test/Apache.Arrow.Tests/StructArrayTests.cs
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using Apache.Arrow.Types;
+using System.Collections.Generic;
+using Xunit;
+
+namespace Apache.Arrow.Tests
+{
+    public class StructArrayTests
+    {
+        [Fact]
+        public void TestStructArray()
+        {
+            // The following can be improved with a Builder class for StructArray.
+            List<Field> fields = new List<Field>();
+            Field.Builder fieldBuilder = new Field.Builder();
+            fields.Add(fieldBuilder.Name("Strings").DataType(StringType.Default).Nullable(true).Build());
+            fieldBuilder = new Field.Builder();
+            fields.Add(fieldBuilder.Name("Ints").DataType(Int32Type.Default).Nullable(true).Build());
+            StructType structType = new StructType(fields);
+
+            StringArray.Builder stringBuilder = new StringArray.Builder();
+            StringArray stringArray = stringBuilder.Append("joe").AppendNull().AppendNull().Append("mark").Build();
+            Int32Array.Builder intBuilder = new Int32Array.Builder();
+            Int32Array intArray = intBuilder.Append(1).Append(2).AppendNull().Append(4).Build();
+            List<Array> arrays = new List<Array>();
+            arrays.Add(stringArray);
+            arrays.Add(intArray);
+
+            ArrowBuffer.BitmapBuilder nullBitmap = new ArrowBuffer.BitmapBuilder();
+            var nullBitmapBuffer = nullBitmap.Append(true).Append(true).Append(false).Append(true).Build();
+            StructArray structs = new StructArray(structType, 4, arrays, nullBitmapBuffer, 1);
+
+            Assert.Equal(4, structs.Length);
+            Assert.Equal(1, structs.NullCount);
+            ArrayData[] childArrays = structs.Data.Children; // Data for StringArray and Int32Array
+            Assert.Equal(2, childArrays.Length);
+            for (int j = 0; j < childArrays.Length; j++)

Review comment:
       (nit) Why `j`? Why not `i`?

##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
##########
@@ -171,6 +172,13 @@ public void Visit(Time64Type type)
                     Flatbuf.Time.CreateTime(Builder, ToFlatBuffer(type.Unit), 64));
             }
 
+            public void Visit(StructType type)
+            {
+                Flatbuf.Struct_.StartStruct_(Builder);
+                FieldType result = FieldType.Build(Flatbuf.Type.Struct_, Flatbuf.Struct_.EndStruct_(Builder));

Review comment:
       (super nit) Do you need the `FieldType result` variable? Why not just assign `Result` directly, like the other methods do?

##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
##########
@@ -102,6 +103,31 @@ public void Visit(BinaryArray array)
                 _buffers.Add(CreateBuffer(array.ValueBuffer));
             }
 
+            private void Visit(ArrayData[] children)
+            {
+                for (int i = 0; i < children.Length; i++)
+                {
+                    Visit(ArrowArrayFactory.BuildArray(children[i]));

Review comment:
       It's a bit unfortunate that we need to allocate a new `Array` object here. But we can get this merged as functional, and tweak the performance later.

##########
File path: csharp/src/Apache.Arrow/Types/StructType.cs
##########
@@ -28,7 +28,7 @@ public sealed class StructType : ArrowType
 
         public IEnumerable<Field> Fields => _fields;

Review comment:
       Note that C++ standardized on the "Field" name: https://github.com/apache/arrow/pull/7132




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org