You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ee...@apache.org on 2023/05/23 15:49:53 UTC

[arrow] branch main updated: GH-32605: [C#] Extend validity buffer api (#35342)

This is an automated email from the ASF dual-hosted git repository.

eerhardt pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f38943af09 GH-32605: [C#] Extend validity buffer api (#35342)
f38943af09 is described below

commit f38943af09c51d2e50c94f03de14f88cbaa07a3f
Author: Aleksei Smirnov <tl...@inbox.ru>
AuthorDate: Tue May 23 18:49:44 2023 +0300

    GH-32605: [C#] Extend validity buffer api (#35342)
    
    Add a method to the ValidityBuffer that adds the same bool value length times without allocating an Enumerable.Repeat object
    
    ### Rationale for this change
    
    See more details in the code review comments in https://github.com/apache/arrow/pull/13810
    
    * Closes: #32605
    
    Authored-by: Aleksei Smirnov <tl...@inbox.ru>
    Signed-off-by: Eric Erhardt <er...@microsoft.com>
---
 .../Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs   |   4 +-
 .../src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs  |  21 ++++
 csharp/src/Apache.Arrow/BitUtility.cs              | 109 ++++++++++++++++-----
 csharp/src/Apache.Arrow/Properties/AssembyInfo.cs  |   1 +
 .../test/Apache.Arrow.Tests/ArrayBuilderTests.cs   |   2 +-
 .../ArrowBufferBitmapBuilderTests.cs               |  43 ++++++--
 csharp/test/Apache.Arrow.Tests/BitUtilityTests.cs  |  98 +++++++++++++++++-
 7 files changed, 245 insertions(+), 33 deletions(-)

diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
index ec1786fdc6..a50d4b52c3 100644
--- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
+++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
@@ -142,7 +142,7 @@ namespace Apache.Arrow
             int len = ValueBuffer.Length;
             ValueBuffer.Append(span);
             int additionalBitsCount = ValueBuffer.Length - len;
-            ValidityBuffer.Reserve(additionalBitsCount).AppendRange(Enumerable.Repeat(true, additionalBitsCount));
+            ValidityBuffer.AppendRange(true, additionalBitsCount);
             return Instance;
         }
 
@@ -151,7 +151,7 @@ namespace Apache.Arrow
             int len = ValueBuffer.Length;
             ValueBuffer.AppendRange(values);
             var additionalBitsCount = ValueBuffer.Length - len;
-            ValidityBuffer.Reserve(additionalBitsCount).AppendRange(Enumerable.Repeat(true, additionalBitsCount));
+            ValidityBuffer.AppendRange(true, additionalBitsCount);
             return Instance;
         }
 
diff --git a/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs b/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs
index bdc68e37a9..410c22885a 100644
--- a/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs
+++ b/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs
@@ -138,6 +138,27 @@ namespace Apache.Arrow
                 return this;
             }
 
+            /// <summary>
+            /// Append multiple bits.
+            /// </summary>
+            /// <param name="value">Value of bits to append.</param>
+            /// <param name="length">Number of times the value should be added.</param>
+            /// <returns>Returns the builder (for fluent-style composition).</returns>
+            public BitmapBuilder AppendRange(bool value, int length)
+            {
+                if (length < 0)
+                    throw new ArgumentOutOfRangeException(nameof(length));
+
+                EnsureAdditionalCapacity(length);
+                Span<byte> span = Span;
+                BitUtility.SetBits(span, Length, length, value);
+
+                Length += length;
+                SetBitCount += value ? length : 0;
+
+                return this;
+            }
+
             /// <summary>
             /// Toggle the bit at a particular index.
             /// </summary>
diff --git a/csharp/src/Apache.Arrow/BitUtility.cs b/csharp/src/Apache.Arrow/BitUtility.cs
index 19417bbbe4..d9ad2e6da8 100644
--- a/csharp/src/Apache.Arrow/BitUtility.cs
+++ b/csharp/src/Apache.Arrow/BitUtility.cs
@@ -62,6 +62,67 @@ namespace Apache.Arrow
                 : (byte)(data[idx] & ~BitMask[mod]);
         }
 
+        /// <summary>
+        /// Set the number of bits in a span of bytes starting
+        /// at a specific index, and limiting to length.
+        /// </summary>
+        /// <param name="data">Span to set bits value.</param>
+        /// <param name="index">Bit index to start counting from.</param>
+        /// <param name="length">Maximum of bits in the span to consider.</param>
+        internal static void SetBits(Span<byte> data, int index, int length, bool value)
+        {
+            if (length == 0)
+                return;
+
+            int endBitIndex = checked(index + length - 1);
+                        
+            // Use simpler method if there aren't many values
+            if (length < 20)
+            {
+                for (int i = index; i <= endBitIndex; i++)
+                {
+                    SetBit(data, i, value);
+                }
+                return;
+            }
+            
+            // Otherwise do the work to figure out how to copy whole bytes
+            int startByteIndex = index / 8;
+            int startBitOffset = index % 8;
+            int endByteIndex = endBitIndex / 8;
+            int endBitOffset = endBitIndex % 8;
+                        
+            // If the starting index and ending index are not byte-aligned,
+            // we'll need to set bits the slow way. If they are
+            // byte-aligned, and for all other bytes in the 'middle', we
+            // can use a faster byte-aligned set.
+            int fullByteStartIndex = startBitOffset == 0 ? startByteIndex : startByteIndex + 1;
+            int fullByteEndIndex = endBitOffset == 7 ? endByteIndex : endByteIndex - 1;
+
+            // Bits we will be using to finish up the first byte
+            if (startBitOffset != 0)
+            {
+                Span<byte> slice = data.Slice(startByteIndex, 1);
+                for (int i = startBitOffset; i <= 7; i++)
+                    SetBit(slice, i, value);
+            }
+
+            if (fullByteEndIndex >= fullByteStartIndex)
+            {
+                Span<byte> slice = data.Slice(fullByteStartIndex, fullByteEndIndex - fullByteStartIndex + 1);
+                byte fill = (byte)(value ? 0xFF : 0x00);
+
+                slice.Fill(fill);
+            }
+
+            if (endBitOffset != 7)
+            {
+                Span<byte> slice = data.Slice(endByteIndex, 1);
+                for (int i = 0; i <= endBitOffset; i++)
+                    SetBit(slice, i, value);
+            }
+        }
+
         public static void ToggleBit(Span<byte> data, int index)
         {
             data[index / 8] ^= BitMask[index % 8];
@@ -69,29 +130,33 @@ namespace Apache.Arrow
 
         /// <summary>
         /// Counts the number of set bits in a span of bytes starting
-        /// at a specific bit offset.
+        /// at a specific bit index.
         /// </summary>
-        /// <param name="data">Span to count bits</param>
-        /// <param name="offset">Bit offset to start counting from</param>
-        /// <returns>Count of set (one) bits</returns>
-        public static int CountBits(ReadOnlySpan<byte> data, int offset) =>
-            CountBits(data, offset, data.Length * 8 - offset);
+        /// <param name="data">Span to count bits.</param>
+        /// <param name="index">Bit index to start counting from.</param>
+        /// <returns>Count of set (one) bits.</returns>
+        public static int CountBits(ReadOnlySpan<byte> data, int index) =>
+            CountBits(data, index, data.Length * 8 - index);
 
         /// <summary>
         /// Counts the number of set bits in a span of bytes starting
-        /// at a specific bit offset, and limiting to a certain number of bits
+        /// at a specific bit index, and limiting to a certain number of bits
         /// in the span.
         /// </summary>
         /// <param name="data">Span to count bits.</param>
-        /// <param name="offset">Bit offset to start counting from.</param>
+        /// <param name="index">Bit index to start counting from.</param>
         /// <param name="length">Maximum of bits in the span to consider.</param>
-        /// <returns>Count of set (one) bits</returns>
-        public static int CountBits(ReadOnlySpan<byte> data, int offset, int length)
+        /// <returns>Count of set (one) bits.</returns>
+        public static int CountBits(ReadOnlySpan<byte> data, int index, int length)
         {
-            int startByteIndex = offset / 8;
-            int startBitOffset = offset % 8;
-            int endByteIndex = (offset + length - 1) / 8;
-            int endBitOffset = (offset + length - 1) % 8;
+            int startByteIndex = index / 8;
+            int startBitOffset = index % 8;
+
+            int endBitIndex = index + length - 1;
+
+            int endByteIndex = endBitIndex / 8;
+            int endBitOffset = endBitIndex % 8;
+
             if (startBitOffset < 0)
                 return 0;
 
@@ -99,7 +164,7 @@ namespace Apache.Arrow
             if (startByteIndex == endByteIndex)
             {
                 // Range starts and ends within the same byte.
-                var slice = data.Slice(startByteIndex, 1);
+                ReadOnlySpan<byte> slice = data.Slice(startByteIndex, 1);
                 for (int i = startBitOffset; i <= endBitOffset; i++)
                     count += GetBit(slice, i) ? 1 : 0;
 
@@ -107,7 +172,7 @@ namespace Apache.Arrow
             }
 
             // If the starting index and ending index are not byte-aligned,
-            // we'll need to count bits the slow way.  If they are
+            // we'll need to count bits the slow way. If they are
             // byte-aligned, and for all other bytes in the 'middle', we
             // can use a faster byte-aligned count.
             int fullByteStartIndex = startBitOffset == 0 ? startByteIndex : startByteIndex + 1;
@@ -115,20 +180,20 @@ namespace Apache.Arrow
 
             if (startBitOffset != 0)
             {
-                var slice = data.Slice(startByteIndex, 1);
+                ReadOnlySpan<byte> slice = data.Slice(startByteIndex, 1);
                 for (int i = startBitOffset; i <= 7; i++)
                     count += GetBit(slice, i) ? 1 : 0;
             }
 
             if (fullByteEndIndex >= fullByteStartIndex)
             {
-                var slice = data.Slice(fullByteStartIndex, fullByteEndIndex - fullByteStartIndex + 1);
+                ReadOnlySpan<byte> slice = data.Slice(fullByteStartIndex, fullByteEndIndex - fullByteStartIndex + 1);
                 count += CountBits(slice);
             }
 
             if (endBitOffset != 7)
             {
-                var slice = data.Slice(endByteIndex, 1);
+                ReadOnlySpan<byte> slice = data.Slice(endByteIndex, 1);
                 for (int i = 0; i <= endBitOffset; i++)
                     count += GetBit(slice, i) ? 1 : 0;
             }
@@ -139,7 +204,7 @@ namespace Apache.Arrow
         /// <summary>
         /// Counts the number of set bits in a span of bytes.
         /// </summary>
-        /// <param name="data">Span to count bits</param>
+        /// <param name="data">Span to count bits.</param>
         /// <returns>Count of set (one) bits.</returns>
         public static int CountBits(ReadOnlySpan<byte> data)
         {
@@ -186,8 +251,8 @@ namespace Apache.Arrow
         /// <summary>
         /// Calculates the number of bytes required to store n bits.
         /// </summary>
-        /// <param name="n">number of bits</param>
-        /// <returns>number of bytes</returns>
+        /// <param name="n">Number of bits</param>
+        /// <returns>Number of bytes</returns>
         public static int ByteCount(int n)
         {
             Debug.Assert(n >= 0);
diff --git a/csharp/src/Apache.Arrow/Properties/AssembyInfo.cs b/csharp/src/Apache.Arrow/Properties/AssembyInfo.cs
index 415bf89af4..6b2a7aea39 100644
--- a/csharp/src/Apache.Arrow/Properties/AssembyInfo.cs
+++ b/csharp/src/Apache.Arrow/Properties/AssembyInfo.cs
@@ -16,3 +16,4 @@
 using System.Runtime.CompilerServices;
 
 [assembly: InternalsVisibleTo("Apache.Arrow.Flight, PublicKey=0024000004800000940000000602000000240000525341310004000001000100e504183f6d470d6b67b6d19212be3e1f598f70c246a120194bc38130101d0c1853e4a0f2232cb12e37a7a90e707aabd38511dac4f25fcb0d691b2aa265900bf42de7f70468fc997551a40e1e0679b605aa2088a4a69e07c117e988f5b1738c570ee66997fba02485e7856a49eca5fd0706d09899b8312577cbb9034599fc92d4")]
+[assembly: InternalsVisibleTo("Apache.Arrow.Tests, PublicKey=0024000004800000940000000602000000240000525341310004000001000100e504183f6d470d6b67b6d19212be3e1f598f70c246a120194bc38130101d0c1853e4a0f2232cb12e37a7a90e707aabd38511dac4f25fcb0d691b2aa265900bf42de7f70468fc997551a40e1e0679b605aa2088a4a69e07c117e988f5b1738c570ee66997fba02485e7856a49eca5fd0706d09899b8312577cbb9034599fc92d4")]
diff --git a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs
index 8c90f78072..9ac2f779a6 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrayBuilderTests.cs
@@ -160,7 +160,7 @@ namespace Apache.Arrow.Tests
         public void NestedListArrayBuilder()
         {
             var childListType = new ListType(Int64Type.Default);
-            var parentListBuilder = new ListArray.Builder(childListType);
+            var parentListBuilder = new ListArray.Builder((IArrowType)childListType);
             var childListBuilder = parentListBuilder.ValueBuilder as ListArray.Builder;
             Assert.NotNull(childListBuilder);
             var valueBuilder = childListBuilder.ValueBuilder as Int64Array.Builder;
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs
index 1e20446766..dec8274962 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs
@@ -83,7 +83,7 @@ namespace Apache.Arrow.Tests
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder();
                 int initialCapacity = builder.Capacity;
-                builder.AppendRange(Enumerable.Repeat(true, initialCapacity)); // Fill to capacity.
+                builder.AppendRange(true, initialCapacity); // Fill to capacity.
 
                 // Act
                 var actualReturnValue = builder.Append(true);
@@ -136,7 +136,7 @@ namespace Apache.Arrow.Tests
             {
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder();
-                builder.AppendRange(Enumerable.Repeat(true, 8));
+                builder.AppendRange(true, 8);
 
                 // Act
                 var actualReturnValue = builder.Append(new Span<byte>(bytesToAppend), validBits);
@@ -162,7 +162,7 @@ namespace Apache.Arrow.Tests
             {
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder();
-                builder.AppendRange(Enumerable.Repeat(true, 9));
+                builder.AppendRange(true, 9);
 
                 // Act
                 var actualReturnValue = builder.Append(new Span<byte>(bytesToAppend), validBits);
@@ -180,7 +180,7 @@ namespace Apache.Arrow.Tests
             {
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder();
-                builder.AppendRange(Enumerable.Repeat(true, 8));
+                builder.AppendRange(true, 8);
 
                 // Act
                 var actualReturnValue = builder.Append(Span<byte>.Empty, 8);
@@ -198,7 +198,7 @@ namespace Apache.Arrow.Tests
             {
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder();
-                builder.AppendRange(Enumerable.Repeat(true, 8));
+                builder.AppendRange(true, 8);
 
                 // Act
                 Assert.Throws<ArgumentException>(() => builder.Append(new byte[] { 0b0010111 }, 9));
@@ -213,7 +213,7 @@ namespace Apache.Arrow.Tests
             [InlineData(new bool[] {}, new[] { true, false }, 2, 1, 1)]
             [InlineData(new[] { true, false }, new bool[] {}, 2, 1, 1)]
             [InlineData(new[] { true, false }, new[] { true, false }, 4, 2, 2)]
-            public void IncreasesLength(
+            public void AppendingEnumerableIncreasesLength(
                 bool[] initialContents,
                 bool[] toAppend,
                 int expectedLength,
@@ -234,6 +234,35 @@ namespace Apache.Arrow.Tests
                 Assert.Equal(expectedSetBitCount, builder.SetBitCount);
                 Assert.Equal(expectedUnsetBitCount, builder.UnsetBitCount);
             }
+
+            [Theory]
+            [InlineData(new bool[] { }, true, 0, 0, 0, 0)]
+            [InlineData(new bool[] { }, true, 2, 2, 2, 0)]
+            [InlineData(new[] { true, false }, false, 0, 2, 1, 1)]
+            [InlineData(new[] { true, false }, false, 2, 4, 1, 3)]
+            [InlineData(new[] { true, false }, true, 2, 4, 3, 1)]
+            public void AppendingValueMultipleTimesIncreasesLength(
+                bool[] initialContents,
+                bool valueToAppend,
+                int numberOfTimes,
+                int expectedLength,
+                int expectedSetBitCount,
+                int expectedUnsetBitCount)
+            {
+                // Arrange
+                var builder = new ArrowBuffer.BitmapBuilder();
+                builder.AppendRange(initialContents);
+
+                // Act
+                var actualReturnValue = builder.AppendRange(valueToAppend, numberOfTimes);
+
+                // Assert
+                Assert.Equal(builder, actualReturnValue);
+                Assert.Equal(expectedLength, builder.Length);
+                Assert.True(builder.Capacity >= expectedLength);
+                Assert.Equal(expectedSetBitCount, builder.SetBitCount);
+                Assert.Equal(expectedUnsetBitCount, builder.UnsetBitCount);
+            }
         }
 
         public class Build
@@ -329,7 +358,7 @@ namespace Apache.Arrow.Tests
             {
                 // Arrange
                 var builder = new ArrowBuffer.BitmapBuilder(initialCapacity);
-                builder.AppendRange(Enumerable.Repeat(true, numBitsToAppend));
+                builder.AppendRange(true, numBitsToAppend);
 
                 // Act
                 var actualReturnValue = builder.Reserve(additionalCapacity);
diff --git a/csharp/test/Apache.Arrow.Tests/BitUtilityTests.cs b/csharp/test/Apache.Arrow.Tests/BitUtilityTests.cs
index 5e18716a06..528dca1bde 100644
--- a/csharp/test/Apache.Arrow.Tests/BitUtilityTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/BitUtilityTests.cs
@@ -58,7 +58,7 @@ namespace Apache.Arrow.Tests
                 Assert.Equal(expectedCount,
                     BitUtility.CountBits(data, offset));
             }
-
+                        
             [Theory]
             [InlineData(new byte[] { 0b11111111 }, 0, 8, 8)]
             [InlineData(new byte[] { 0b11111111 }, 0, 4, 4)]
@@ -127,6 +127,102 @@ namespace Apache.Arrow.Tests
                 BitUtility.SetBit(data, index);
                 Assert.Equal(expectedValue, data);
             }
+
+            [Theory]
+            [InlineData(new byte[] { 0b10000000 }, 0, true, new byte[] { 0b10000001 })]
+            [InlineData(new byte[] { 0b00000000 }, 2, true, new byte[] { 0b00000100 })]
+            [InlineData(new byte[] { 0b11111111 }, 2, false, new byte[] { 0b11111011 })]
+            [InlineData(new byte[] { 0b01111111 }, 7, true, new byte[] { 0b11111111 })]
+            [InlineData(new byte[] { 0b00000000, 0b00000000 }, 8, true, new byte[] { 0b00000000, 0b00000001 })]
+            [InlineData(new byte[] { 0b11111110, 0b11111110 }, 15, false, new byte[] { 0b11111110, 0b01111110 })]
+            public void SetsBitValueAtIndex(byte[] data, int index, bool value, byte[] expectedValue)
+            {
+                BitUtility.SetBit(data, index, value);
+                Assert.Equal(expectedValue, data);
+            }
+        }
+
+        public class SetBits
+        {
+            [Theory]
+            [InlineData(new byte[] { 0b00000000 }, 0, 0, true, new byte[] { 0b00000000 })]
+            [InlineData(new byte[] { 0b00000000 }, 0, 1, true, new byte[] { 0b00000001 })]
+            [InlineData(new byte[] { 0b00000000 }, 0, 8, true, new byte[] { 0b11111111 })]
+            [InlineData(new byte[] { 0b00000000 }, 2, 2, true, new byte[] { 0b00001100 })]
+            [InlineData(new byte[] { 0b00000000 }, 5, 3, true, new byte[] { 0b11100000 })]
+            [InlineData(new byte[] { 0b00000000, 0b00000000 }, 8, 1, true, new byte[] { 0b00000000, 0b00000001 })]
+            [InlineData(new byte[] { 0b00000000, 0b00000000 }, 15, 1, true, new byte[] { 0b00000000, 0b10000000 })]
+            [InlineData(new byte[] { 0b00000000, 0b00000000 }, 7, 2, true, new byte[] { 0b10000000, 0b00000001 })]
+            [InlineData(new byte[] { 0b11111111, 0b11111111 }, 7, 2, false, new byte[] { 0b01111111, 0b11111110 })]
+            public void SetsBitsInRangeOnSmallRanges(byte[] data, int index, int length, bool value, byte[] expectedValue)
+            {
+                BitUtility.SetBits(data, index, length, value);
+                Assert.Equal(expectedValue, data);
+            }
+                        
+            [Fact]
+            public void SetsBitsInRangeOnBigRanges()
+            {
+                //Arrange
+                //Allocate 64 bits
+                Span<byte> data = stackalloc byte[8];
+                data.Clear();
+
+                //Act
+                //Sets 56 bits in the middle
+                BitUtility.SetBits(data, 4, 56, true);
+
+                //Assert
+                //Check that 4 bits in the lower and upper boundaries are left untouched
+                Assert.Equal(0b11110000, data[0]);
+                Assert.Equal(0b00001111, data[7]);
+
+                //Check bits in the middle
+                for (int i = 1; i < 7; i++)
+                    Assert.Equal(0b11111111, data[i]);
+            }
+
+            [Fact]
+            public void SetsBitsInRangeOnBigRanges_LowerBoundaryCornerCase()
+            {
+                //Arrange
+                //Allocate 64 bits
+                Span<byte> data = stackalloc byte[8];
+                data.Clear();
+
+                //Act
+                //Sets all bits starting from 1
+                BitUtility.SetBits(data, 1, 63, true);
+
+                //Assert
+                //Check lower boundary
+                Assert.Equal(0b11111110, data[0]);
+
+                //Check other bits
+                for (int i = 1; i < 8; i++)
+                    Assert.Equal(0b11111111, data[i]);
+            }
+
+            [Fact]
+            public void SetsBitsInRangeOnBigRanges_UpperBoundaryCornerCase()
+            {
+                //Arrange
+                //Allocate 64 bits
+                Span<byte> data = stackalloc byte[8];
+                data.Clear();
+
+                //Act
+                //Sets all bits starting from 0
+                BitUtility.SetBits(data, 0, 63, true);
+
+                //Assert
+                //Check upper boundary
+                Assert.Equal(0b01111111, data[7]);
+
+                //Check other bits
+                for (int i = 0; i < 7; i++)
+                    Assert.Equal(0b11111111, data[i]);
+            }
         }
 
         public class ClearBit