You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by wj...@apache.org on 2023/06/16 19:45:35 UTC

[arrow] branch main updated: GH-36120: [C#] Support schema metadata through the C API (#36122)

This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 14f2e4e386 GH-36120: [C#] Support schema metadata through the C API (#36122)
14f2e4e386 is described below

commit 14f2e4e386e691114083bc91fa4da6744e1d8944
Author: Curt Hagenlocher <cu...@hagenlocher.org>
AuthorDate: Fri Jun 16 12:45:29 2023 -0700

    GH-36120: [C#] Support schema metadata through the C API (#36122)
    
    ### What changes are included in this PR?
    
    Import and export of field- and schema-level metadata via the C API.
    
    ### Are these changes tested?
    
    Yes
    
    * Closes: #36120
    
    Authored-by: Curt Hagenlocher <cu...@hagenlocher.org>
    Signed-off-by: Will Jones <wi...@gmail.com>
---
 csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs  | 52 ++++++++++++++++++++--
 csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs  | 43 +++++++++++++++++-
 .../CDataInterfacePythonTests.cs                   | 16 ++++---
 3 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs
index fae2455560..fe47c9f7f0 100644
--- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs
+++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs
@@ -16,8 +16,10 @@
 
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.IO;
 using System.Runtime.InteropServices;
+using System.Text;
 using Apache.Arrow.Types;
 
 namespace Apache.Arrow.C
@@ -83,8 +85,7 @@ namespace Apache.Arrow.C
         {
             ExportType(field.DataType, schema);
             schema->name = StringUtil.ToCStringUtf8(field.Name);
-            // TODO: field metadata
-            schema->metadata = null;
+            schema->metadata = ConstructMetadata(field.Metadata);
             schema->flags = GetFlags(field.DataType, field.IsNullable);
         }
 
@@ -104,8 +105,8 @@ namespace Apache.Arrow.C
         public static unsafe void ExportSchema(Schema schema, CArrowSchema* out_schema)
         {
             var structType = new StructType(schema.FieldsList);
-            // TODO: top-level metadata
             ExportType(structType, out_schema);
+            out_schema->metadata = ConstructMetadata(schema.Metadata);
         }
 
         private static char FormatTimeUnit(TimeUnit unit) => unit switch
@@ -239,6 +240,51 @@ namespace Apache.Arrow.C
             }
         }
 
+        // Packs metadata into one unmanaged buffer: an int32 pair count, then for each pair a
+        // length-prefixed UTF-8 key and value. Returns null when metadata is null or empty.
+        private unsafe static byte* ConstructMetadata(IReadOnlyDictionary<string, string> metadata)
+        {
+            if (metadata == null || metadata.Count == 0)
+            {
+                return null;
+            }
+
+            // First pass: measure every string so the buffer is allocated exactly once.
+            int[] byteCounts = new int[metadata.Count * 2];
+            int totalBytes = sizeof(int);
+            int slot = 0;
+            foreach (KeyValuePair<string, string> entry in metadata)
+            {
+                int keyBytes = byteCounts[slot++] = Encoding.UTF8.GetByteCount(entry.Key);
+                int valueBytes = byteCounts[slot++] = Encoding.UTF8.GetByteCount(entry.Value);
+                totalBytes += 2 * sizeof(int) + keyBytes + valueBytes;
+            }
+
+            // Second pass: write the pair count, then each length-prefixed string in order.
+            IntPtr buffer = Marshal.AllocHGlobal(totalBytes);
+            Marshal.WriteInt32(buffer, metadata.Count);
+            byte* cursor = (byte*)buffer + sizeof(int);
+            slot = 0;
+            foreach (KeyValuePair<string, string> entry in metadata)
+            {
+                WriteMetadataString(ref cursor, byteCounts[slot++], entry.Key);
+                WriteMetadataString(ref cursor, byteCounts[slot++], entry.Value);
+            }
+            Debug.Assert((long)(IntPtr)cursor - (long)buffer == totalBytes);
+            return (byte*)buffer;
+        }
+
+        private unsafe static void WriteMetadataString(ref byte* ptr, int length, string str)
+        {
+            Marshal.WriteInt32((IntPtr)ptr, length); // int32 length prefix precedes the payload
+            ptr += sizeof(int);
+            fixed (char* chars = str) // pin the string so the pointer-based encoder can read it
+            {
+                Encoding.UTF8.GetBytes(chars, str.Length, ptr, length);
+            }
+            ptr += length; // step over the payload so the next string starts right after it
+        }
+
         private static unsafe void ReleaseCArrowSchema(CArrowSchema* schema)
         {
             if (schema == null) return;
diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs
index a454ae6ba1..89c9481270 100644
--- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs
+++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs
@@ -18,6 +18,8 @@ using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
 using Apache.Arrow.Types;
 
 namespace Apache.Arrow.C
@@ -281,7 +283,7 @@ namespace Apache.Arrow.C
 
                 bool nullable = _cSchema->GetFlag(CArrowSchema.ArrowFlagNullable);
 
-                return new Field(fieldName, GetAsType(), nullable);
+                return new Field(fieldName, GetAsType(), nullable, GetMetadata(_cSchema->metadata));
             }
 
             public Schema GetAsSchema()
@@ -289,13 +291,50 @@ namespace Apache.Arrow.C
                 ArrowType fullType = GetAsType();
                 if (fullType is StructType structType)
                 {
-                    return new Schema(structType.Fields, default);
+                    return new Schema(structType.Fields, GetMetadata(_cSchema->metadata));
                 }
                 else
                 {
                     throw new ArgumentException("Imported type is not a struct type, so it cannot be converted to a schema.");
                 }
             }
+
+            // Decodes a metadata buffer (int32 pair count, then length-prefixed UTF-8 key/value
+            // strings) into a dictionary; returns null for a null buffer or a count <= 0.
+            private unsafe static IReadOnlyDictionary<string, string> GetMetadata(byte* metadata)
+            {
+                int pairCount = metadata == null ? 0 : Marshal.ReadInt32((IntPtr)metadata);
+                if (pairCount <= 0)
+                {
+                    return null;
+                }
+
+                // The strings start right after the int32 pair count.
+                IntPtr cursor = (IntPtr)metadata + sizeof(int);
+                var pairs = new Dictionary<string, string>(pairCount);
+                for (int remaining = pairCount; remaining > 0; remaining--)
+                {
+                    // Read the key before the value: that is the order they appear in the buffer.
+                    string key = ReadMetadataString(ref cursor);
+                    pairs[key] = ReadMetadataString(ref cursor);
+                }
+
+                return pairs;
+            }
+
+            // Reads one int32-length-prefixed UTF-8 string at ptr and advances ptr past it.
+            private unsafe static string ReadMetadataString(ref IntPtr ptr)
+            {
+                int byteCount = Marshal.ReadInt32(ptr);
+                if (byteCount < 0)
+                {
+                    throw new InvalidOperationException("unexpected negative length for metadata string");
+                }
+                ptr += sizeof(int);
+                string text = Encoding.UTF8.GetString((byte*)ptr, byteCount);
+                ptr += byteCount;
+                return text;
+            }
         }
     }
 }
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
index 8172c4f420..084d7bfb01 100644
--- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
@@ -61,7 +61,7 @@ namespace Apache.Arrow.Tests
             using (Py.GIL())
             {
                 var schema = new Schema.Builder()
-                    .Field(f => f.Name("null").DataType(NullType.Default).Nullable(true))
+                    .Field(f => f.Name("null").DataType(NullType.Default).Nullable(true).Metadata("k0", "v0"))
                     .Field(f => f.Name("bool").DataType(BooleanType.Default).Nullable(true))
                     .Field(f => f.Name("i8").DataType(Int8Type.Default).Nullable(true))
                     .Field(f => f.Name("u8").DataType(UInt8Type.Default).Nullable(true))
@@ -72,7 +72,7 @@ namespace Apache.Arrow.Tests
                     .Field(f => f.Name("i64").DataType(Int64Type.Default).Nullable(true))
                     .Field(f => f.Name("u64").DataType(UInt64Type.Default).Nullable(true))
 
-                    .Field(f => f.Name("f16").DataType(HalfFloatType.Default).Nullable(true))
+                    .Field(f => f.Name("f16").DataType(HalfFloatType.Default).Nullable(true).Metadata("k1a", "").Metadata("k1b", "断箭"))
                     .Field(f => f.Name("f32").DataType(FloatType.Default).Nullable(true))
                     .Field(f => f.Name("f64").DataType(DoubleType.Default).Nullable(true))
 
@@ -105,6 +105,7 @@ namespace Apache.Arrow.Tests
                     // Checking wider characters.
                     .Field(f => f.Name("hello 你好 😄").DataType(BooleanType.Default).Nullable(true))
 
+                    .Metadata("k2a", "v2abc").Metadata("k2b", "v2abc").Metadata("k2c", "v2abc")
                     .Build();
                 return schema;
             }
@@ -114,8 +115,11 @@ namespace Apache.Arrow.Tests
         {
             using (Py.GIL())
             {
+                Dictionary<string, string> metadata0 = new Dictionary<string, string> { { "k0", "v0" } };
+                Dictionary<string, string> metadata1 = new Dictionary<string, string> { { "k1a", "" }, { "k1b", "断箭" } };
+
                 dynamic pa = Py.Import("pyarrow");
-                yield return pa.field("null", pa.GetAttr("null").Invoke(), true);
+                yield return pa.field("null", pa.GetAttr("null").Invoke(), true).with_metadata(metadata0);
                 yield return pa.field("bool", pa.bool_(), true);
                 yield return pa.field("i8", pa.int8(), true);
                 yield return pa.field("u8", pa.uint8(), true);
@@ -126,7 +130,7 @@ namespace Apache.Arrow.Tests
                 yield return pa.field("i64", pa.int64(), true);
                 yield return pa.field("u64", pa.uint64(), true);
 
-                yield return pa.field("f16", pa.float16(), true);
+                yield return pa.field("f16", pa.float16(), true).with_metadata(metadata1);
                 yield return pa.field("f32", pa.float32(), true);
                 yield return pa.field("f64", pa.float64(), true);
 
@@ -164,8 +168,10 @@ namespace Apache.Arrow.Tests
         {
             using (Py.GIL())
             {
+                Dictionary<string, string> metadata = new Dictionary<string, string> { { "k2a", "v2abc" }, { "k2b", "v2abc" }, { "k2c", "v2abc" } };
+
                 dynamic pa = Py.Import("pyarrow");
-                return pa.schema(GetPythonFields().ToList());
+                return pa.schema(GetPythonFields().ToList()).with_metadata(metadata);
             }
         }