You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/09/07 19:03:38 UTC

[arrow] branch main updated: GH-37056: [Java] Fix importing an empty data array from c-data (#37531)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 5a781693d9 GH-37056: [Java] Fix importing an empty data array from c-data (#37531)
5a781693d9 is described below

commit 5a781693d953da8784edaf971f6124528398b4a2
Author: Jacek Stania <38...@users.noreply.github.com>
AuthorDate: Thu Sep 7 20:03:32 2023 +0100

    GH-37056: [Java] Fix importing an empty data array from c-data (#37531)
    
    
    ### Rationale for this change
    
    Fix java lib bug that prevents from importing specific data via c-data interface.
    Currently an attempt to load a vector with empty data buffer results in an IllegalStateException error.
    
    ### What changes are included in this PR?
    Updated BufferImportTypeVisitor to correctly handle a situation when underlying c-data array pointer is NULL (0) and the expected length of data is zero (0).
    
    ### Are these changes tested?
    Yes, updated the existing unit tests
    
    ### Are there any user-facing changes?
    No
    
    * Closes: #37056
    
    Lead-authored-by: Jacek Stania <ja...@gmail.com>
    Co-authored-by: Jacek Stania <38...@users.noreply.github.com>
    Co-authored-by: David Li <li...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 .../apache/arrow/c/BufferImportTypeVisitor.java    | 43 +++++++++++-----------
 .../org/apache/arrow/c/ArrowArrayUtilityTest.java  | 30 ++++++++++++---
 .../java/org/apache/arrow/c/RoundtripTest.java     |  8 ++++
 java/c/src/test/python/integration_tests.py        | 27 +++++++++++++-
 4 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java
index c8b6d07086..7408bf7113 100644
--- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java
+++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java
@@ -80,45 +80,44 @@ class BufferImportTypeVisitor implements ArrowType.ArrowTypeVisitor<List<ArrowBu
   }
 
   @VisibleForTesting
-  long getBufferPtr(ArrowType type, int index) {
+  ArrowBuf importBuffer(ArrowType type, int index, long capacity) {
     checkState(
-        buffers.length > index,
-        "Expected at least %s buffers for type %s, but found %s", index + 1, type, buffers.length);
-    if (buffers[index] == NULL) {
-      throw new IllegalStateException(String.format("Buffer %s for type %s cannot be null", index, type));
+            buffers.length > index,
+            "Expected at least %s buffers for type %s, but found %s", index + 1, type, buffers.length);
+    long bufferPtr = buffers[index];
+
+    if (bufferPtr == NULL) {
+      // C array may be NULL but only accept that if expected capacity is zero too
+      if (capacity != 0) {
+        throw new IllegalStateException(String.format("Buffer %s for type %s cannot be null", index, type));
+      } else {
+        // no data in the C array, return an empty buffer
+        return allocator.getEmpty();
+      }
     }
-    return buffers[index];
+
+    ArrowBuf buf = underlyingAllocation.unsafeAssociateAllocation(allocator, capacity, bufferPtr);
+    imported.add(buf);
+    return buf;
   }
 
   private ArrowBuf importFixedBits(ArrowType type, int index, long bitsPerSlot) {
-    final long bufferPtr = getBufferPtr(type, index);
     final long capacity = DataSizeRoundingUtil.divideBy8Ceil(bitsPerSlot * fieldNode.getLength());
-    ArrowBuf buf = underlyingAllocation.unsafeAssociateAllocation(allocator, capacity, bufferPtr);
-    this.imported.add(buf);
-    return buf;
+    return importBuffer(type, index, capacity);
   }
 
   private ArrowBuf importFixedBytes(ArrowType type, int index, long bytesPerSlot) {
-    final long bufferPtr = getBufferPtr(type, index);
     final long capacity = bytesPerSlot * fieldNode.getLength();
-    ArrowBuf buf = underlyingAllocation.unsafeAssociateAllocation(allocator, capacity, bufferPtr);
-    this.imported.add(buf);
-    return buf;
+    return importBuffer(type, index, capacity);
   }
 
   private ArrowBuf importOffsets(ArrowType type, long bytesPerSlot) {
-    final long bufferPtr = getBufferPtr(type, 1);
     final long capacity = bytesPerSlot * (fieldNode.getLength() + 1);
-    ArrowBuf buf = underlyingAllocation.unsafeAssociateAllocation(allocator, capacity, bufferPtr);
-    this.imported.add(buf);
-    return buf;
+    return importBuffer(type, 1, capacity);
   }
 
   private ArrowBuf importData(ArrowType type, long capacity) {
-    final long bufferPtr = getBufferPtr(type, 2);
-    ArrowBuf buf = underlyingAllocation.unsafeAssociateAllocation(allocator, capacity, bufferPtr);
-    this.imported.add(buf);
-    return buf;
+    return importBuffer(type, 2, capacity);
   }
 
   private ArrowBuf maybeImportBitmap(ArrowType type) {
diff --git a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java
index 2d31089ca7..4a0a966ba1 100644
--- a/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java
+++ b/java/c/src/test/java/org/apache/arrow/c/ArrowArrayUtilityTest.java
@@ -56,19 +56,39 @@ class ArrowArrayUtilityTest {
   // BufferImportTypeVisitor
 
   @Test
-  void getBufferPtr() throws Exception {
+  void importBuffer() throws Exception {
     // Note values are all dummy values here
-    try (BufferImportTypeVisitor visitor =
-        new BufferImportTypeVisitor(allocator, dummyHandle, new ArrowFieldNode(0, 0), new long[]{0})) {
+    try (BufferImportTypeVisitor notEmptyDataVisitor =
+        new BufferImportTypeVisitor(allocator, dummyHandle, new ArrowFieldNode(/* length= */ 1, 0), new long[]{0})) {
 
       // Too few buffers
-      assertThrows(IllegalStateException.class, () -> visitor.getBufferPtr(new ArrowType.Bool(), 1));
+      assertThrows(IllegalStateException.class, () -> notEmptyDataVisitor.importBuffer(new ArrowType.Bool(), 1, 1));
 
       // Null where one isn't expected
-      assertThrows(IllegalStateException.class, () -> visitor.getBufferPtr(new ArrowType.Bool(), 0));
+      assertThrows(IllegalStateException.class, () -> notEmptyDataVisitor.importBuffer(new ArrowType.Bool(), 0, 1));
+
+      // Expected capacity not zero but c array ptr is NULL (zero)
+      assertThrows(IllegalStateException.class, () -> notEmptyDataVisitor.importBuffer(new ArrowType.Bool(), 0, 1));
+
+      // Expected capacity is zero and c array ptr is NULL (zero)
+      assertThat(notEmptyDataVisitor.importBuffer(new ArrowType.Bool(), 0, 0)).isEqualTo(allocator.getEmpty());
+    }
+
+    try (BufferImportTypeVisitor emptyDataVisitor =
+        new BufferImportTypeVisitor(allocator, dummyHandle, new ArrowFieldNode(/* length= */ 0, 0), new long[]{0})) {
+
+      // Too few buffers
+      assertThrows(IllegalStateException.class, () -> emptyDataVisitor.importBuffer(new ArrowType.Bool(), 1, 1));
+
+      // Expected capacity not zero but c array ptr is NULL (zero)
+      assertThrows(IllegalStateException.class, () -> emptyDataVisitor.importBuffer(new ArrowType.Bool(), 0, 1));
+
+      // Expected capacity is zero and c array ptr is NULL (zero)
+      assertThat(emptyDataVisitor.importBuffer(new ArrowType.Bool(), 0, 0)).isEqualTo(allocator.getEmpty());
     }
   }
 
+
   @Test
   void cleanupAfterFailure() throws Exception {
     // Note values are all dummy values here
diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java
index fc73df449b..c1ceb93cb4 100644
--- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java
+++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java
@@ -534,6 +534,14 @@ public class RoundtripTest {
     }
   }
 
+  @Test
+  public void testEmptyListVector() {
+    try (final ListVector vector = ListVector.empty("v", allocator)) {
+      setVector(vector, new ArrayList<Integer>());
+      assertTrue(roundtrip(vector, ListVector.class));
+    }
+  }
+
   @Test
   public void testLargeListVector() {
     try (final LargeListVector vector = LargeListVector.empty("v", allocator)) {
diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py
index c23b4b9b44..eb8bb8c4b2 100644
--- a/java/c/src/test/python/integration_tests.py
+++ b/java/c/src/test/python/integration_tests.py
@@ -24,6 +24,7 @@ import xml.etree.ElementTree as ET
 
 import jpype
 import pyarrow as pa
+import pyarrow.ipc as ipc
 from pyarrow.cffi import ffi
 
 
@@ -119,6 +120,10 @@ class Bridge:
 
 
 class TestPythonIntegration(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        setup_jvm()
+
     def setUp(self):
         gc.collect()
         self.old_allocated_python = pa.total_allocated_bytes()
@@ -195,7 +200,26 @@ class TestPythonIntegration(unittest.TestCase):
             # disabled check_metadata since the list internal field name ("item")
             # is not preserved during round trips (it becomes "$data$").
         ), check_metadata=False)
-        
+
+    def test_empty_list_array(self):
+        """Validates GH-37056 fix.
+        Empty list of int32 produces a vector with empty child data buffer, however with non-zero capacity.
+        Using streaming forces the c-data array which represent the child data buffer to be NULL (pointer is 0).
+        On Java side, an attempt to import such array triggered an exception described in GH-37056.
+        """
+        with pa.BufferOutputStream() as bos:
+            schema = pa.schema([pa.field("f0", pa.list_(pa.int32()), True)])
+            with ipc.new_stream(bos, schema) as writer:
+                src = pa.RecordBatch.from_arrays([pa.array([[]])], schema=schema)
+                writer.write(src)
+        data_bytes = bos.getvalue()
+
+        def recreate_batch():
+            with pa.input_stream(data_bytes) as ios:
+                with ipc.open_stream(ios) as reader:
+                    return reader.read_next_batch()
+
+        self.round_trip_record_batch(recreate_batch)
 
     def test_struct_array(self):
         fields = [
@@ -274,5 +298,4 @@ class TestPythonIntegration(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    setup_jvm()
     unittest.main(verbosity=2)