You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Arrow User (Jira)" <ji...@apache.org> on 2022/03/18 15:18:00 UTC
[jira] [Created] (ARROW-15971) Error when reading inner lists within a struct in empty outer lists from C++/Python in Java
Arrow User created ARROW-15971:
----------------------------------
Summary: Error when reading inner lists within a struct in empty outer lists from C++/Python in Java
Key: ARROW-15971
URL: https://issues.apache.org/jira/browse/ARROW-15971
Project: Apache Arrow
Issue Type: Bug
Components: C++, Java, Python
Affects Versions: 7.0.0
Reporter: Arrow User
### Description
When using C++ (or Python) to construct a null or empty outer array of type `array_1: list<item: struct<array_sub_col: list<item: string>>>`, either:
- `array_1:null` or
- `array_1:[]`
an out-of-bounds exception (see stack trace below) is thrown when later retrieving the field reader for the inner list (`array_sub_col`) in Java.
### Reproduction
Java: `7.0.0`
C++: `4.0.0`
Python: `7.0.0`
Creating a stream in C++ of type `array_1: list<item: struct<array_sub_col: list<item: string>>>` with an empty (or null) outer list:
```
arrow::MemoryPool* pool = arrow::default_memory_pool();
arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer =
arrow::io::BufferOutputStream::Create(1, pool);
std::vector<std::shared_ptr<arrow::Field>> inner_list_field{std::make_shared<arrow::Field>(
"array_sub_col",
arrow::list(arrow::utf8()))};
// Datatype for the builder: list<struct<list<string>>>
std::shared_ptr<DataType> data_type = list(struct_(inner_list_field));
std::unique_ptr<arrow::ArrayBuilder> builder;
arrow::MakeBuilder(pool, data_type, &builder);
auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get());
// Append a null or an empty list to the outer list
list_builder->AppendNull(); // or list_builder->AppendEmptyValue()
std::vector<std::shared_ptr<arrow::Array>> value_batch;
value_batch.resize(1);
list_builder->Finish(&value_batch[0]);
std::vector<std::shared_ptr<arrow::Field>> outer_list_field{std::make_shared<arrow::Field>("array_1",
data_type)};
auto schema = std::make_shared<arrow::Schema>(outer_list_field);
// Build a single row record batch
std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, value_batch);
ASSERT_OK(batch->Validate());
// Stream the batch to a file
arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer =
arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, arrow::ipc::IpcWriteOptions::Defaults());
stream_writer.ValueOrDie()->WriteRecordBatch(*batch);
arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = stream_buffer.ValueOrDie()->Finish();
std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie();
auto file_output = arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie();
file_output->Write(buffer->data(), buffer->size());
file_output->Close();
```
As expected, the equivalent Python code produces the same memory layout for the field vectors as the C++ code above:
```
# Empty or null outer list
array = pa.array([None], type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))])))
batch = pa.record_batch([struct_array], names=["array_1"])
// Stream the batch to a file
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
writer.write_batch(batch)
buf = sink.getvalue()
with open('/tmp/batch_stream.out', 'wb') as f:
f.write(buf)
```
**Java then fails when trying to access the inner list's field reader:**
```
File file = new File("/tmp/batch_stream.out");
byte[] bytes = FileUtils.readFileToByteArray(file);
try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) {
Schema schema = reader.getVectorSchemaRoot().getSchema();
reader.loadNextBatch();
readBatch.getVector("array_1").getReader().reader().reader("array_sub_col"); // <- fails: reader("array_sub_col") fails with OOB
// Concrete readers:
// FieldVector array_1 = readBatch.getVector("array_1");
// UnionListReader array_1_reader = (UnionListReader) array_1.getReader();
// NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) array_1_reader.reader();
// FieldReader union_list_reader = struct_reader.reader("array_sub_col"); // <- fails: OOB
```
#### Stack trace:
```
java.lang.reflect.InvocationTargetException
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62)
at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke (Method.java:566)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
at java.lang.Thread.run (Thread.java:829)
Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 (expected: range(0, 4))
at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318)
at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305)
at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424)
at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433)
at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440)
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke (NativeMethodAccessorImpl.java:62)
at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke (DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke (Method.java:566)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
at java.lang.Thread.run (Thread.java:829)
```
--
This message was sent by Atlassian Jira
(v8.20.1#820001)