You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "Chaokun Yang (JIRA)" <ji...@apache.org> on 2019/04/29 07:58:00 UTC
[jira] [Updated] (ARROW-5231) [Java] Arrow Java can't read union
vector from ArrowStreamReader written by its own bugs
[ https://issues.apache.org/jira/browse/ARROW-5231?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Chaokun Yang updated ARROW-5231:
--------------------------------
Description:
Similar to https://issues.apache.org/jira/browse/ARROW-5230, when I write union data using ArrowStreamWriter in Java, I can't read it back using ArrowStreamReader in Java. The exception is:
{quote}Exception in thread "main" java.lang.IllegalArgumentException: not all nodes and buffers were consumed. nodes: [ArrowFieldNode [length=100, nullCount=0]] buffers: [ArrowBuf[14], udle: [7 104..117], ArrowBuf[15], udle: [7 120..520]]
at org.apache.arrow.vector.VectorLoader.load(VectorLoader.java:64)
at org.apache.arrow.vector.ipc.ArrowReader.loadRecordBatch(ArrowReader.java:219)
at org.apache.arrow.vector.ipc.ArrowStreamReader.loadNextBatch(ArrowStreamReader.java:121)
{quote}
The code to reproduce this exception is:
{code:java}
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.complex.UnionVector;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.holders.NullableIntHolder;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
/**
 * Reproducer for ARROW-5231: writes two record batches containing a sparse union
 * column to an Arrow stream file, then tries to load the first batch back.
 * According to the report, the final loadNextBatch() call fails with
 * IllegalArgumentException ("not all nodes and buffers were consumed").
 */
public class UnionTest {
/**
 * Builds a one-column schema (nullable sparse union "f1" with one declared
 * nullable Int(32, true) child named "s1") and streams two 100-row batches to os.
 *
 * @param os destination stream; the writer created on top of it is closed before returning
 * @throws IOException declared by this method; presumably raised by the writer calls
 */
public static void writeUnionBatch(OutputStream os) throws IOException {
// The union's only type id is the ordinal of ArrowTypeID.Int.
// NOTE(review): confirm this is the id UnionVector uses for int values.
int[] typeIds = new int[]{ArrowType.ArrowTypeID.Int.ordinal()};
ArrowType.Union union = new ArrowType.Union(UnionMode.Sparse, typeIds);
// Single child field declared for the union in the schema.
List<Field> childList = Collections.singletonList(
new Field("s1", FieldType.nullable(new ArrowType.Int(32, true)), null)
);
Field field = new Field("f1", FieldType.nullable(union), childList);
List<Field> fields = Collections.singletonList(field);
Schema schema = new Schema(fields);
// NOTE(review): neither root nor this RootAllocator is closed in this method.
VectorSchemaRoot root = VectorSchemaRoot.create(schema, new RootAllocator(Integer.MAX_VALUE));
// No dictionaries are registered on the provider in this method.
DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, os);
writer.start();
// Write two batches; each iteration re-fills column 0 of root with 100 values.
for (int i = 0; i < 2; i++) {
root.setRowCount(100);
List<FieldVector> vectors = root.getFieldVectors();
UnionVector vector = (UnionVector) vectors.get(0);
fillVector(vector, 100);
// Print every non-null slot before the batch is written.
for (int j = 0; j < 100; j++) {
if (!vector.isNull(j)) {
System.out.println(vector.getObject(j));
}
}
writer.writeBatch();
}
writer.end();
writer.close();
}
/**
 * Allocates the vector for batchSize slots and stores the int value i at each
 * index i in [0, batchSize), then sets the value count to batchSize.
 *
 * @param vector union vector to (re)allocate and fill
 * @param batchSize number of slots to write
 */
private static void fillVector(UnionVector vector, int batchSize) {
vector.setInitialCapacity(batchSize);
vector.allocateNew();
for (int i = 0; i < batchSize; i++) {
NullableIntHolder intHolder = new NullableIntHolder();
// isSet = 1 presumably marks the holder as carrying a non-null value.
intHolder.isSet = 1;
intHolder.value = i;
vector.setSafe(i, intHolder);
}
vector.setValueCount(batchSize);
}
/**
 * Writes the stream to result/union.arrow, then reads the whole file into
 * memory and attempts to load the first batch from it.
 *
 * @param args unused
 * @throws IOException declared by this method; presumably raised by the file or stream operations
 */
public static void main(String[] args) throws IOException {
// NOTE(review): the "result" directory presumably has to exist already.
try(FileOutputStream fos = new FileOutputStream("result/union.arrow")) {
writeUnionBatch(fos);
System.out.println("write succeed");
fos.flush();
}
// NOTE(review): allocator and reader are never closed in this method.
RootAllocator allocator = new RootAllocator(1000000000);
ByteArrayInputStream in = new ByteArrayInputStream(Files.readAllBytes(Paths.get("result/union.arrow")));
ArrowStreamReader reader = new ArrowStreamReader(in, allocator);
// Per the reported stack trace, this is the call that throws.
reader.loadNextBatch();
}
}
{code}
It also can't read union data generated by Python, as reported in https://issues.apache.org/jira/browse/ARROW-1692.
It seems strange that Arrow Java can't read union data that it generated itself. Is there a format gap between how Arrow Java writes a UnionVector and how it reads one?
was:
Similar to https://issues.apache.org/jira/browse/ARROW-5230, when I write union data using ArrowStreamWriter in Java, I can't read it back using ArrowStreamReader in Java. The exception is:
{quote}Exception in thread "main" java.lang.IllegalArgumentException: not all nodes and buffers were consumed. nodes: [ArrowFieldNode [length=100, nullCount=0]] buffers: [ArrowBuf[14], udle: [7 104..117], ArrowBuf[15], udle: [7 120..520]]
at org.apache.arrow.vector.VectorLoader.load(VectorLoader.java:64)
at org.apache.arrow.vector.ipc.ArrowReader.loadRecordBatch(ArrowReader.java:219)
at org.apache.arrow.vector.ipc.ArrowStreamReader.loadNextBatch(ArrowStreamReader.java:121)
{quote}
The code to reproduce this exception is:
{code:java}
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.complex.UnionVector;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.holders.NullableIntHolder;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
/**
 * Earlier version of the ARROW-5231 reproducer: writes two record batches
 * containing a sparse union column to an Arrow stream file, then tries to load
 * the first batch back. In this version the union field is declared with no
 * child fields. According to the report, the final loadNextBatch() call fails
 * with IllegalArgumentException ("not all nodes and buffers were consumed").
 */
public class UnionTest {
/**
 * Builds a one-column schema (nullable sparse union "f1" declared with null
 * children) and streams two 100-row batches to os.
 *
 * @param os destination stream; the writer created on top of it is closed before returning
 * @throws IOException declared by this method; presumably raised by the writer calls
 */
public static void writeUnionBatch(OutputStream os) throws IOException {
// The union's only type id is the ordinal of ArrowTypeID.Int.
// NOTE(review): confirm this is the id UnionVector uses for int values.
int[] typeIds = new int[]{
ArrowType.ArrowTypeID.Int.ordinal()};
ArrowType.Union union = new ArrowType.Union(UnionMode.Sparse, typeIds);
// The children argument is null here: the schema declares no child fields for the union.
Field field = new Field("f1", FieldType.nullable(union), null);
List<Field> fields = Collections.singletonList(field);
Schema schema = new Schema(fields);
// NOTE(review): neither root nor this RootAllocator is closed in this method.
VectorSchemaRoot root = VectorSchemaRoot.create(schema, new RootAllocator(Integer.MAX_VALUE));
// No dictionaries are registered on the provider in this method.
DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, os);
writer.start();
// Write two batches; each iteration re-fills column 0 of root with 100 values.
for (int i = 0; i < 2; i++) {
root.setRowCount(100);
List<FieldVector> vectors = root.getFieldVectors();
UnionVector vector = (UnionVector) vectors.get(0);
fillVector(vector, 100);
// Print every non-null slot before the batch is written.
for (int j = 0; j < 100; j++) {
if (!vector.isNull(j)) {
System.out.println(vector.getObject(j));
}
}
writer.writeBatch();
}
writer.end();
writer.close();
}
/**
 * Allocates the vector for batchSize slots and stores the int value i at each
 * index i in [0, batchSize), then sets the value count to batchSize.
 *
 * @param vector union vector to (re)allocate and fill
 * @param batchSize number of slots to write
 */
private static void fillVector(UnionVector vector, int batchSize) {
vector.setInitialCapacity(batchSize);
vector.allocateNew();
for (int i = 0; i < batchSize; i++) {
NullableIntHolder intHolder = new NullableIntHolder();
// isSet = 1 presumably marks the holder as carrying a non-null value.
intHolder.isSet = 1;
intHolder.value = i;
vector.setSafe(i, intHolder);
}
vector.setValueCount(batchSize);
}
/**
 * Writes the stream to result/union.arrow, then reads the whole file into
 * memory and attempts to load the first batch from it.
 *
 * @param args unused
 * @throws IOException declared by this method; presumably raised by the file or stream operations
 */
public static void main(String[] args) throws IOException {
// NOTE(review): the "result" directory presumably has to exist already.
try(FileOutputStream fos = new FileOutputStream("result/union.arrow")) {
writeUnionBatch(fos);
System.out.println("write succeed");
fos.flush();
}
// NOTE(review): allocator and reader are never closed in this method.
RootAllocator allocator = new RootAllocator(1000000000);
ByteArrayInputStream in = new ByteArrayInputStream(Files.readAllBytes(Paths.get("result/union.arrow")));
ArrowStreamReader reader = new ArrowStreamReader(in, allocator);
// Per the reported stack trace, this is the call that throws.
reader.loadNextBatch();
}
}
{code}
It also can't read union data generated by Python, as reported in https://issues.apache.org/jira/browse/ARROW-1692.
It seems strange that Arrow Java can't read union data that it generated itself. Is there a format gap between how Arrow Java writes a UnionVector and how it reads one?
> [Java] Arrow Java can't read union vector from ArrowStreamReader written by its own bugs
> -----------------------------------------------------------------------------------------
>
> Key: ARROW-5231
> URL: https://issues.apache.org/jira/browse/ARROW-5231
> Project: Apache Arrow
> Issue Type: Bug
> Components: Java
> Environment: Mac OS 10.13.6, Arrow 0.13.0, JDK8
> Reporter: Chaokun Yang
> Priority: Major
>
> Similar to https://issues.apache.org/jira/browse/ARROW-5230, when I write union data using ArrowStreamWriter in java, I can't read it back using ArrowStreamReader in java. The exception is:
> {quote}Exception in thread "main" java.lang.IllegalArgumentException: not all nodes and buffers were consumed. nodes: [ArrowFieldNode [length=100, nullCount=0]] buffers: [ArrowBuf[14], udle: [7 104..117], ArrowBuf[15], udle: [7 120..520]]
> at org.apache.arrow.vector.VectorLoader.load(VectorLoader.java:64)
> at org.apache.arrow.vector.ipc.ArrowReader.loadRecordBatch(ArrowReader.java:219)
> at org.apache.arrow.vector.ipc.ArrowStreamReader.loadNextBatch(ArrowStreamReader.java:121)
> {quote}
> The code to reproduce this exception is:
>
> {code:java}
> import org.apache.arrow.memory.RootAllocator;
> import org.apache.arrow.vector.FieldVector;
> import org.apache.arrow.vector.VectorSchemaRoot;
> import org.apache.arrow.vector.complex.UnionVector;
> import org.apache.arrow.vector.dictionary.DictionaryProvider;
> import org.apache.arrow.vector.holders.NullableIntHolder;
> import org.apache.arrow.vector.ipc.ArrowStreamReader;
> import org.apache.arrow.vector.ipc.ArrowStreamWriter;
> import org.apache.arrow.vector.types.UnionMode;
> import org.apache.arrow.vector.types.pojo.ArrowType;
> import org.apache.arrow.vector.types.pojo.Field;
> import org.apache.arrow.vector.types.pojo.FieldType;
> import org.apache.arrow.vector.types.pojo.Schema;
> import java.io.ByteArrayInputStream;
> import java.io.FileOutputStream;
> import java.io.IOException;
> import java.io.OutputStream;
> import java.nio.file.Files;
> import java.nio.file.Paths;
> import java.util.Collections;
> import java.util.List;
> public class UnionTest {
> public static void writeUnionBatch(OutputStream os) throws IOException {
> int[] typeIds = new int[]{ArrowType.ArrowTypeID.Int.ordinal()};
> ArrowType.Union union = new ArrowType.Union(UnionMode.Sparse, typeIds);
> List<Field> childList = Collections.singletonList(
> new Field("s1", FieldType.nullable(new ArrowType.Int(32, true)), null)
> );
> Field field = new Field("f1", FieldType.nullable(union), childList);
> List<Field> fields = Collections.singletonList(field);
> Schema schema = new Schema(fields);
> VectorSchemaRoot root = VectorSchemaRoot.create(schema, new RootAllocator(Integer.MAX_VALUE));
> DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
> ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, os);
> writer.start();
> for (int i = 0; i < 2; i++) {
> root.setRowCount(100);
> List<FieldVector> vectors = root.getFieldVectors();
> UnionVector vector = (UnionVector) vectors.get(0);
> fillVector(vector, 100);
> for (int j = 0; j < 100; j++) {
> if (!vector.isNull(j)) {
> System.out.println(vector.getObject(j));
> }
> }
> writer.writeBatch();
> }
> writer.end();
> writer.close();
> }
> private static void fillVector(UnionVector vector, int batchSize) {
> vector.setInitialCapacity(batchSize);
> vector.allocateNew();
> for (int i = 0; i < batchSize; i++) {
> NullableIntHolder intHolder = new NullableIntHolder();
> intHolder.isSet = 1;
> intHolder.value = i;
> vector.setSafe(i, intHolder);
> }
> vector.setValueCount(batchSize);
> }
> public static void main(String[] args) throws IOException {
> try(FileOutputStream fos = new FileOutputStream("result/union.arrow")) {
> writeUnionBatch(fos);
> System.out.println("write succeed");
> fos.flush();
> }
> RootAllocator allocator = new RootAllocator(1000000000);
> ByteArrayInputStream in = new ByteArrayInputStream(Files.readAllBytes(Paths.get("result/union.arrow")));
> ArrowStreamReader reader = new ArrowStreamReader(in, allocator);
> reader.loadNextBatch();
> }
> }
> {code}
> And it can't read union data generated by python, as is reported in https://issues.apache.org/jira/browse/ARROW-1692.
> It seems strange arrow java can't read union data generated by its own. Is there any format gap between arrow java UnionVector write and read?
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)