You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/02/21 14:50:26 UTC
[arrow-cookbook] branch main updated: [Java] Java cookbook for create arrow read/write IPC format (#136)
This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new 2ca9851 [Java] Java cookbook for create arrow read/write IPC format (#136)
2ca9851 is described below
commit 2ca98511f1a4f4570592ed76db4d337f8cd3e39d
Author: david dali susanibar arce <da...@gmail.com>
AuthorDate: Mon Feb 21 09:50:20 2022 -0500
[Java] Java cookbook for create arrow read/write IPC format (#136)
* Adding java cookbook for creating arrow read/write IPC format
* Adding java cookbook for creating arrow read/write IPC format
* Update java/source/io.rst
Co-authored-by: David Li <li...@gmail.com>
* Update java/source/io.rst
Co-authored-by: David Li <li...@gmail.com>
* Solving pr comments
* Solving pr comments
* Solving pr comments
* Arrow java io cookbook
* Solving pr comments
* Solving pr comments
* Solving pr comments
* Solving pr comments
* Solving pr comments
Co-authored-by: David Li <li...@gmail.com>
---
java/source/conf.py | 3 +-
java/source/index.rst | 1 +
java/source/io.rst | 426 +++++++++++++++++++++
java/thirdpartydeps/arrowfiles/random_access.arrow | Bin 0 -> 1346 bytes
java/thirdpartydeps/arrowfiles/streaming.arrow | Bin 0 -> 1032 bytes
5 files changed, 429 insertions(+), 1 deletion(-)
diff --git a/java/source/conf.py b/java/source/conf.py
index 767fab7..72da1de 100644
--- a/java/source/conf.py
+++ b/java/source/conf.py
@@ -28,7 +28,8 @@ author = 'The Apache Software Foundation'
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- "javadoctest"
+ "javadoctest",
+ "sphinx.ext.intersphinx"
]
# Add any paths that contain templates here, relative to this directory.
diff --git a/java/source/index.rst b/java/source/index.rst
index c0c97d1..17b87c3 100644
--- a/java/source/index.rst
+++ b/java/source/index.rst
@@ -11,6 +11,7 @@ Welcome to java arrow's documentation!
:caption: Contents:
create
+ io
schema
data
diff --git a/java/source/io.rst b/java/source/io.rst
new file mode 100644
index 0000000..43fa84d
--- /dev/null
+++ b/java/source/io.rst
@@ -0,0 +1,426 @@
+.. _arrow-io:
+
+========================
+Reading and writing data
+========================
+
+The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats
+for serializing Arrow data: the streaming format and the file format (or random access format). Such files can
+be directly memory-mapped when read.
+
+.. contents::
+
+Writing
+=======
+
+Both writing file and streaming formats use the same API.
+
+Writing Random Access Files
+***************************
+
+Write - Out to File
+-------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.types.pojo.Field;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.Schema;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import static java.util.Arrays.asList;
+ import org.apache.arrow.vector.ipc.ArrowFileWriter;
+
+ import java.io.File;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+
+ try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
+ Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
+ Schema schemaPerson = new Schema(asList(name, age));
+ try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
+ nameVector.allocateNew(3);
+ nameVector.set(0, "David".getBytes());
+ nameVector.set(1, "Gladis".getBytes());
+ nameVector.set(2, "Juan".getBytes());
+ IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
+ ageVector.allocateNew(3);
+ ageVector.set(0, 10);
+ ageVector.set(1, 20);
+ ageVector.set(2, 30);
+ vectorSchemaRoot.setRowCount(3);
+ File file = new File("randon_access_to_file.arrow");
+ try (FileOutputStream fileOutputStream = new FileOutputStream(file);
+ ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
+ ) {
+ writer.start();
+ writer.writeBatch();
+ writer.end();
+ System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+.. testoutput::
+
+ Record batches written: 1. Number of rows written: 3
+
+Write - Out to Buffer
+---------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.types.pojo.Field;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.Schema;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import static java.util.Arrays.asList;
+ import org.apache.arrow.vector.ipc.ArrowFileWriter;
+
+ import java.io.ByteArrayOutputStream;
+ import java.io.IOException;
+ import java.nio.channels.Channels;
+
+ try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
+ Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
+ Schema schemaPerson = new Schema(asList(name, age));
+ try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
+ nameVector.allocateNew(3);
+ nameVector.set(0, "David".getBytes());
+ nameVector.set(1, "Gladis".getBytes());
+ nameVector.set(2, "Juan".getBytes());
+ IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
+ ageVector.allocateNew(3);
+ ageVector.set(0, 10);
+ ageVector.set(1, 20);
+ ageVector.set(2, 30);
+ vectorSchemaRoot.setRowCount(3);
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out)))
+ {
+ writer.start();
+ writer.writeBatch();
+ System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+.. testoutput::
+
+ Record batches written: 1. Number of rows written: 3
+
+Writing Streaming Format
+************************
+
+Write - Out to File
+-------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.types.pojo.Field;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.Schema;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import static java.util.Arrays.asList;
+ import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+ import java.io.File;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+
+ try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ // Create and populate data:
+ Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
+ Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
+ Schema schemaPerson = new Schema(asList(name, age));
+ try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
+ nameVector.allocateNew(3);
+ nameVector.set(0, "David".getBytes());
+ nameVector.set(1, "Gladis".getBytes());
+ nameVector.set(2, "Juan".getBytes());
+ IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
+ ageVector.allocateNew(3);
+ ageVector.set(0, 10);
+ ageVector.set(1, 20);
+ ageVector.set(2, 30);
+ vectorSchemaRoot.setRowCount(3);
+ File file = new File("streaming_to_file.arrow");
+ try (FileOutputStream fileOutputStream = new FileOutputStream(file);
+ ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
+ ){
+ writer.start();
+ writer.writeBatch();
+ System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+.. testoutput::
+
+ Number of rows written: 3
+
+Write - Out to Buffer
+---------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+ import org.apache.arrow.vector.types.pojo.Field;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.Schema;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import static java.util.Arrays.asList;
+
+ import java.io.ByteArrayOutputStream;
+ import java.io.IOException;
+ import java.nio.channels.Channels;
+
+ try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ // Create and populate data:
+ Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null);
+ Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null);
+ Schema schemaPerson = new Schema(asList(name, age));
+ try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name");
+ nameVector.allocateNew(3);
+ nameVector.set(0, "David".getBytes());
+ nameVector.set(1, "Gladis".getBytes());
+ nameVector.set(2, "Juan".getBytes());
+ IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
+ ageVector.allocateNew(3);
+ ageVector.set(0, 10);
+ ageVector.set(1, 20);
+ ageVector.set(2, 30);
+ vectorSchemaRoot.setRowCount(3);
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
+ ){
+ writer.start();
+ writer.writeBatch();
+ System.out.println("Number of rows written: " + vectorSchemaRoot.getRowCount());
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+.. testoutput::
+
+ Number of rows written: 3
+
+Reading
+=======
+
+Reading the random access format and streaming format both offer the same API,
+with the difference that random access files also offer access to any record batch by index.
+
+Reading Random Access Files
+***************************
+
+Read - From File
+----------------
+
+We are providing a path with auto generated arrow files for testing purposes, change that at your convenience.
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.ipc.ArrowFileReader;
+ import org.apache.arrow.vector.ipc.message.ArrowBlock;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import java.io.File;
+ import java.io.FileInputStream;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+
+ try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){
+ File file = new File("./thirdpartydeps/arrowfiles/random_access.arrow");
+ try (FileInputStream fileInputStream = new FileInputStream(file);
+ ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
+ ){
+ System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
+ for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+ reader.loadRecordBatch(arrowBlock);
+ VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+.. testoutput::
+
+ Record batches in file: 3
+ name age
+ David 10
+ Gladis 20
+ Juan 30
+ name age
+ Nidia 15
+ Alexa 20
+ Mara 15
+ name age
+ Raul 34
+ Jhon 29
+ Thomy 33
+
+Read - From Buffer
+------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.ipc.ArrowFileReader;
+ import org.apache.arrow.vector.ipc.SeekableReadChannel;
+ import org.apache.arrow.vector.ipc.message.ArrowBlock;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+
+ import java.io.IOException;
+ import java.nio.file.Files;
+ import java.nio.file.Path;
+ import java.nio.file.Paths;
+
+ try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ Path path = Paths.get("./thirdpartydeps/arrowfiles/random_access.arrow");
+ try (ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))), rootAllocator)){
+ System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
+ for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+ reader.loadRecordBatch(arrowBlock);
+ VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+.. testoutput::
+
+ Record batches in file: 3
+ name age
+ David 10
+ Gladis 20
+ Juan 30
+ name age
+ Nidia 15
+ Alexa 20
+ Mara 15
+ name age
+ Raul 34
+ Jhon 29
+ Thomy 33
+
+Reading Streaming Format
+************************
+
+Read - From File
+----------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.ipc.ArrowStreamReader;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import java.io.File;
+ import java.io.FileInputStream;
+ import java.io.IOException;
+
+ try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ File file = new File("./thirdpartydeps/arrowfiles/streaming.arrow");
+ try (FileInputStream fileInputStreamForStream = new FileInputStream(file);
+ ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)) {
+ while (reader.loadNextBatch()) {
+ VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+.. testoutput::
+
+ name age
+ David 10
+ Gladis 20
+ Juan 30
+ name age
+ Nidia 15
+ Alexa 20
+ Mara 15
+ name age
+ Raul 34
+ Jhon 29
+ Thomy 33
+
+Read - From Buffer
+------------------
+
+.. testcode::
+
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.ipc.ArrowStreamReader;
+
+ import java.io.ByteArrayInputStream;
+ import java.io.IOException;
+ import java.nio.file.Files;
+ import java.nio.file.Path;
+ import java.nio.file.Paths;
+
+ try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ Path path = Paths.get("./thirdpartydeps/arrowfiles/streaming.arrow");
+ try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(Files.readAllBytes(path)), rootAllocator)){
+ while(reader.loadNextBatch()){
+ System.out.print(reader.getVectorSchemaRoot().contentToTSVString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+.. testoutput::
+
+ name age
+ David 10
+ Gladis 20
+ Juan 30
+ name age
+ Nidia 15
+ Alexa 20
+ Mara 15
+ name age
+ Raul 34
+ Jhon 29
+ Thomy 33
+
+Reading Parquet File
+********************
+
+Please check :doc:`Dataset <./dataset>`
diff --git a/java/thirdpartydeps/arrowfiles/random_access.arrow b/java/thirdpartydeps/arrowfiles/random_access.arrow
new file mode 100644
index 0000000..9efe50f
Binary files /dev/null and b/java/thirdpartydeps/arrowfiles/random_access.arrow differ
diff --git a/java/thirdpartydeps/arrowfiles/streaming.arrow b/java/thirdpartydeps/arrowfiles/streaming.arrow
new file mode 100644
index 0000000..b409d28
Binary files /dev/null and b/java/thirdpartydeps/arrowfiles/streaming.arrow differ