You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/05/25 12:19:28 UTC
[arrow-cookbook] branch main updated: [Java] Adding examples about Dictionary-encoded Layout (#215)
This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new f8e0a56 [Java] Adding examples about Dictionary-encoded Layout (#215)
f8e0a56 is described below
commit f8e0a56bf9fdfdfbabad179e670bd01aa2142c39
Author: david dali susanibar arce <da...@gmail.com>
AuthorDate: Wed May 25 07:19:24 2022 -0500
[Java] Adding examples about Dictionary-encoded Layout (#215)
* Adding examples about Dictionary-encoded Layout
* Apply suggestions from code review
Co-authored-by: David Li <li...@gmail.com>
* Solving issues: variable name, null indexType
* Solving issues: variable name
* Adding dictionary id based on DictionaryEncoding and FieldType
Co-authored-by: David Li <li...@gmail.com>
---
java/source/create.rst | 63 +++++++++++++++++++++++++-
java/source/io.rst | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 181 insertions(+), 1 deletion(-)
diff --git a/java/source/create.rst b/java/source/create.rst
index f7680c9..33619fa 100644
--- a/java/source/create.rst
+++ b/java/source/create.rst
@@ -70,6 +70,66 @@ Array of Varchar
[one, two, three]
+Dictionary-Encoded Array of Varchar
+-----------------------------------
+
+In some scenarios, `dictionary-encoding`_ a column is a useful way to save memory.
+
+.. testcode::
+
+ import org.apache.arrow.memory.BufferAllocator;
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.FieldVector;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.dictionary.Dictionary;
+ import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+
+ import java.nio.charset.StandardCharsets;
+
+ // Build a dictionary of country names, then encode a column of country names
+ // as indices into that dictionary. All vectors are released by try-with-resources.
+ try (BufferAllocator allocator = new RootAllocator();
+      VarCharVector dictionaryValues = new VarCharVector("country-dict", allocator);
+      VarCharVector plainColumn = new VarCharVector("app-use-country-dict", allocator)
+ ) {
+     // Dictionary entries; an entry's position is the index that represents it once encoded.
+     String[] countryNames = {
+         "Andorra", "Cuba", "Grecia", "Guinea", "Islandia",
+         "Malta", "Tailandia", "Uganda", "Yemen", "Zambia"
+     };
+     dictionaryValues.allocateNew(countryNames.length);
+     for (int i = 0; i < countryNames.length; i++) {
+         dictionaryValues.set(i, countryNames[i].getBytes(StandardCharsets.UTF_8));
+     }
+     dictionaryValues.setValueCount(countryNames.length);
+
+     // Dictionary id 1, unordered, with a signed 8-bit integer index type.
+     ArrowType.Int indexType = new ArrowType.Int(8, true);
+     DictionaryEncoding encoding = new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, indexType);
+     Dictionary countriesDictionary = new Dictionary(dictionaryValues, encoding);
+     System.out.println("Dictionary: " + countriesDictionary);
+
+     // The column to encode; every value is present in the dictionary above.
+     String[] userCountries = {"Andorra", "Guinea", "Islandia", "Malta", "Uganda"};
+     plainColumn.allocateNew(userCountries.length);
+     for (int i = 0; i < userCountries.length; i++) {
+         plainColumn.set(i, userCountries[i].getBytes(StandardCharsets.UTF_8));
+     }
+     plainColumn.setValueCount(userCountries.length);
+     System.out.println("Unencoded data: " + plainColumn);
+
+     // encode() returns a new vector of indices, which must be closed separately.
+     try (FieldVector encodedColumn =
+              (FieldVector) DictionaryEncoder.encode(plainColumn, countriesDictionary)) {
+         System.out.println("Dictionary-encoded data: " + encodedColumn);
+     }
+ }
+
+.. testoutput::
+
+ Dictionary: Dictionary DictionaryEncoding[id=1,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+ Dictionary-encoded data: [0, 3, 4, 5, 7]
+
Array of List
-------------
@@ -109,4 +169,5 @@ Array of List
[[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]
.. _`FieldVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html
-.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
\ No newline at end of file
+.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
+.. _`dictionary-encoding`: https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout
\ No newline at end of file
diff --git a/java/source/io.rst b/java/source/io.rst
index ffa06a3..8157c7c 100644
--- a/java/source/io.rst
+++ b/java/source/io.rst
@@ -443,3 +443,122 @@ Reading Parquet File
********************
Please check :doc:`Dataset <./dataset>`
+
+Handling Data with Dictionaries
+*******************************
+
+Reading and writing dictionary-encoded data requires separately tracking the dictionaries.
+
+.. testcode::
+
+ import org.apache.arrow.memory.BufferAllocator;
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.FieldVector;
+ import org.apache.arrow.vector.ValueVector;
+ import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.vector.VectorSchemaRoot;
+ import org.apache.arrow.vector.dictionary.Dictionary;
+ import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+ import org.apache.arrow.vector.dictionary.DictionaryProvider;
+ import org.apache.arrow.vector.ipc.ArrowFileReader;
+ import org.apache.arrow.vector.ipc.ArrowFileWriter;
+ import org.apache.arrow.vector.ipc.message.ArrowBlock;
+ import org.apache.arrow.vector.types.Types;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+
+ import java.io.File;
+ import java.io.FileInputStream;
+ import java.io.FileNotFoundException;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+ import java.nio.charset.StandardCharsets;
+
+ // Encoding shared by the dictionary and by the field of the column that refers to it.
+ // Its id (666) is what the reading code below uses to look the dictionary up again.
+ final DictionaryEncoding dictionaryEncoding = new DictionaryEncoding(
+     /*id=*/666L, /*ordered=*/false, /*indexType=*/new ArrowType.Int(8, true));
+ try (BufferAllocator root = new RootAllocator();
+      VarCharVector countries = new VarCharVector("country-dict", root);
+      VarCharVector appUserCountriesUnencoded = new VarCharVector(
+          "app-use-country-dict",
+          new FieldType(true, Types.MinorType.VARCHAR.getType(), dictionaryEncoding),
+          root)
+ ) {
+     // Dictionary entries; an entry's position is the index that represents it once encoded.
+     String[] countryNames = {
+         "Andorra", "Cuba", "Grecia", "Guinea", "Islandia",
+         "Malta", "Tailandia", "Uganda", "Yemen", "Zambia"
+     };
+     countries.allocateNew(countryNames.length);
+     for (int i = 0; i < countryNames.length; i++) {
+         countries.set(i, countryNames[i].getBytes(StandardCharsets.UTF_8));
+     }
+     countries.setValueCount(countryNames.length);
+
+     Dictionary countriesDictionary = new Dictionary(countries, dictionaryEncoding);
+     System.out.println("Dictionary: " + countriesDictionary);
+
+     // The column to encode; every value is present in the dictionary above.
+     String[] appUserCountries = {"Andorra", "Guinea", "Islandia", "Malta", "Uganda"};
+     appUserCountriesUnencoded.allocateNew(appUserCountries.length);
+     for (int i = 0; i < appUserCountries.length; i++) {
+         appUserCountriesUnencoded.set(i, appUserCountries[i].getBytes(StandardCharsets.UTF_8));
+     }
+     appUserCountriesUnencoded.setValueCount(appUserCountries.length);
+     System.out.println("Unencoded data: " + appUserCountriesUnencoded);
+
+     File file = new File("random_access_file_with_dictionary.arrow");
+     // The dictionary is registered in a provider, which is handed to the writer below.
+     DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
+     provider.put(countriesDictionary);
+     try (FieldVector appUserCountriesDictionaryEncoded = (FieldVector) DictionaryEncoder
+              .encode(appUserCountriesUnencoded, countriesDictionary);
+          VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.of(appUserCountriesDictionaryEncoded);
+          FileOutputStream fileOutputStream = new FileOutputStream(file);
+          ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, provider, fileOutputStream.getChannel())
+     ) {
+         System.out.println("Dictionary-encoded data: " + appUserCountriesDictionaryEncoded);
+         System.out.println("Dictionary-encoded ID: " + appUserCountriesDictionaryEncoded.getField().getDictionary().getId());
+         writer.start();
+         writer.writeBatch();
+         writer.end();
+         System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
+
+         // Read the file back (after writer.end()) using a separate allocator and channel.
+         try (BufferAllocator rootAllocator = new RootAllocator();
+              FileInputStream fileInputStream = new FileInputStream(file);
+              ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
+         ) {
+             for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+                 reader.loadRecordBatch(arrowBlock);
+                 FieldVector appUserCountriesDictionaryEncodedRead = reader.getVectorSchemaRoot().getVector("app-use-country-dict");
+                 // The column's field carries the dictionary id; use it to fetch the dictionary from the reader.
+                 DictionaryEncoding dictionaryEncodingRead = appUserCountriesDictionaryEncodedRead.getField().getDictionary();
+                 System.out.println("Dictionary-encoded ID recovered: " + dictionaryEncodingRead.getId());
+                 Dictionary appUserCountriesDictionaryRead = reader.getDictionaryVectors().get(dictionaryEncodingRead.getId());
+                 System.out.println("Dictionary-encoded data recovered: " + appUserCountriesDictionaryEncodedRead);
+                 System.out.println("Dictionary recovered: " + appUserCountriesDictionaryRead);
+                 // decode() returns a new vector holding the original values; close it when done.
+                 try (ValueVector readVector = DictionaryEncoder.decode(appUserCountriesDictionaryEncodedRead, appUserCountriesDictionaryRead)) {
+                     System.out.println("Decoded data: " + readVector);
+                 }
+             }
+         }
+     } catch (IOException e) {
+         // FileNotFoundException is a subclass of IOException, so a single handler covers both.
+         e.printStackTrace();
+     }
+ }
+
+.. testoutput::
+
+ Dictionary: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+ Dictionary-encoded data: [0, 3, 4, 5, 7]
+ Dictionary-encoded ID: 666
+ Record batches written: 1. Number of rows written: 5
+ Dictionary-encoded ID recovered: 666
+ Dictionary-encoded data recovered: [0, 3, 4, 5, 7]
+ Dictionary recovered: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+ Decoded data: [Andorra, Guinea, Islandia, Malta, Uganda]