You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/05/25 12:19:28 UTC

[arrow-cookbook] branch main updated: [Java] Adding examples about Dictionary-encoded Layout (#215)

This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git


The following commit(s) were added to refs/heads/main by this push:
     new f8e0a56  [Java] Adding examples about Dictionary-encoded Layout (#215)
f8e0a56 is described below

commit f8e0a56bf9fdfdfbabad179e670bd01aa2142c39
Author: david dali susanibar arce <da...@gmail.com>
AuthorDate: Wed May 25 07:19:24 2022 -0500

    [Java] Adding examples about Dictionary-encoded Layout (#215)
    
    * Adding examples about Dictionary-encoded Layout
    
    * Apply suggestions from code review
    
    Co-authored-by: David Li <li...@gmail.com>
    
    * Solving issues: variable name, null indexType
    
    * Solving issues: variable name
    
    * Adding dictionary id based on DictionaryEncoding and FieldType
    
    Co-authored-by: David Li <li...@gmail.com>
---
 java/source/create.rst |  63 +++++++++++++++++++++++++-
 java/source/io.rst     | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/java/source/create.rst b/java/source/create.rst
index f7680c9..33619fa 100644
--- a/java/source/create.rst
+++ b/java/source/create.rst
@@ -70,6 +70,66 @@ Array of Varchar
 
     [one, two, three]
 
+Dictionary-Encoded Array of Varchar
+-----------------------------------
+
+In some scenarios, `dictionary-encoding`_ a column is a useful way to save memory.
+
+.. testcode::
+
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.FieldVector;
+    import org.apache.arrow.vector.VarCharVector;
+    import org.apache.arrow.vector.dictionary.Dictionary;
+    import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+    import org.apache.arrow.vector.types.pojo.ArrowType;
+    import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+
+    import java.nio.charset.StandardCharsets;
+    // Example: build a dictionary of country names, then dictionary-encode a second vector against it.
+    try (BufferAllocator root = new RootAllocator(); // try-with-resources closes the allocator and both vectors on exit
+         VarCharVector countries = new VarCharVector("country-dict", root); // values that back the dictionary
+         VarCharVector appUserCountriesUnencoded = new VarCharVector("app-use-country-dict", root) // plain data to encode
+    ) {
+        countries.allocateNew(10);
+        countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+        countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
+        countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
+        countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
+        countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
+        countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
+        countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
+        countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
+        countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
+        countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
+        countries.setValueCount(10);
+        // Wrap the values with their encoding metadata; the id and index type appear in the printed output.
+        Dictionary countriesDictionary = new Dictionary(countries,
+                new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/new ArrowType.Int(8, true)));
+        System.out.println("Dictionary: " + countriesDictionary);
+        // Fill the unencoded vector; every value used here is also present in the dictionary above.
+        appUserCountriesUnencoded.allocateNew(5);
+        appUserCountriesUnencoded.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(1, "Guinea".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(2, "Islandia".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(3, "Malta".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(4, "Uganda".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.setValueCount(5);
+        System.out.println("Unencoded data: " + appUserCountriesUnencoded);
+        // Encode against the dictionary; the expected output shows each value replaced by its dictionary index ([0, 3, 4, 5, 7]).
+        try (FieldVector appUserCountriesDictionaryEncoded = (FieldVector) DictionaryEncoder
+                .encode(appUserCountriesUnencoded, countriesDictionary)) { // encoded vector is closed after printing
+            System.out.println("Dictionary-encoded data: " + appUserCountriesDictionaryEncoded);
+        }
+    }
+
+.. testoutput::
+
+    Dictionary: Dictionary DictionaryEncoding[id=1,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+    Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+    Dictionary-encoded data: [0, 3, 4, 5, 7]
+
 Array of List
 -------------
 
@@ -109,4 +169,5 @@ Array of List
     [[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]
 
 .. _`FieldVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html
-.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
\ No newline at end of file
+.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
+.. _`dictionary-encoding`: https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout
\ No newline at end of file
diff --git a/java/source/io.rst b/java/source/io.rst
index ffa06a3..8157c7c 100644
--- a/java/source/io.rst
+++ b/java/source/io.rst
@@ -443,3 +443,122 @@ Reading Parquet File
 ********************
 
 Please check :doc:`Dataset <./dataset>`
+
+Handling Data with Dictionaries
+*******************************
+
+Reading and writing dictionary-encoded data requires separately tracking the dictionaries.
+
+.. testcode::
+
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.FieldVector;
+    import org.apache.arrow.vector.ValueVector;
+    import org.apache.arrow.vector.VarCharVector;
+    import org.apache.arrow.vector.VectorSchemaRoot;
+    import org.apache.arrow.vector.dictionary.Dictionary;
+    import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+    import org.apache.arrow.vector.dictionary.DictionaryProvider;
+    import org.apache.arrow.vector.ipc.ArrowFileReader;
+    import org.apache.arrow.vector.ipc.ArrowFileWriter;
+    import org.apache.arrow.vector.ipc.message.ArrowBlock;
+    import org.apache.arrow.vector.types.Types;
+    import org.apache.arrow.vector.types.pojo.ArrowType;
+    import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+    import org.apache.arrow.vector.types.pojo.FieldType;
+
+    import java.io.File;
+    import java.io.FileInputStream;
+    import java.io.FileNotFoundException;
+    import java.io.FileOutputStream;
+    import java.io.IOException;
+    import java.nio.charset.StandardCharsets;
+    // Example: write dictionary-encoded data plus its dictionary to an Arrow file, then read it back and decode it.
+    final DictionaryEncoding dictionaryEncoding = new DictionaryEncoding( // shared by the FieldType and the Dictionary below
+            /*id=*/666L, /*ordered=*/false, /*indexType=*/
+            new ArrowType.Int(8, true)
+    );
+    try (BufferAllocator root = new RootAllocator(); // try-with-resources closes the allocator and both vectors on exit
+         VarCharVector countries = new VarCharVector("country-dict", root); // values that back the dictionary
+         VarCharVector appUserCountriesUnencoded = new VarCharVector(
+                 "app-use-country-dict",
+                 new FieldType(true, Types.MinorType.VARCHAR.getType(), dictionaryEncoding), // field carries the encoding declared above
+                 root)
+    ) {
+        countries.allocateNew(10);
+        countries.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+        countries.set(1, "Cuba".getBytes(StandardCharsets.UTF_8));
+        countries.set(2, "Grecia".getBytes(StandardCharsets.UTF_8));
+        countries.set(3, "Guinea".getBytes(StandardCharsets.UTF_8));
+        countries.set(4, "Islandia".getBytes(StandardCharsets.UTF_8));
+        countries.set(5, "Malta".getBytes(StandardCharsets.UTF_8));
+        countries.set(6, "Tailandia".getBytes(StandardCharsets.UTF_8));
+        countries.set(7, "Uganda".getBytes(StandardCharsets.UTF_8));
+        countries.set(8, "Yemen".getBytes(StandardCharsets.UTF_8));
+        countries.set(9, "Zambia".getBytes(StandardCharsets.UTF_8));
+        countries.setValueCount(10);
+        // Build the dictionary from the value vector and the same encoding instance declared above.
+        Dictionary countriesDictionary = new Dictionary(countries, dictionaryEncoding);
+        System.out.println("Dictionary: " + countriesDictionary);
+        // Fill the unencoded vector; every value used here is also present in the dictionary above.
+        appUserCountriesUnencoded.allocateNew(5);
+        appUserCountriesUnencoded.set(0, "Andorra".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(1, "Guinea".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(2, "Islandia".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(3, "Malta".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.set(4, "Uganda".getBytes(StandardCharsets.UTF_8));
+        appUserCountriesUnencoded.setValueCount(5);
+        System.out.println("Unencoded data: " + appUserCountriesUnencoded);
+        // Register the dictionary with a provider; the provider is handed to the file writer below.
+        File file = new File("random_access_file_with_dictionary.arrow");
+        DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
+        provider.put(countriesDictionary);
+        try (FieldVector appUseCountryDictionaryEncoded = (FieldVector) DictionaryEncoder
+                .encode(appUserCountriesUnencoded, countriesDictionary);
+             VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.of(appUseCountryDictionaryEncoded); // root holds only the encoded vector
+             FileOutputStream fileOutputStream = new FileOutputStream(file);
+             ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, provider, fileOutputStream.getChannel())
+        ) {
+            System.out.println("Dictionary-encoded data: " +appUseCountryDictionaryEncoded);
+            System.out.println("Dictionary-encoded ID: " +appUseCountryDictionaryEncoded.getField().getDictionary().getId());
+            writer.start();
+            writer.writeBatch(); // called once; the expected output reports one record batch written
+            writer.end();
+            System.out.println("Record batches written: " + writer.getRecordBlocks().size() + ". Number of rows written: " + vectorSchemaRoot.getRowCount());
+            try( // read the file back using a separate allocator, while the writer resources are still open
+                BufferAllocator rootAllocator = new RootAllocator();
+                FileInputStream fileInputStream = new FileInputStream(file);
+                ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
+            ){
+                for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { // iterate over the blocks the reader reports
+                    reader.loadRecordBatch(arrowBlock);
+                    FieldVector appUseCountryDictionaryEncodedRead = reader.getVectorSchemaRoot().getVector("app-use-country-dict");
+                    DictionaryEncoding dictionaryEncodingRead = appUseCountryDictionaryEncodedRead.getField().getDictionary(); // encoding attached to the field that was read
+                    System.out.println("Dictionary-encoded ID recovered: " + dictionaryEncodingRead.getId());
+                    Dictionary appUseCountryDictionaryRead = reader.getDictionaryVectors().get(dictionaryEncodingRead.getId()); // look up the dictionary by the recovered id
+                    System.out.println("Dictionary-encoded data recovered: " + appUseCountryDictionaryEncodedRead);
+                    System.out.println("Dictionary recovered: " + appUseCountryDictionaryRead);
+                    try (ValueVector readVector = DictionaryEncoder.decode(appUseCountryDictionaryEncodedRead, appUseCountryDictionaryRead)) { // decoded vector is closed after printing
+                        System.out.println("Decoded data: " + readVector);
+                    }
+                }
+            }
+        } catch (FileNotFoundException e) { // NOTE(review): the failure is only printed, not rethrown
+            e.printStackTrace();
+        } catch (IOException e) { // NOTE(review): same handling as above; the two catches could be merged
+            e.printStackTrace();
+        }
+    }
+
+.. testoutput::
+
+    Dictionary: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+    Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
+    Dictionary-encoded data: [0, 3, 4, 5, 7]
+    Dictionary-encoded ID: 666
+    Record batches written: 1. Number of rows written: 5
+    Dictionary-encoded ID recovered: 666
+    Dictionary-encoded data recovered: [0, 3, 4, 5, 7]
+    Dictionary recovered: Dictionary DictionaryEncoding[id=666,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
+    Decoded data: [Andorra, Guinea, Islandia, Malta, Uganda]