You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "vibhatha (via GitHub)" <gi...@apache.org> on 2023/12/13 09:54:00 UTC

Re: [PR] GH-37841: [Java] Dictionary decoding not using the compression factory from the ArrowReader [arrow]

vibhatha commented on code in PR #38371:
URL: https://github.com/apache/arrow/pull/38371#discussion_r1425111899


##########
java/compression/src/test/java/org/apache/arrow/compression/TestArrowReaderWriterWithCompression.java:
##########
@@ -18,72 +18,245 @@
 package org.apache.arrow.compression;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Optional;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.GenerateSampleData;
+import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.compression.CompressionUtil;
 import org.apache.arrow.vector.compression.NoCompressionCodec;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.dictionary.DictionaryProvider;
 import org.apache.arrow.vector.ipc.ArrowFileReader;
 import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.ipc.ArrowStreamReader;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
 import org.apache.arrow.vector.ipc.message.IpcOption;
 import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
+import org.junit.After;
 import org.junit.Assert;
+import org.junit.Before;
 import org.junit.Test;
+import org.junit.jupiter.api.Disabled;
 
 public class TestArrowReaderWriterWithCompression {
 
-  @Test
-  public void testArrowFileZstdRoundTrip() throws Exception {
-    // Prepare sample data
-    final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
+  private BufferAllocator allocator;
+  private ByteArrayOutputStream out;
+  private VectorSchemaRoot root;
+
+  @Before
+  public void setup() {
+    allocator = new RootAllocator(Integer.MAX_VALUE);
+    out = new ByteArrayOutputStream();
+    root = null;
+  }
+
+  @After
+  public void tearDown() {
+    if (root != null) {
+      root.close();
+    }
+    if (allocator != null) {
+      allocator.close();
+    }
+    if (out != null) {
+      out.reset();
+    }
+
+  }
+
+  private void createAndWriteArrowFile(DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
     List<Field> fields = new ArrayList<>();
     fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
-    VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator);
+    root = VectorSchemaRoot.create(new Schema(fields), allocator);
+
     final int rowCount = 10;
     GenerateSampleData.generateTestData(root.getVector(0), rowCount);
     root.setRowCount(rowCount);
 
-    // Write an in-memory compressed arrow file
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    try (final ArrowFileWriter writer =
-           new ArrowFileWriter(root, null, Channels.newChannel(out), new HashMap<>(),
-             IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, CompressionUtil.CodecType.ZSTD, Optional.of(7))) {
+    try (final ArrowFileWriter writer = new ArrowFileWriter(root, provider, Channels.newChannel(out),
+        new HashMap<>(), IpcOption.DEFAULT, CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) {
+      writer.start();
+      writer.writeBatch();
+      writer.end();
+    }
+  }
+
+  private Dictionary createDictionary(VarCharVector dictionaryVector) {
+    setVector(dictionaryVector,
+        "foo".getBytes(StandardCharsets.UTF_8),
+        "bar".getBytes(StandardCharsets.UTF_8),
+        "baz".getBytes(StandardCharsets.UTF_8));
+
+    return new Dictionary(dictionaryVector,
+        new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/null));
+  }
+
+  private VarCharVector createVarCharVector(String name, BufferAllocator allocator) {
+    VarCharVector vector = (VarCharVector) FieldType
+        .nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null);
+    vector.allocateNewSafe();
+    vector.set(0, "foo".getBytes(StandardCharsets.UTF_8));
+    vector.setValueCount(6);
+    return vector;
+  }
+
+  private List<Field> createFields(Dictionary dictionary, BufferAllocator allocator) {
+    VarCharVector vector = createVarCharVector("D1", allocator);
+    FieldVector encodedVector = (FieldVector) DictionaryEncoder.encode(vector, dictionary);
+    vector.close();
+
+    List<Field> fields = new ArrayList<>();
+    fields.add(new Field("col", FieldType.notNullable(new ArrowType.Utf8()), new ArrayList<>()));
+    fields.add(encodedVector.getField());
+
+    return fields;
+  }
+
+  private File writeArrowStream(VectorSchemaRoot root, DictionaryProvider provider,
+      CompressionUtil.CodecType codecType) throws IOException {
+    File tempFile = File.createTempFile("dictionary_compression", ".arrow");
+    try (FileOutputStream fileOut = new FileOutputStream(tempFile);
+        ArrowStreamWriter writer = new ArrowStreamWriter(root, provider,
+            Channels.newChannel(fileOut), IpcOption.DEFAULT,
+            CommonsCompressionFactory.INSTANCE, codecType, Optional.of(7))) {
       writer.start();
       writer.writeBatch();
       writer.end();
     }
+    return tempFile;
+  }
 
-    // Read the in-memory compressed arrow file with CommonsCompressionFactory provided
+  @Test
+  @Disabled

Review Comment:
  Updated and tested.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org