You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/07/11 16:21:56 UTC

[orc] branch main updated: ORC-831: Do Not Copy String When Flushing Dictionary (#736)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 6002079  ORC-831: Do Not Copy String When Flushing Dictionary (#736)
6002079 is described below

commit 6002079f1d57cb5b8c7fb8375cdf3c6a977a77df
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Sun Jul 11 12:21:49 2021 -0400

    ORC-831: Do Not Copy String When Flushing Dictionary (#736)
    
    ### What changes were proposed in this pull request?
    
    The code to flush first loads (copies) the data into a Text object and then writes it to the direct stream. Instead, pass the directStream to the dictionary and have it write directly to the stream instead of an intermediate Text object.
    
    ### Why are the changes needed?
    
    Performance.
    
    ### How was this patch tested?
    
    No change in behavior so using existing unit tests.
---
 .../src/java/org/apache/orc/impl/Dictionary.java   | 15 +++++++++++
 .../java/org/apache/orc/impl/DictionaryUtils.java  | 31 ++++++++++++++++++++--
 .../apache/orc/impl/StringHashTableDictionary.java |  6 +++++
 .../org/apache/orc/impl/StringRedBlackTree.java    |  7 +++++
 .../orc/impl/writer/StringBaseTreeWriter.java      |  7 ++---
 5 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/Dictionary.java b/java/core/src/java/org/apache/orc/impl/Dictionary.java
index 2b8f45a..3c2849d 100644
--- a/java/core/src/java/org/apache/orc/impl/Dictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/Dictionary.java
@@ -44,9 +44,24 @@ public interface Dictionary {
 
   /**
    * Given the position index, return the original string before being encoded.
+   * The value of the Text in the Dictionary is copied into {@code result}.
+   *
+   * @param result the holder to copy the dictionary text into
+   * @param position the position where the key was added
    */
   void getText(Text result, int position);
 
+  /**
+   * Given the position index, write the original string, before being encoded,
+   * to the OutputStream.
+   *
+   * @param out the output stream to which to write the data
+   * @param position the position where the key was originally added
+   * @return the number of byte written to the stream
+   * @throws IOException if an I/O error occurs
+   */
+  int writeTo(OutputStream out, int position) throws IOException;
+
   int add(byte[] bytes, int offset, int length);
 
   int size();
diff --git a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
index 91a397a..57d60da 100644
--- a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
@@ -17,8 +17,10 @@
  */
 package org.apache.orc.impl;
 
-import org.apache.hadoop.io.Text;
+import java.io.IOException;
+import java.io.OutputStream;
 
+import org.apache.hadoop.io.Text;
 
 public class DictionaryUtils {
   private DictionaryUtils() {
@@ -30,7 +32,7 @@ public class DictionaryUtils {
    * @param result Container for the UTF8 String.
    * @param position position in the keyOffsets
    * @param keyOffsets starting offset of the key (in byte) in the byte array.
-   * @param byteArray storing raw bytes of all key seen in dictionary
+   * @param byteArray storing raw bytes of all keys seen in dictionary
    */
   public static void getTextInternal(Text result, int position, DynamicIntArray keyOffsets, DynamicByteArray byteArray) {
     int offset = keyOffsets.get(position);
@@ -42,4 +44,29 @@ public class DictionaryUtils {
     }
     byteArray.setText(result, offset, length);
   }
+
+  /**
+   * Write a UTF8 string from the byteArray, using the offset in index-array,
+   * into an OutputStream
+   *
+   * @param out the output stream
+   * @param position position in the keyOffsets
+   * @param keyOffsets starting offset of the key (in byte) in the byte array
+   * @param byteArray storing raw bytes of all keys seen in dictionary
+   * @return the number of bytes written to the output stream
+   * @throw IOException if an I/O error occurs
+   */
+  public static int writeToTextInternal(OutputStream out, int position,
+      DynamicIntArray keyOffsets, DynamicByteArray byteArray)
+      throws IOException {
+    int offset = keyOffsets.get(position);
+    int length;
+    if (position + 1 == keyOffsets.size()) {
+      length = byteArray.size() - offset;
+    } else {
+      length = keyOffsets.get(position + 1) - offset;
+    }
+    byteArray.write(out, offset, length);
+    return length;
+  }
 }
diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index b33dfd4..d962947 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -19,6 +19,7 @@
 package org.apache.orc.impl;
 
 import java.io.IOException;
+import java.io.OutputStream;
 import java.util.Arrays;
 
 import org.apache.hadoop.io.Text;
@@ -120,6 +121,11 @@ public class StringHashTableDictionary implements Dictionary {
   }
 
   @Override
+  public int writeTo(OutputStream out, int position) throws IOException {
+    return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets, this.byteArray);
+  }
+
+  @Override
   public int add(byte[] bytes, int offset, int length) {
     newKey.set(bytes, offset, length);
     return add(newKey);
diff --git a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
index e8088cc..387739c 100644
--- a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
+++ b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
@@ -18,6 +18,7 @@
 package org.apache.orc.impl;
 
 import java.io.IOException;
+import java.io.OutputStream;
 
 import org.apache.hadoop.io.Text;
 
@@ -111,6 +112,12 @@ public class StringRedBlackTree extends RedBlackTree implements Dictionary {
     DictionaryUtils.getTextInternal(result, originalPosition, this.keyOffsets, this.byteArray);
   }
 
+  @Override
+  public int writeTo(OutputStream out, int position) throws IOException {
+    return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets,
+        this.byteArray);
+  }
+
   /**
    * Get the size of the character data in the table.
    * @return the bytes used by the table
diff --git a/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
index fc5bb80..d5c74ed 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
@@ -20,7 +20,6 @@ package org.apache.orc.impl.writer;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.hadoop.io.Text;
 import org.apache.orc.OrcConf;
 import org.apache.orc.OrcProto;
 import org.apache.orc.StringColumnStatistics;
@@ -178,7 +177,6 @@ public abstract class StringBaseTreeWriter extends TreeWriterBase {
     int length = rows.size();
     int rowIndexEntry = 0;
     OrcProto.RowIndex.Builder rowIndex = getRowIndex();
-    Text text = new Text();
     // write the values translated into the dump order.
     for (int i = 0; i <= length; ++i) {
       // now that we are writing out the row values, we can finalize the
@@ -202,9 +200,8 @@ public abstract class StringBaseTreeWriter extends TreeWriterBase {
         if (useDictionaryEncoding) {
           rowOutput.write(dumpOrder[rows.get(i)]);
         } else {
-          dictionary.getText(text, rows.get(i));
-          directStreamOutput.write(text.getBytes(), 0, text.getLength());
-          lengthOutput.write(text.getLength());
+          final int writeLen = dictionary.writeTo(directStreamOutput, rows.get(i));
+          lengthOutput.write(writeLen);
         }
       }
     }