You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by GitBox <gi...@apache.org> on 2021/05/06 14:11:00 UTC

[GitHub] [orc] wgtmac commented on a change in pull request #695: ORC-614: [C++] Implement efficient seek() in decompression streams

wgtmac commented on a change in pull request #695:
URL: https://github.com/apache/orc/pull/695#discussion_r627456463



##########
File path: c++/src/Compression.cc
##########
@@ -503,16 +532,47 @@ DIAGNOSTIC_PUSH
     return true;
   }
 
+  /** There are three possible scenarios when seeking a position:
+   * 1. The seeked position is already read and decompressed into
+   *    the output stream.
+   * 2. It is already read from the input stream, but has not been
+   *    decompressed yet, ie. it's not in the output stream.
+   * 3. It is not read yet from the inputstream.
+   */
   void DecompressionStream::seek(PositionProvider& position) {
-    // clear state to force seek to read from the right position
+    size_t seekedPosition = position.current();
+    // Case 3.: the seeked position is the one that is currently buffered and
+    // decompressed. Here we only need to set the output buffer's pointer to the
+    // seeked position. Note that after the headerPosition comes the 3 bytes of
+    // the header.
+    if (headerPosition == seekedPosition
+        && inputBufferStartPosition <= headerPosition + 3 && inputBufferStart) {
+      position.next(); // Skip the input level position.
+      size_t posInChunk = position.next(); // Chunk level position.
+      outputBufferLength = uncompressedBufferLength - posInChunk;
+      outputBuffer = outputBufferStart + posInChunk;
+      return;
+    }
+    // Clear state to prepare reading from a new chunk header.
     state = DECOMPRESS_HEADER;
     outputBuffer = nullptr;
     outputBufferLength = 0;
     remainingLength = 0;
-    inputBuffer = nullptr;
-    inputBufferEnd = nullptr;
-
-    input->seek(position);
+    if (seekedPosition < static_cast<uint64_t>(input->ByteCount()) &&
+        seekedPosition >= inputBufferStartPosition) {
+      // Case 2.: The input is buffered, but not yet decompressed. No need to
+      // force re-reading the inputBuffer, we just have to move it to the
+      // seeked position.
+      position.next(); // Skip the input level position.
+      inputBuffer
+        = inputBufferStart + (seekedPosition - inputBufferStartPosition);
+    } else {
+      // Case 1.: The seeked position is not in the input buffer, here we are

Review comment:
       done

##########
File path: c++/src/Compression.cc
##########
@@ -503,16 +532,47 @@ DIAGNOSTIC_PUSH
     return true;
   }
 
+  /** There are three possible scenarios when seeking a position:
+   * 1. The seeked position is already read and decompressed into
+   *    the output stream.
+   * 2. It is already read from the input stream, but has not been
+   *    decompressed yet, ie. it's not in the output stream.
+   * 3. It is not read yet from the inputstream.
+   */
   void DecompressionStream::seek(PositionProvider& position) {
-    // clear state to force seek to read from the right position
+    size_t seekedPosition = position.current();
+    // Case 3.: the seeked position is the one that is currently buffered and

Review comment:
       done

##########
File path: c++/src/Compression.cc
##########
@@ -353,19 +353,29 @@ DIAGNOSTIC_PUSH
     // the current state
     DecompressState state;
 
-    // the start of the current buffer
-    // This pointer is not owned by us. It is either owned by zstream or
-    // the underlying stream.
+    // The starting and current position of the buffer for the uncompressed
+    // data. It either points to the data buffer or the underlying input stream.
+    const char* outputBufferStart;
     const char* outputBuffer;
-    // the size of the current buffer
+    // The original (ie. the overall) and the actual length of the uncompressed
+    // data.
+    size_t uncompressedBufferLength;
     size_t outputBufferLength;
-    // the size of the current chunk
+
+    // The remaining size of the current chunk that is not yet consumed
+    // ie. decompressed or returned in output if state==DECOMPRESS_ORIGINAL
     size_t remainingLength;
 
     // the last buffer returned from the input
+    const char* inputBufferStart;

Review comment:
       done




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org