You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/11/04 15:53:15 UTC

[orc] branch main updated: ORC-1041: Use `memcpy` during LZO decompression (#958)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 502661a  ORC-1041: Use `memcpy` during LZO decompression (#958)
502661a is described below

commit 502661aee2b1895ba4ff652297bf3a5f4ea8955a
Author: Yiqun Zhang <gu...@gmail.com>
AuthorDate: Thu Nov 4 23:50:22 2021 +0800

    ORC-1041: Use `memcpy` during LZO decompression (#958)
    
    ### What changes were proposed in this pull request?
    
    This pr is aimed to fix the implementation of copy data blocks during LZO decompression.
    
    `*reinterpret_cast< int64_t*>(output) = *reinterpret_cast< int64_t*>(matchAddress);`  can lead to unexpected behavior, and in failed test cases it does not appear to be an atomic operation.
    
    This pr uses memcpy instead of the above statement.
    
    Here are the performance benchmarks, where `memcpy` is basically the same as `reinterpret_cast` + `assignment`.  The newer compilers outperform unfolded assignment, so here is a screenshot of the results with only some of the parameters, which you can reproduce with the following test code at https://quick-bench.com/
    
    ![WX20211104-104627](https://user-images.githubusercontent.com/4069905/140250827-6282739b-c060-43fa-b348-87ede15129fc.png)
    ![WX20211104-105010](https://user-images.githubusercontent.com/4069905/140250854-cf6da388-18d8-42f0-8cd6-18468633acc3.png)
    ![WX20211104-105348](https://user-images.githubusercontent.com/4069905/140250863-6c99cfcb-0b72-4ee0-a6b0-ac31344ac771.png)
    
    ```c++
    #include <string.h>
    
    static void use_memcpy(benchmark::State& state) {
      auto size = state.range(0);
      char buf[size];
      for (int i = 0; i < 8; ++i) {
        buf[i] = 'a';
      }
      for (auto _ : state) {
        char *output = buf + 8;
        char *matchAddress = buf;
        char *matchOutputLimit = buf + size;
        while (output < matchOutputLimit) {
          memcpy(output, matchAddress, 8);
          matchAddress += 8;
          output += 8;
        }
      }
    }
    
    static void use_expanded_assignment(benchmark::State& state) {
      auto size = state.range(0);
      char buf[size];
      for (int i = 0; i < 8; ++i) {
        buf[i] = 'a';
      }
      for (auto _ : state) {
        char *output = buf + 8;
        char *matchAddress = buf;
        char *matchOutputLimit = buf + size;
        while (output < matchOutputLimit) {
          output[0] = *matchOutputLimit;
          output[1] = *(matchOutputLimit + 1);
          output[2] = *(matchOutputLimit + 2);
          output[3] = *(matchOutputLimit + 3);
          output[4] = *(matchOutputLimit + 4);
          output[5] = *(matchOutputLimit + 5);
          output[6] = *(matchOutputLimit + 6);
          output[7] = *(matchOutputLimit + 7);
          matchAddress += 8;
          output += 8;
        }
      }
    }
    
    static void use_reinterpret_assignment(benchmark::State& state) {
      auto size = state.range(0);
      char buf[size];
      for (int i = 0; i < 8; ++i) {
        buf[i] = 'a';
      }
      for (auto _ : state) {
        char *output = buf + 8;
        char *matchAddress = buf;
        char *matchOutputLimit = buf + size;
        while (output < matchOutputLimit) {
          *reinterpret_cast<int64_t*>(output) =
                    *reinterpret_cast<int64_t*>(matchAddress);
          matchAddress += 8;
          output += 8;
        }
      }
    }
    
    BENCHMARK(use_memcpy)->Arg(100000);
    
    BENCHMARK(use_expanded_assignment)->Arg(100000);
    
    BENCHMARK(use_reinterpret_assignment)->Arg(100000);
    ```
    
    ### Why are the changes needed?
    
    Fix the bug of LZO decompression.
    
    ### How was this patch tested?
    
    Pass the CIs.
---
 c++/src/LzoDecompressor.cc | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/c++/src/LzoDecompressor.cc b/c++/src/LzoDecompressor.cc
index d1ba183..21bf194 100644
--- a/c++/src/LzoDecompressor.cc
+++ b/c++/src/LzoDecompressor.cc
@@ -312,13 +312,11 @@ namespace orc {
               output += SIZE_OF_INT;
               matchAddress += increment32;
 
-              *reinterpret_cast<int32_t*>(output) =
-                *reinterpret_cast<int32_t*>(matchAddress);
+              memcpy(output, matchAddress, SIZE_OF_INT);
               output += SIZE_OF_INT;
               matchAddress -= decrement64;
             } else {
-              *reinterpret_cast<int64_t*>(output) =
-                *reinterpret_cast<int64_t*>(matchAddress);
+              memcpy(output, matchAddress, SIZE_OF_LONG);
               matchAddress += SIZE_OF_LONG;
               output += SIZE_OF_LONG;
             }
@@ -329,8 +327,7 @@ namespace orc {
               }
 
               while (output < fastOutputLimit) {
-                *reinterpret_cast<int64_t*>(output) =
-                  *reinterpret_cast<int64_t*>(matchAddress);
+                memcpy(output, matchAddress, SIZE_OF_LONG);
                 matchAddress += SIZE_OF_LONG;
                 output += SIZE_OF_LONG;
               }
@@ -340,8 +337,7 @@ namespace orc {
               }
             } else {
               while (output < matchOutputLimit) {
-                *reinterpret_cast<int64_t*>(output) =
-                  *reinterpret_cast<int64_t*>(matchAddress);
+                memcpy(output, matchAddress, SIZE_OF_LONG);
                 matchAddress += SIZE_OF_LONG;
                 output += SIZE_OF_LONG;
               }
@@ -366,8 +362,7 @@ namespace orc {
           // fast copy. We may over-copy but there's enough room in input
           // and output to not overrun them
           do {
-            *reinterpret_cast<int64_t*>(output) =
-              *reinterpret_cast<const int64_t*>(input);
+            memcpy(output, input, SIZE_OF_LONG);
             input += SIZE_OF_LONG;
             output += SIZE_OF_LONG;
           } while (output < literalOutputLimit);