You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/10/19 18:24:49 UTC

[orc] branch branch-1.8 updated: ORC-1288: [C++] fix the bug that invalid memory freeing with zlib compression

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.8
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.8 by this push:
     new f0bfb8878 ORC-1288: [C++] fix the bug that invalid memory freeing with zlib compression
f0bfb8878 is described below

commit f0bfb887820aaaec4d5f9ae9f54c0fbb397f0cbb
Author: coderex2522 <re...@gmail.com>
AuthorDate: Wed Oct 19 11:24:33 2022 -0700

    ORC-1288: [C++] fix the bug that invalid memory freeing with zlib compression
    
    ### What changes were proposed in this pull request?
    This bug can be triggered when writing a orc file that contains multiple stripes, and each stripe contains columns with no null values. The bug is caused by the fact that the member variables in class CompressionStream is not reset after the suppress function is called.
    
    ### Why are the changes needed?
    The patch can fix the bug about invalid memory freeing with multiple compression kinds.
    
    ### How was this patch tested?
    Added new UT WriterTest.suppressPresentStreamWithCompressionKinds.
    
    Closes #1282 from coderex2522/ORC-1288.
    
    Authored-by: coderex2522 <re...@gmail.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit ae673e888bf3861c5db8d0738469f24acc0f9cba)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 c++/src/Compression.cc | 13 +++++++++++++
 c++/test/TestWriter.cc | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc
index ea1017150..83d2507f6 100644
--- a/c++/src/Compression.cc
+++ b/c++/src/Compression.cc
@@ -60,6 +60,7 @@ namespace orc {
 
     virtual std::string getName() const override = 0;
     virtual uint64_t flush() override;
+    virtual void suppress() override;
 
     virtual bool isCompressed() const override { return true; }
     virtual uint64_t getSize() const override;
@@ -129,6 +130,12 @@ namespace orc {
     return BufferedOutputStream::flush();
   }
 
+  void CompressionStreamBase::suppress() {
+    outputBuffer = nullptr;
+    bufferSize = outputPosition = outputSize = 0;
+    BufferedOutputStream::suppress();
+  }
+
   uint64_t CompressionStreamBase::getSize() const {
     return BufferedOutputStream::getSize() -
            static_cast<uint64_t>(outputSize - outputPosition);
@@ -927,6 +934,7 @@ DIAGNOSTIC_POP
     }
 
     virtual bool Next(void** data, int*size) override;
+    virtual void suppress() override;
     virtual std::string getName() const override = 0;
 
   protected:
@@ -995,6 +1003,11 @@ DIAGNOSTIC_POP
     return true;
   }
 
+  void BlockCompressionStream::suppress() {
+    compressorBuffer.resize(0);
+    CompressionStreamBase::suppress();
+  }
+
   /**
    * LZ4 block compression
    */
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index b0158d9ea..ee8270805 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -1996,5 +1996,53 @@ namespace orc {
     }
   }
 
+  // Before the fix of ORC-1288, this case will trigger the bug about
+  // invalid memory freeing with zlib compression when writing a orc file
+  // that contains multiple stripes, and each stripe contains multiple columns
+  // with no null values.
+  void testSuppressPresentStream(orc::CompressionKind kind) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool* pool = getDefaultPool();
+    uint64_t rowCount = 5000000;
+    auto type = std::unique_ptr<Type>(
+      Type::buildTypeFromString("struct<c0:int>"));
+    WriterOptions options;
+    options.setStripeSize(1024)
+      .setCompressionBlockSize(1024)
+      .setCompression(kind)
+      .setMemoryPool(pool);
+
+    auto writer = createWriter(*type, &memStream, options);
+    auto batch = writer->createRowBatch(rowCount);
+    auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+    auto& longBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+    uint64_t rows = 0;
+    uint64_t batchSize = 10000;
+    for (uint64_t i = 0; i < rowCount; ++i) {
+      longBatch.data[i] = static_cast<int64_t>(i);
+      ++rows;
+      if (rows == batchSize) {
+        structBatch.numElements = rows;
+        longBatch.numElements = rows;
+        writer->add(*batch);
+        rows = 0;
+      }
+    }
+    if (rows != 0) {
+      structBatch.numElements = rows;
+      longBatch.numElements = rows;
+      writer->add(*batch);
+      rows = 0;
+    }
+    writer->close();
+  }
+
+  TEST(WriterTest, suppressPresentStreamWithCompressionKinds) {
+    testSuppressPresentStream(CompressionKind_ZLIB);
+    testSuppressPresentStream(CompressionKind_ZSTD);
+    testSuppressPresentStream(CompressionKind_LZ4);
+    testSuppressPresentStream(CompressionKind_SNAPPY);
+  }
+
   INSTANTIATE_TEST_CASE_P(OrcTest, WriterTest, Values(FileVersion::v_0_11(), FileVersion::v_0_12(), FileVersion::UNSTABLE_PRE_2_0()));
 }