You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by ga...@apache.org on 2020/02/17 12:48:39 UTC

[orc] branch master updated: ORC-600: Fix StringDictionaryColumnReader to update index buffer correctly

This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new bd48255  ORC-600: Fix StringDictionaryColumnReader to update index buffer correctly
bd48255 is described below

commit bd4825568dca4ce06f8d3428f5e9ced2f53bb6f2
Author: Norbert Luksa <no...@gmail.com>
AuthorDate: Mon Feb 17 13:48:28 2020 +0100

    ORC-600: Fix StringDictionaryColumnReader to update index buffer correctly
    
    When we are scanning with StringDictionaryColumnReader an encoded batch, ORC crashes if we want to read more elements than the current capacity of the batch. It can occur eg. with string lists where it's possible that the offsets defined in the list overflows in the index buffer of the EncodedStringDictionaryBatch.
    
    This commit provides a fix by defining a resize method for EncodedStringDictionaryBatch that also resizes the index buffer.
    
    Tests:
     - Added a test that would cause ORC to crash without the fix.
    
    This fixes #480
---
 c++/include/orc/Vector.hh    |  1 +
 c++/src/Vector.cc            |  7 ++++
 c++/test/TestColumnReader.cc | 88 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 4cb2450..6213769 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -154,6 +154,7 @@ namespace orc {
     EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool);
     virtual ~EncodedStringVectorBatch();
     std::string toString() const;
+    void resize(uint64_t capacity);
     std::shared_ptr<StringDictionary> dictionary;
 
     // index for dictionary entry
diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc
index 5c0e973..37ce67c 100644
--- a/c++/src/Vector.cc
+++ b/c++/src/Vector.cc
@@ -149,6 +149,13 @@ namespace orc {
     return buffer.str();
   }
 
+  void EncodedStringVectorBatch::resize(uint64_t cap) {
+    if (capacity < cap) {
+      StringVectorBatch::resize(cap);
+      index.resize(cap);
+    }
+  }
+
   StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool
                ): ColumnVectorBatch(_capacity, pool),
                   data(pool, _capacity),
diff --git a/c++/test/TestColumnReader.cc b/c++/test/TestColumnReader.cc
index 9f8d8b8..5acb19a 100644
--- a/c++/test/TestColumnReader.cc
+++ b/c++/test/TestColumnReader.cc
@@ -4481,6 +4481,94 @@ TEST(TestColumnReader, testUnionWithManyVariants) {
 }
 
 
+TEST(TestColumnReader, testStringDictinoryIndexOverflow) {
+  MockStripeStreams streams;
+
+  // set getSelectedColumns()
+  std::vector<bool> selectedColumns(2, true);
+  EXPECT_CALL(streams, getSelectedColumns())
+      .WillRepeatedly(testing::Return(selectedColumns));
+
+  // set getEncoding
+  proto::ColumnEncoding directEncoding;
+  directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+  EXPECT_CALL(streams, getEncoding(0))
+      .WillRepeatedly(testing::Return(directEncoding));
+  proto::ColumnEncoding dictionaryEncoding;
+  dictionaryEncoding.set_kind(proto::ColumnEncoding_Kind_DICTIONARY);
+  dictionaryEncoding.set_dictionarysize(2);
+  EXPECT_CALL(streams, getEncoding(1))
+      .WillRepeatedly(testing::Return(dictionaryEncoding));
+
+  // set getStream
+  EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+      .WillRepeatedly(testing::Return(nullptr));
+  // [11110000 for 0..127] * 2
+  const unsigned char buffer1[] = { 0x7f, 0xf0, 0x7f, 0xf0 };
+  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+                                      (buffer1, ARRAY_SIZE(buffer1))));
+  // [0 for x in 1..256*2] + [1 for x in 1..256*3]
+  const unsigned char buffer2[] = { 0x7f, 0x00, 0x00,
+                                    0x7f, 0x00, 0x00,
+                                    0x7f, 0x00, 0x01,
+                                    0x7f, 0x00, 0x01,
+                                    0x7f, 0x00, 0x01 };
+  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+                                      (buffer2, ARRAY_SIZE(buffer2))));
+  const unsigned char buffer3[] = { 0x4f, 0x52, 0x43, 0x4f, 0x77, 0x65, 0x6e };
+  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DICTIONARY_DATA,
+                                      false))
+      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+                                      (buffer3, ARRAY_SIZE(buffer3))));
+  const unsigned char buffer4[] = { 0x02, 0x01, 0x03 };
+  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_LENGTH, false))
+      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+                                      (buffer4, ARRAY_SIZE(buffer4))));
+
+  // create the row type
+  std::unique_ptr<Type> rowType = createStructType();
+  rowType->addStructField("myString",createPrimitiveType(STRING));
+  std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+  EncodedStringVectorBatch *encodedStringBatch =
+      new EncodedStringVectorBatch(1024, *getDefaultPool());
+
+  StructVectorBatch batch(1024, *getDefaultPool());
+  batch.fields.push_back(encodedStringBatch);
+  reader->nextEncoded(batch, 8, 0);
+  ASSERT_EQ(8, batch.numElements);
+  ASSERT_EQ(true, !batch.hasNulls);
+  ASSERT_EQ(8, encodedStringBatch->numElements);
+  ASSERT_EQ(true, encodedStringBatch->hasNulls);
+  reader->nextEncoded(batch, 1100, 0);
+  ASSERT_EQ(1100, batch.numElements);
+  ASSERT_EQ(true, !batch.hasNulls);
+  ASSERT_EQ(1100, encodedStringBatch->numElements);
+  ASSERT_EQ(true, encodedStringBatch->hasNulls);
+  for (size_t i = 0; i < batch.numElements; ++i) {
+    if (i & 4) {
+      EXPECT_EQ(0, encodedStringBatch->notNull[i]);
+    } else {
+      EXPECT_EQ(1, encodedStringBatch->notNull[i]);
+      const char* expected = i < 512 ? "ORC" : "Owen";
+      int64_t index = encodedStringBatch->index.data()[i];
+
+      char* actualString;
+      int64_t actualLength;
+      encodedStringBatch->dictionary->getValueByIndex(index, actualString, actualLength);
+      ASSERT_EQ(strlen(expected), actualLength)
+        << "Wrong length at " << i;
+
+      for (size_t letter = 0; letter < strlen(expected); ++letter) {
+        EXPECT_EQ(expected[letter], actualString[letter])
+        << "Wrong contents at " << i << ", " << letter;
+      }
+    }
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(OrcColumnReaderTest, TestColumnReaderEncoded, Values(true, false));
 
 }  // namespace orc