You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/03/23 10:21:46 UTC

[orc] branch master updated: ORC-763: [C++] Fix ORC timestamp inconsistencies with Java ORC (#661)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new ba45bcc  ORC-763: [C++] Fix ORC timestamp inconsistencies with Java ORC (#661)
ba45bcc is described below

commit ba45bcc17ddf6c9f8a63fbf5bf467c78e9b70062
Author: Gang Wu <ga...@alibaba-inc.com>
AuthorDate: Tue Mar 23 18:21:40 2021 +0800

    ORC-763: [C++] Fix ORC timestamp inconsistencies with Java ORC (#661)
    
    ### What changes were proposed in this pull request?
    The Java ORC reader takes one second off from timestamp values with negative second and nanosecond greater than 999999. The C++ ORC reader/writer should use same logic to be consistent.
    
    ### Why are the changes needed?
    WIthout this fix, we may see data inconsistency of timestamp values from different ORC readers.
    
    ### How was this patch tested?
    It can be tested manually by C++/Java ORC tools.
---
 c++/src/ColumnReader.cc |  2 +-
 c++/src/ColumnWriter.cc |  2 +-
 c++/test/TestWriter.cc  | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index c4fdb33..9694107 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -374,7 +374,7 @@ namespace orc {
         }
         int64_t writerTime = secsBuffer[i] + epochOffset;
         secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
-        if (secsBuffer[i] < 0 && nanoBuffer[i] != 0) {
+        if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) {
           secsBuffer[i] -= 1;
         }
       }
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 684ce13..a7e6485 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1804,7 +1804,7 @@ namespace orc {
         }
         tsStats->update(millsUTC, static_cast<int32_t>(nanos[i] % 1000000));
 
-        if (secs[i] < 0 && nanos[i] != 0) {
+        if (secs[i] < 0 && nanos[i] > 999999) {
           secs[i] += 1;
         }
 
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index f76d0fe..67936de 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -641,6 +641,64 @@ namespace orc {
     }
   }
 
+  TEST_P(WriterTest, writeNegativeTimestamp) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool* pool = getDefaultPool();
+    std::unique_ptr<Type> type(Type::buildTypeFromString("struct<a:timestamp>"));
+    auto writer = createWriter(16 * 1024 * 1024, 64 * 1024,
+      CompressionKind_ZLIB, *type, pool, &memStream, fileVersion);
+    uint64_t batchCount = 5;
+    auto batch = writer->createRowBatch(batchCount * 2);
+    auto structBatch = dynamic_cast<StructVectorBatch *>(batch.get());
+    auto tsBatch = dynamic_cast<TimestampVectorBatch *>(structBatch->fields[0]);
+    structBatch->numElements = batchCount;
+    tsBatch->numElements = batchCount;
+    const int64_t seconds[] = { -2, -1, 0, 1, 2 };
+
+    // write 1st batch with nanosecond <= 999999
+    for (uint64_t i = 0; i < batchCount; ++i) {
+      tsBatch->data[i] = seconds[i];
+      tsBatch->nanoseconds[i] = 999999;
+    }
+    writer->add(*batch);
+
+    // write 2nd batch with nanosecond > 999999
+    for (uint64_t i = 0; i < batchCount; ++i) {
+      tsBatch->data[i] = seconds[i];
+      tsBatch->nanoseconds[i] = 1000000;
+    }
+    writer->add(*batch);
+    writer->close();
+
+    std::unique_ptr<InputStream> inStream(
+      new MemoryInputStream (memStream.getData(), memStream.getLength()));
+    auto reader = createReader(pool, std::move(inStream));
+    auto rowReader = createRowReader(reader.get());
+    batch = rowReader->createRowBatch(batchCount);
+    structBatch = dynamic_cast<StructVectorBatch *>(batch.get());
+    tsBatch = dynamic_cast<TimestampVectorBatch *>(structBatch->fields[0]);
+
+    // read 1st batch with nanosecond <= 999999
+    EXPECT_EQ(true, rowReader->next(*batch));
+    for (uint64_t i = 0; i < batchCount; ++i) {
+      EXPECT_EQ(seconds[i], tsBatch->data[i]);
+      EXPECT_EQ(999999, tsBatch->nanoseconds[i]);
+    }
+
+    // read 2nd batch with nanosecond > 999999
+    EXPECT_EQ(true, rowReader->next(*batch));
+    for (uint64_t i = 0; i < batchCount; ++i) {
+      if (seconds[i] == -1) {
+        // reproduce the JDK bug of java.sql.Timestamp.
+        // make sure the C++ ORC writer has consistent effect.
+        EXPECT_EQ(0, tsBatch->data[i]);
+      } else {
+        EXPECT_EQ(seconds[i], tsBatch->data[i]);
+      }
+      EXPECT_EQ(1000000, tsBatch->nanoseconds[i]);
+    }
+  }
+
   TEST_P(WriterTest, writeCharAndVarcharColumn) {
     MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
     MemoryPool * pool = getDefaultPool();