You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/03/23 10:21:46 UTC
[orc] branch master updated: ORC-763: [C++] Fix ORC timestamp
inconsistencies with Java ORC (#661)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new ba45bcc ORC-763: [C++] Fix ORC timestamp inconsistencies with Java ORC (#661)
ba45bcc is described below
commit ba45bcc17ddf6c9f8a63fbf5bf467c78e9b70062
Author: Gang Wu <ga...@alibaba-inc.com>
AuthorDate: Tue Mar 23 18:21:40 2021 +0800
ORC-763: [C++] Fix ORC timestamp inconsistencies with Java ORC (#661)
### What changes were proposed in this pull request?
The Java ORC reader takes one second off from timestamp values with negative second and nanosecond greater than 999999. The C++ ORC reader/writer should use same logic to be consistent.
### Why are the changes needed?
WIthout this fix, we may see data inconsistency of timestamp values from different ORC readers.
### How was this patch tested?
It can be tested manually by C++/Java ORC tools.
---
c++/src/ColumnReader.cc | 2 +-
c++/src/ColumnWriter.cc | 2 +-
c++/test/TestWriter.cc | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index c4fdb33..9694107 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -374,7 +374,7 @@ namespace orc {
}
int64_t writerTime = secsBuffer[i] + epochOffset;
secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
- if (secsBuffer[i] < 0 && nanoBuffer[i] != 0) {
+ if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) {
secsBuffer[i] -= 1;
}
}
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 684ce13..a7e6485 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1804,7 +1804,7 @@ namespace orc {
}
tsStats->update(millsUTC, static_cast<int32_t>(nanos[i] % 1000000));
- if (secs[i] < 0 && nanos[i] != 0) {
+ if (secs[i] < 0 && nanos[i] > 999999) {
secs[i] += 1;
}
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index f76d0fe..67936de 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -641,6 +641,64 @@ namespace orc {
}
}
+ TEST_P(WriterTest, writeNegativeTimestamp) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool* pool = getDefaultPool();
+ std::unique_ptr<Type> type(Type::buildTypeFromString("struct<a:timestamp>"));
+ auto writer = createWriter(16 * 1024 * 1024, 64 * 1024,
+ CompressionKind_ZLIB, *type, pool, &memStream, fileVersion);
+ uint64_t batchCount = 5;
+ auto batch = writer->createRowBatch(batchCount * 2);
+ auto structBatch = dynamic_cast<StructVectorBatch *>(batch.get());
+ auto tsBatch = dynamic_cast<TimestampVectorBatch *>(structBatch->fields[0]);
+ structBatch->numElements = batchCount;
+ tsBatch->numElements = batchCount;
+ const int64_t seconds[] = { -2, -1, 0, 1, 2 };
+
+ // write 1st batch with nanosecond <= 999999
+ for (uint64_t i = 0; i < batchCount; ++i) {
+ tsBatch->data[i] = seconds[i];
+ tsBatch->nanoseconds[i] = 999999;
+ }
+ writer->add(*batch);
+
+ // write 2nd batch with nanosecond > 999999
+ for (uint64_t i = 0; i < batchCount; ++i) {
+ tsBatch->data[i] = seconds[i];
+ tsBatch->nanoseconds[i] = 1000000;
+ }
+ writer->add(*batch);
+ writer->close();
+
+ std::unique_ptr<InputStream> inStream(
+ new MemoryInputStream (memStream.getData(), memStream.getLength()));
+ auto reader = createReader(pool, std::move(inStream));
+ auto rowReader = createRowReader(reader.get());
+ batch = rowReader->createRowBatch(batchCount);
+ structBatch = dynamic_cast<StructVectorBatch *>(batch.get());
+ tsBatch = dynamic_cast<TimestampVectorBatch *>(structBatch->fields[0]);
+
+ // read 1st batch with nanosecond <= 999999
+ EXPECT_EQ(true, rowReader->next(*batch));
+ for (uint64_t i = 0; i < batchCount; ++i) {
+ EXPECT_EQ(seconds[i], tsBatch->data[i]);
+ EXPECT_EQ(999999, tsBatch->nanoseconds[i]);
+ }
+
+ // read 2nd batch with nanosecond > 999999
+ EXPECT_EQ(true, rowReader->next(*batch));
+ for (uint64_t i = 0; i < batchCount; ++i) {
+ if (seconds[i] == -1) {
+ // reproduce the JDK bug of java.sql.Timestamp.
+ // make sure the C++ ORC writer has consistent effect.
+ EXPECT_EQ(0, tsBatch->data[i]);
+ } else {
+ EXPECT_EQ(seconds[i], tsBatch->data[i]);
+ }
+ EXPECT_EQ(1000000, tsBatch->nanoseconds[i]);
+ }
+ }
+
TEST_P(WriterTest, writeCharAndVarcharColumn) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool * pool = getDefaultPool();