You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by md...@apache.org on 2018/04/04 01:45:14 UTC
orc git commit: ORC-322: [C++] Fix writing & reading timestamp
Repository: orc
Updated Branches:
refs/heads/master 8f4fcd120 -> 8ea3592f6
ORC-322: [C++] Fix writing & reading timestamp
Fixes #233
Signed-off-by: Deepak Majeti <md...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/8ea3592f
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/8ea3592f
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/8ea3592f
Branch: refs/heads/master
Commit: 8ea3592f6aeddb5e6930798e4ff95065c6177e00
Parents: 8f4fcd1
Author: Gang Wu <ga...@alibaba-inc.com>
Authored: Sat Mar 17 01:17:02 2018 +0800
Committer: Deepak Majeti <md...@apache.org>
Committed: Tue Apr 3 21:43:27 2018 -0400
----------------------------------------------------------------------
c++/include/orc/Vector.hh | 3 +++
c++/src/ColumnReader.cc | 3 +--
c++/src/ColumnWriter.cc | 3 +--
c++/src/Timezone.cc | 4 ++++
c++/src/Timezone.hh | 5 +++++
c++/src/Writer.cc | 4 +++-
c++/test/TestWriter.cc | 2 +-
tools/src/CSVFileImport.cc | 5 ++++-
8 files changed, 22 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/include/orc/Vector.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 65101db..86a9c54 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -258,6 +258,9 @@ namespace orc {
uint64_t getMemoryUsage();
// the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t)
+ // Note that we always assume data is in the GMT timezone; therefore it is
+ // the user's responsibility to convert wall clock time in the local timezone
+ // to GMT.
DataBuffer<int64_t> data;
// the nanoseconds of each value
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/ColumnReader.cc
----------------------------------------------------------------------
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index ee2d80d..d4a5691 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -336,8 +336,7 @@ namespace orc {
}
}
int64_t writerTime = secsBuffer[i] + epochOffset;
- secsBuffer[i] = writerTime +
- writerTimezone.getVariant(writerTime).gmtOffset;
+ secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
if (secsBuffer[i] < 0 && nanoBuffer[i] != 0) {
secsBuffer[i] -= 1;
}
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/ColumnWriter.cc
----------------------------------------------------------------------
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 9453180..e837665 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1132,7 +1132,7 @@ namespace orc {
const WriterOptions& options) :
ColumnWriter(type, factory, options),
rleVersion(RleVersion_1),
- timezone(getLocalTimezone()){
+ timezone(getTimezoneByName("GMT")){
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
std::unique_ptr<BufferedOutputStream> secondaryStream =
@@ -1199,7 +1199,6 @@ namespace orc {
tsStats->increase(1);
tsStats->update(millsUTC);
- secs[i] -= timezone.getVariant(secs[i]).gmtOffset;
secs[i] -= timezone.getEpoch();
nanos[i] = formatNano(nanos[i]);
} else if (!hasNull) {
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Timezone.cc
----------------------------------------------------------------------
diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc
index 6083293..9d56d7f 100644
--- a/c++/src/Timezone.cc
+++ b/c++/src/Timezone.cc
@@ -605,6 +605,10 @@ namespace orc {
return epoch;
}
+ int64_t convertToUTC(int64_t clk) const override {
+ return clk + getVariant(clk).gmtOffset;
+ }
+
private:
void parseTimeVariants(const unsigned char* ptr,
uint64_t variantOffset,
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Timezone.hh
----------------------------------------------------------------------
diff --git a/c++/src/Timezone.hh b/c++/src/Timezone.hh
index b1dbd6a..136b7a1 100644
--- a/c++/src/Timezone.hh
+++ b/c++/src/Timezone.hh
@@ -76,6 +76,11 @@ namespace orc {
* Get the version of the zone file.
*/
virtual uint64_t getVersion() const =0;
+
+ /**
+ * Convert wall clock time in this timezone to UTC
+ */
+ virtual int64_t convertToUTC(int64_t clk) const = 0;
};
/**
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Writer.cc
----------------------------------------------------------------------
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index 22f5750..016ce35 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -382,7 +382,9 @@ namespace orc {
*stripeFooter.add_columns() = encodings[i];
}
- // TODO: ORC-205 Include writer timezone in stripe footer
+ // use GMT to guarantee that a TimestampVectorBatch from the reader can write
+ // the same wall clock time
+ stripeFooter.set_writertimezone("GMT");
// add stripe statistics to metadata
proto::StripeStatistics* stripeStats = metadata.add_stripestats();
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/test/TestWriter.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index 4e6dbf0..c61d184 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -554,7 +554,7 @@ namespace orc {
uint64_t stripeSize = 16 * 1024;
uint64_t compressionBlockSize = 1024;
- uint64_t rowCount = 1024;
+ uint64_t rowCount = 102400;
std::unique_ptr<Writer> writer = createWriter(stripeSize,
compressionBlockSize,
http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/tools/src/CSVFileImport.cc
----------------------------------------------------------------------
diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc
index d40d5b5..2901636 100644
--- a/tools/src/CSVFileImport.cc
+++ b/tools/src/CSVFileImport.cc
@@ -18,6 +18,7 @@
#include "orc/Exceptions.hh"
#include "orc/OrcFile.hh"
+#include "Timezone.hh"
#include <algorithm>
#include <fstream>
@@ -233,6 +234,7 @@ void fillTimestampValues(const std::vector<std::string>& data,
orc::ColumnVectorBatch* batch,
uint64_t numValues,
uint64_t colIndex) {
+ const orc::Timezone& localTZ = orc::getLocalTimezone();
orc::TimestampVectorBatch* tsBatch =
dynamic_cast<orc::TimestampVectorBatch*>(batch);
bool hasNull = false;
@@ -243,7 +245,8 @@ void fillTimestampValues(const std::vector<std::string>& data,
hasNull = true;
} else {
batch->notNull[i] = 1;
- tsBatch->data[i] = atoll(col.c_str());
+ // data is in local timezone
+ tsBatch->data[i] = localTZ.convertToUTC(atoll(col.c_str()));
tsBatch->nanoseconds[i] = 0;
}
}