You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by md...@apache.org on 2018/04/04 01:45:14 UTC

orc git commit: ORC-322: [C++] Fix writing & reading timestamp

Repository: orc
Updated Branches:
  refs/heads/master 8f4fcd120 -> 8ea3592f6


ORC-322: [C++] Fix writing & reading timestamp

Fixes #233

Signed-off-by: Deepak Majeti <md...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/8ea3592f
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/8ea3592f
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/8ea3592f

Branch: refs/heads/master
Commit: 8ea3592f6aeddb5e6930798e4ff95065c6177e00
Parents: 8f4fcd1
Author: Gang Wu <ga...@alibaba-inc.com>
Authored: Sat Mar 17 01:17:02 2018 +0800
Committer: Deepak Majeti <md...@apache.org>
Committed: Tue Apr 3 21:43:27 2018 -0400

----------------------------------------------------------------------
 c++/include/orc/Vector.hh  | 3 +++
 c++/src/ColumnReader.cc    | 3 +--
 c++/src/ColumnWriter.cc    | 3 +--
 c++/src/Timezone.cc        | 4 ++++
 c++/src/Timezone.hh        | 5 +++++
 c++/src/Writer.cc          | 4 +++-
 c++/test/TestWriter.cc     | 2 +-
 tools/src/CSVFileImport.cc | 5 ++++-
 8 files changed, 22 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/include/orc/Vector.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 65101db..86a9c54 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -258,6 +258,9 @@ namespace orc {
     uint64_t getMemoryUsage();
 
     // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t)
+    // Note that we always assume data is in GMT timezone; therefore it is
+    // the user's responsibility to convert wall clock time in local timezone
+    // to GMT.
     DataBuffer<int64_t> data;
 
     // the nanoseconds of each value

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/ColumnReader.cc
----------------------------------------------------------------------
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index ee2d80d..d4a5691 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -336,8 +336,7 @@ namespace orc {
           }
         }
         int64_t writerTime = secsBuffer[i] + epochOffset;
-        secsBuffer[i] = writerTime +
-          writerTimezone.getVariant(writerTime).gmtOffset;
+        secsBuffer[i] = writerTimezone.convertToUTC(writerTime);
         if (secsBuffer[i] < 0 && nanoBuffer[i] != 0) {
           secsBuffer[i] -= 1;
         }

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/ColumnWriter.cc
----------------------------------------------------------------------
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 9453180..e837665 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1132,7 +1132,7 @@ namespace orc {
                              const WriterOptions& options) :
                                  ColumnWriter(type, factory, options),
                                  rleVersion(RleVersion_1),
-                                 timezone(getLocalTimezone()){
+                                 timezone(getTimezoneByName("GMT")){
     std::unique_ptr<BufferedOutputStream> dataStream =
         factory.createStream(proto::Stream_Kind_DATA);
     std::unique_ptr<BufferedOutputStream> secondaryStream =
@@ -1199,7 +1199,6 @@ namespace orc {
         tsStats->increase(1);
         tsStats->update(millsUTC);
 
-        secs[i] -= timezone.getVariant(secs[i]).gmtOffset;
         secs[i] -= timezone.getEpoch();
         nanos[i] = formatNano(nanos[i]);
       } else if (!hasNull) {

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Timezone.cc
----------------------------------------------------------------------
diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc
index 6083293..9d56d7f 100644
--- a/c++/src/Timezone.cc
+++ b/c++/src/Timezone.cc
@@ -605,6 +605,10 @@ namespace orc {
       return epoch;
     }
 
+    int64_t convertToUTC(int64_t clk) const override {
+      return clk + getVariant(clk).gmtOffset;
+    }
+
   private:
     void parseTimeVariants(const unsigned char* ptr,
                            uint64_t variantOffset,

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Timezone.hh
----------------------------------------------------------------------
diff --git a/c++/src/Timezone.hh b/c++/src/Timezone.hh
index b1dbd6a..136b7a1 100644
--- a/c++/src/Timezone.hh
+++ b/c++/src/Timezone.hh
@@ -76,6 +76,11 @@ namespace orc {
      * Get the version of the zone file.
      */
     virtual uint64_t getVersion() const =0;
+
+    /**
+     * Convert a wall clock time in this timezone to UTC
+     */
+    virtual int64_t convertToUTC(int64_t clk) const = 0;
   };
 
   /**

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/src/Writer.cc
----------------------------------------------------------------------
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index 22f5750..016ce35 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -382,7 +382,9 @@ namespace orc {
       *stripeFooter.add_columns() = encodings[i];
     }
 
-    // TODO: ORC-205 Include writer timezone in stripe footer
+    // use GMT to guarantee TimestampVectorBatch from reader can write
+    // same wall clock time
+    stripeFooter.set_writertimezone("GMT");
 
     // add stripe statistics to metadata
     proto::StripeStatistics* stripeStats = metadata.add_stripestats();

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/c++/test/TestWriter.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index 4e6dbf0..c61d184 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -554,7 +554,7 @@ namespace orc {
 
     uint64_t stripeSize = 16 * 1024;
     uint64_t compressionBlockSize = 1024;
-    uint64_t rowCount = 1024;
+    uint64_t rowCount = 102400;
 
     std::unique_ptr<Writer> writer = createWriter(stripeSize,
                                                   compressionBlockSize,

http://git-wip-us.apache.org/repos/asf/orc/blob/8ea3592f/tools/src/CSVFileImport.cc
----------------------------------------------------------------------
diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc
index d40d5b5..2901636 100644
--- a/tools/src/CSVFileImport.cc
+++ b/tools/src/CSVFileImport.cc
@@ -18,6 +18,7 @@
 
 #include "orc/Exceptions.hh"
 #include "orc/OrcFile.hh"
+#include "Timezone.hh"
 
 #include <algorithm>
 #include <fstream>
@@ -233,6 +234,7 @@ void fillTimestampValues(const std::vector<std::string>& data,
                          orc::ColumnVectorBatch* batch,
                          uint64_t numValues,
                          uint64_t colIndex) {
+  const orc::Timezone& localTZ = orc::getLocalTimezone();
   orc::TimestampVectorBatch* tsBatch =
     dynamic_cast<orc::TimestampVectorBatch*>(batch);
   bool hasNull = false;
@@ -243,7 +245,8 @@ void fillTimestampValues(const std::vector<std::string>& data,
       hasNull = true;
     } else {
       batch->notNull[i] = 1;
-      tsBatch->data[i] = atoll(col.c_str());
+      // data is in local timezone
+      tsBatch->data[i] = localTZ.convertToUTC(atoll(col.c_str()));
       tsBatch->nanoseconds[i] = 0;
     }
   }