You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/01/04 20:00:09 UTC

[orc] branch main updated: ORC-1055: [C++] Add the timezone option for the csv-import tool (#975)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 9a66348  ORC-1055: [C++] Add the timezone option for the csv-import tool (#975)
9a66348 is described below

commit 9a663489f0a79224a3ac3e9d93d90afaf8a7f747
Author: coderex2522 <re...@gmail.com>
AuthorDate: Wed Jan 5 03:59:20 2022 +0800

    ORC-1055: [C++] Add the timezone option for the csv-import tool (#975)
    
    ### What changes were proposed in this pull request?
    
    This pull request adds support for a timezone option to the csv-import tool.
    
    ### Why are the changes needed?
    
    This is a new option to mitigate the ORC-1055 situation.
    
    ### How was this patch tested?
    
    The unit test case is TestCSVFileImport.testTimezoneOption in TestCSVFileImport.cc.
---
 examples/TestCSVFileImport.testTimezoneOption.csv |  1 +
 tools/src/CSVFileImport.cc                        | 10 +++++-
 tools/test/TestCSVFileImport.cc                   | 37 +++++++++++++++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/examples/TestCSVFileImport.testTimezoneOption.csv b/examples/TestCSVFileImport.testTimezoneOption.csv
new file mode 100644
index 0000000..9e87f23
--- /dev/null
+++ b/examples/TestCSVFileImport.testTimezoneOption.csv
@@ -0,0 +1 @@
+2021-12-27 00:00:00.000
\ No newline at end of file
diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc
index 3857adc..7bbc9c6 100644
--- a/tools/src/CSVFileImport.cc
+++ b/tools/src/CSVFileImport.cc
@@ -280,8 +280,10 @@ void usage() {
             << "                  [-s <size>] [--stripe=<size>]\n"
             << "                  [-c <size>] [--block=<size>]\n"
             << "                  [-b <size>] [--batch=<size>]\n"
+            << "                  [-t <string>] [--timezone=<string>]\n"
             << "                  <schema> <input> <output>\n"
             << "Import CSV file into an Orc file using the specified schema.\n"
+            << "The timezone is writer timezone of timestamp types.\n"
             << "Compound types are not yet supported.\n";
 }
 
@@ -289,6 +291,7 @@ int main(int argc, char* argv[]) {
   std::string input;
   std::string output;
   std::string schema;
+  std::string timezoneName="GMT";
   uint64_t stripeSize = (128 << 20); // 128M
   uint64_t blockSize = 64 << 10;     // 64K
   uint64_t batchSize = 1024;
@@ -300,13 +303,14 @@ int main(int argc, char* argv[]) {
     {"stripe", required_argument, ORC_NULLPTR, 'p'},
     {"block", required_argument, ORC_NULLPTR, 'c'},
     {"batch", required_argument, ORC_NULLPTR, 'b'},
+    {"timezone", required_argument, ORC_NULLPTR, 't'},
     {ORC_NULLPTR, 0, ORC_NULLPTR, 0}
   };
   bool helpFlag = false;
   int opt;
   char *tail;
   do {
-    opt = getopt_long(argc, argv, "i:o:s:b:c:p:h", longOptions, ORC_NULLPTR);
+    opt = getopt_long(argc, argv, "i:o:s:b:c:p:t:h", longOptions, ORC_NULLPTR);
     switch (opt) {
       case '?':
       case 'h':
@@ -337,6 +341,9 @@ int main(int argc, char* argv[]) {
           return 1;
         }
         break;
+      case 't':
+        timezoneName = std::string(optarg);
+        break;
     }
   } while (opt != -1);
 
@@ -364,6 +371,7 @@ int main(int argc, char* argv[]) {
   options.setStripeSize(stripeSize);
   options.setCompressionBlockSize(blockSize);
   options.setCompression(compression);
+  options.setTimezoneName(timezoneName);
 
   ORC_UNIQUE_PTR<orc::OutputStream> outStream = orc::writeLocalFile(output);
   ORC_UNIQUE_PTR<orc::Writer> writer =
diff --git a/tools/test/TestCSVFileImport.cc b/tools/test/TestCSVFileImport.cc
index 79bc3de..b4cd967 100644
--- a/tools/test/TestCSVFileImport.cc
+++ b/tools/test/TestCSVFileImport.cc
@@ -53,3 +53,40 @@ TEST (TestCSVFileImport, test10rows) {
   EXPECT_EQ(expected, output);
   EXPECT_EQ("", error);
 }
+
+TEST (TestCSVFileImport, testTimezoneOption) {
+  // create an ORC file from importing the CSV file
+  const std::string pgm1 = findProgram("tools/src/csv-import");
+  const std::string pgm2 = findProgram("tools/src/orc-contents");
+  const std::string csvFile =
+    findExample("TestCSVFileImport.testTimezoneOption.csv");
+  const std::string schema = "'struct<_a:timestamp>'";
+  std::string output;
+  std::string error;
+  {
+    std::string orcFile = "/tmp/test_csv_import_test_timezone_option1.orc";
+    std::string option = "--timezone=America/Los_Angeles";
+    EXPECT_EQ(0, runProgram({pgm1, option, schema, csvFile, orcFile},
+                            output, error));
+    EXPECT_EQ("", error);
+    // verify the ORC file content
+    const std::string expected =
+      "{\"_a\": \"2021-12-26 16:00:00.0\"}\n";
+    EXPECT_EQ(0, runProgram({pgm2, orcFile}, output, error));
+    EXPECT_EQ(expected, output);
+    EXPECT_EQ("", error);
+  }
+  {
+    std::string orcFile = "/tmp/test_csv_import_test_timezone_option2.orc";
+    std::string option = "--timezone=Europe/Paris";
+    EXPECT_EQ(0, runProgram({pgm1, option, schema, csvFile, orcFile},
+                            output, error));
+    EXPECT_EQ("", error);
+    // verify the ORC file content
+    const std::string expected =
+      "{\"_a\": \"2021-12-27 01:00:00.0\"}\n";
+    EXPECT_EQ(0, runProgram({pgm2, orcFile}, output, error));
+    EXPECT_EQ(expected, output);
+    EXPECT_EQ("", error);
+  }
+}