You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by jr...@apache.org on 2018/01/10 18:48:03 UTC

[1/4] impala git commit: IMPALA-6128: Add support for AES-CTR encryption when spilling to disk

Repository: impala
Updated Branches:
  refs/heads/master f810458ca -> 31c6a1719


IMPALA-6128: Add support for AES-CTR encryption when spilling to disk

CFB mode is a stream cipher and is secure when used with a different nonce/IV
for every message. However, it can be a performance bottleneck.
CTR mode is also a stream cipher and is secure, and it is 4~6x faster than CFB
mode in OpenSSL. AES-CTR+SHA256 is about 40~70% faster than AES-CFB+SHA256.

CTR mode is used if the OpenSSL version at runtime is >= 1.0.1; otherwise we
fall back to using CFB mode.

Testing:
run runtime tmp-file-mgr-test, openssl-util-test, buffer-pool-test and
buffered-tuple-stream-test
The unit test case openssl-util-test.EncryptInPlace tests encryption in both modes.

Change-Id: I9debc240615dd8cdbf00ec8730cff62ffef52aff
Reviewed-on: http://gerrit.cloudera.org:8080/8861
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/514dfaf9
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/514dfaf9
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/514dfaf9

Branch: refs/heads/master
Commit: 514dfaf9fdff219256eaa9baf3efcc66bfdfafda
Parents: f810458
Author: Xianda Ke <ke...@gmail.com>
Authored: Sun Nov 26 15:35:22 2017 +0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 10 05:39:09 2018 +0000

----------------------------------------------------------------------
 be/src/runtime/tmp-file-mgr.cc   |  4 +--
 be/src/util/openssl-util-test.cc | 65 ++++++++++++++++++++---------------
 be/src/util/openssl-util.cc      | 32 ++++++++++++-----
 be/src/util/openssl-util.h       | 41 +++++++++++++++++-----
 4 files changed, 96 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/514dfaf9/be/src/runtime/tmp-file-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 24217de..650af0b 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -612,8 +612,8 @@ void TmpFileMgr::WriteHandle::WaitForWrite() {
 Status TmpFileMgr::WriteHandle::EncryptAndHash(MemRange buffer) {
   DCHECK(FLAGS_disk_spill_encryption);
   SCOPED_TIMER(encryption_timer_);
-  // Since we're using AES-CFB mode, we must take care not to reuse a key/IV pair.
-  // Regenerate a new key and IV for every data buffer we write.
+  // Since we're using AES-CTR/AES-CFB mode, we must take care not to reuse a
+  // key/IV pair. Regenerate a new key and IV for every data buffer we write.
   key_.InitializeRandom();
   RETURN_IF_ERROR(key_.Encrypt(buffer.data(), buffer.len(), buffer.data()));
   hash_.Compute(buffer.data(), buffer.len());

http://git-wip-us.apache.org/repos/asf/impala/blob/514dfaf9/be/src/util/openssl-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/openssl-util-test.cc b/be/src/util/openssl-util-test.cc
index ef1b28e..8d98b0d 100644
--- a/be/src/util/openssl-util-test.cc
+++ b/be/src/util/openssl-util-test.cc
@@ -56,24 +56,29 @@ TEST_F(OpenSSLUtilTest, Encryption) {
   vector<uint8_t> decrypted(buffer_size);
   GenerateRandomData(original.data(), buffer_size);
 
-  // Iterate multiple times to ensure that key regeneration works correctly.
-  EncryptionKey key;
-  for (int i = 0; i < 2; ++i) {
-    key.InitializeRandom(); // Generate a new key for each iteration.
-
-    // Check that OpenSSL is happy with the amount of entropy we're feeding it.
-    DCHECK_EQ(1, RAND_status());
-
-    ASSERT_OK(key.Encrypt(original.data(), buffer_size, encrypted.data()));
-    if (i > 0) {
-      // Check that we're not somehow reusing the same key.
-      ASSERT_NE(0, memcmp(encrypted.data(), prev_encrypted.data(), buffer_size));
+  // Check both CTR & CFB
+  AES_CIPHER_MODE modes[] = {AES_256_CTR, AES_256_CFB};
+  for (auto m : modes) {
+    // Iterate multiple times to ensure that key regeneration works correctly.
+    EncryptionKey key;
+    for (int i = 0; i < 2; ++i) {
+      key.InitializeRandom(); // Generate a new key for each iteration.
+      key.SetCipherMode(m);
+
+      // Check that OpenSSL is happy with the amount of entropy we're feeding it.
+      DCHECK_EQ(1, RAND_status());
+
+      ASSERT_OK(key.Encrypt(original.data(), buffer_size, encrypted.data()));
+      if (i > 0) {
+        // Check that we're not somehow reusing the same key.
+        ASSERT_NE(0, memcmp(encrypted.data(), prev_encrypted.data(), buffer_size));
+      }
+      memcpy(prev_encrypted.data(), encrypted.data(), buffer_size);
+
+      // We should get the original data by decrypting it.
+      ASSERT_OK(key.Decrypt(encrypted.data(), buffer_size, decrypted.data()));
+      ASSERT_EQ(0, memcmp(original.data(), decrypted.data(), buffer_size));
     }
-    memcpy(prev_encrypted.data(), encrypted.data(), buffer_size);
-
-    // We should get the original data by decrypting it.
-    ASSERT_OK(key.Decrypt(encrypted.data(), buffer_size, decrypted.data()));
-    ASSERT_EQ(0, memcmp(original.data(), decrypted.data(), buffer_size));
   }
 }
 
@@ -83,17 +88,23 @@ TEST_F(OpenSSLUtilTest, EncryptInPlace) {
   vector<uint8_t> original(buffer_size);
   vector<uint8_t> scratch(buffer_size); // Scratch buffer for in-place encryption.
 
-  GenerateRandomData(original.data(), buffer_size);
-  memcpy(scratch.data(), original.data(), buffer_size);
-
   EncryptionKey key;
-  key.InitializeRandom();
-  ASSERT_OK(key.Encrypt(scratch.data(), buffer_size, scratch.data()));
-  // Check that encryption did something
-  ASSERT_NE(0, memcmp(original.data(), scratch.data(), buffer_size));
-  ASSERT_OK(key.Decrypt(scratch.data(), buffer_size, scratch.data()));
-  // Check that we get the original data back.
-  ASSERT_EQ(0, memcmp(original.data(), scratch.data(), buffer_size));
+  // Check both CTR & CFB
+  AES_CIPHER_MODE modes[] = {AES_256_CTR, AES_256_CFB};
+  for (auto m : modes) {
+    GenerateRandomData(original.data(), buffer_size);
+    memcpy(scratch.data(), original.data(), buffer_size);
+
+    key.InitializeRandom();
+    key.SetCipherMode(m);
+
+    ASSERT_OK(key.Encrypt(scratch.data(), buffer_size, scratch.data()));
+    // Check that encryption did something
+    ASSERT_NE(0, memcmp(original.data(), scratch.data(), buffer_size));
+    ASSERT_OK(key.Decrypt(scratch.data(), buffer_size, scratch.data()));
+    // Check that we get the original data back.
+    ASSERT_EQ(0, memcmp(original.data(), scratch.data(), buffer_size));
+  }
 }
 
 /// Test that encryption works with buffer lengths that don't fit in a 32-bit integer.

http://git-wip-us.apache.org/repos/asf/impala/blob/514dfaf9/be/src/util/openssl-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/openssl-util.cc b/be/src/util/openssl-util.cc
index e3b2299..a8ec976 100644
--- a/be/src/util/openssl-util.cc
+++ b/be/src/util/openssl-util.cc
@@ -26,6 +26,7 @@
 #include <openssl/sha.h>
 
 #include "common/atomic.h"
+#include "gutil/port.h" // ATTRIBUTE_WEAK
 #include "gutil/strings/substitute.h"
 
 #include "common/names.h"
@@ -99,13 +100,15 @@ Status EncryptionKey::EncryptInternal(
   EVP_CIPHER_CTX_init(&ctx);
   EVP_CIPHER_CTX_set_padding(&ctx, 0);
 
-  int success;
-
   // Start encryption/decryption.  We use a 256-bit AES key, and the cipher block mode
-  // is CFB because this gives us a stream cipher, which supports arbitrary
-  // length ciphertexts - it doesn't have to be a multiple of 16 bytes.
-  success = encrypt ? EVP_EncryptInit_ex(&ctx, EVP_aes_256_cfb(), NULL, key_, iv_) :
-                      EVP_DecryptInit_ex(&ctx, EVP_aes_256_cfb(), NULL, key_, iv_);
+  // is either CTR or CFB (stream cipher modes), both of which support arbitrary-length
+  // ciphertexts - the length doesn't have to be a multiple of 16 bytes. Additionally,
+  // CTR mode is well optimized (instruction-level parallelism) with hardware
+  // acceleration on x86 and PowerPC.
+  const EVP_CIPHER* evpCipher = GetCipher();
+  int success = encrypt ? EVP_EncryptInit_ex(&ctx, evpCipher, NULL, key_, iv_) :
+                          EVP_DecryptInit_ex(&ctx, evpCipher, NULL, key_, iv_);
+
   if (success != 1) {
     return OpenSSLErr(encrypt ? "EVP_EncryptInit_ex" : "EVP_DecryptInit_ex");
   }
@@ -122,7 +125,7 @@ Status EncryptionKey::EncryptInternal(
     if (success != 1) {
       return OpenSSLErr(encrypt ? "EVP_EncryptUpdate" : "EVP_DecryptUpdate");
     }
-    // This is safe because we're using CFB mode without padding.
+    // This is safe because we're using CTR/CFB mode without padding.
     DCHECK_EQ(in_len, out_len);
     offset += in_len;
   }
@@ -134,8 +137,21 @@ Status EncryptionKey::EncryptInternal(
   if (success != 1) {
     return OpenSSLErr(encrypt ? "EVP_EncryptFinal" : "EVP_DecryptFinal");
   }
-  // Again safe due to CFB with no padding
+  // Again safe due to CTR/CFB with no padding
   DCHECK_EQ(final_out_len, 0);
   return Status::OK();
 }
+
+extern "C" {
+ATTRIBUTE_WEAK
+const EVP_CIPHER* EVP_aes_256_ctr();
+}
+
+const EVP_CIPHER* EncryptionKey::GetCipher() const {
+  // Use a weak symbol to avoid a compile error in an OpenSSL 1.0.0 environment.
+  if (mode_ == AES_256_CTR && EVP_aes_256_ctr) return EVP_aes_256_ctr();
+
+  // Otherwise, fall back to CFB mode.
+  return EVP_aes_256_cfb();
+}
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/514dfaf9/be/src/util/openssl-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/openssl-util.h b/be/src/util/openssl-util.h
index 4b32db6..22f8235 100644
--- a/be/src/util/openssl-util.h
+++ b/be/src/util/openssl-util.h
@@ -19,16 +19,25 @@
 #define IMPALA_UTIL_OPENSSL_UTIL_H
 
 #include <openssl/aes.h>
+#include <openssl/evp.h>
 #include <openssl/sha.h>
 
 #include "common/status.h"
 
 namespace impala {
 
+#define OPENSSL_VERSION_1_0_1 0x1000100L
+
 /// Add entropy from the system RNG to OpenSSL's global RNG. Called at system startup
 /// and again periodically to add new entropy.
 void SeedOpenSSLRNG();
 
+enum AES_CIPHER_MODE {
+  AES_256_CTR,
+  AES_256_CFB,
+  AES_256_GCM // not supported now.
+};
+
 /// The hash of a data buffer used for checking integrity. A SHA256 hash is used
 /// internally.
 class IntegrityHash {
@@ -47,20 +56,23 @@ class IntegrityHash {
 /// The key and initialization vector (IV) required to encrypt and decrypt a buffer of
 /// data. This should be regenerated for each buffer of data.
 ///
-/// We use AES with a 256-bit key and CFB cipher block mode, which gives us a stream
-/// cipher that can support arbitrary-length ciphertexts. The IV is used as an input to
-/// the cipher as the "block to supply before the first block of plaintext". This is
-/// required because all ciphers (except the weak ECB) are built such that each block
-/// depends on the output from the previous block. Since the first block doesn't have
-/// a previous block, we supply this IV. Think of it as starting off the chain of
+/// We use AES with a 256-bit key and CTR/CFB cipher block mode, which gives us a stream
+/// cipher that can support arbitrary-length ciphertexts. If OpenSSL version at runtime
+/// is 1.0.1 or above, CTR mode is used, otherwise CFB mode is used. The IV is used as
+/// an input to the cipher as the "block to supply before the first block of plaintext".
+/// This is required because all ciphers (except the weak ECB) are built such that each
+/// block depends on the output from the previous block. Since the first block doesn't
+/// have a previous block, we supply this IV. Think of it as starting off the chain of
 /// encryption.
 class EncryptionKey {
  public:
-  EncryptionKey() : initialized_(false) {}
+  EncryptionKey() : initialized_(false) {
+    mode_ = SSLeay() < OPENSSL_VERSION_1_0_1 ? AES_256_CFB : AES_256_CTR;
+  }
 
   /// Initialize a key for temporary use with randomly generated data. Reinitializes with
-  /// new random values if the key was already initialized. We use AES-CFB mode so key/IV
-  /// pairs should not be reused. This function automatically reseeds the RNG
+  /// new random values if the key was already initialized. We use AES-CTR/AES-CFB mode
+  /// so key/IV pairs should not be reused. This function automatically reseeds the RNG
   /// periodically, so callers do not need to do it.
   void InitializeRandom();
 
@@ -75,6 +87,11 @@ class EncryptionKey {
   /// otherwise the buffers must not overlap.
   Status Decrypt(const uint8_t* data, int64_t len, uint8_t* out) const WARN_UNUSED_RESULT;
 
+  /// Specify a cipher mode. Currently used only for testing, but in the future we could
+  /// provide a configuration option so that end users can choose a preferred mode
+  /// (GCM, CTR, CFB...) based on their software/hardware environment.
+  void SetCipherMode(AES_CIPHER_MODE m) { mode_ = m; }
+
  private:
   /// Helper method that encrypts/decrypts if 'encrypt' is true/false respectively.
   /// A buffer of input data 'data' of length 'len' is encrypted/decrypted with this
@@ -88,11 +105,17 @@ class EncryptionKey {
   /// uninitialized keys.
   bool initialized_;
 
+  /// Returns an EVP_CIPHER according to the cipher mode at runtime.
+  const EVP_CIPHER* GetCipher() const;
+
   /// An AES 256-bit key.
   uint8_t key_[32];
 
   /// An initialization vector to feed as the first block to AES.
   uint8_t iv_[AES_BLOCK_SIZE];
+
+  /// Cipher Mode
+  AES_CIPHER_MODE mode_;
 };
 }

[3/4] impala git commit: IMPALA-5317: [DOCS] Doc for DATE_TRUNC() function

Posted by jr...@apache.org.

IMPALA-5317: [DOCS] Doc for DATE_TRUNC() function

Change-Id: Ifcf38903bb10db12cbb8d73a2dc875aef29cd359
Reviewed-on: http://gerrit.cloudera.org:8080/8768
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1f4d687a
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1f4d687a
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1f4d687a

Branch: refs/heads/master
Commit: 1f4d687a9bd51a5c869dd806fb31449cdfb34180
Parents: c86b0a9
Author: John Russell <jr...@cloudera.com>
Authored: Tue Dec 5 14:19:22 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 10 18:41:31 2018 +0000

----------------------------------------------------------------------
 docs/impala_keydefs.ditamap               |  4 ++
 docs/shared/impala_common.xml             |  5 +-
 docs/topics/impala_datetime_functions.xml | 96 ++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1f4d687a/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 56de937..02cff8a 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10516,6 +10516,7 @@ under the License.
   <keydef href="https://issues.apache.org/jira/browse/IMPALA-9999" scope="external" format="html" keys="IMPALA-9999"/>
 
 <!-- Short form of mapping from Impala release to vendor-specific releases, for use in headings. -->
+  <keydef keys="impala211"><topicmeta><keywords><keyword>Impala 2.11</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala210"><topicmeta><keywords><keyword>Impala 2.10</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala29"><topicmeta><keywords><keyword>Impala 2.9</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala28"><topicmeta><keywords><keyword>Impala 2.8</keyword></keywords></topicmeta></keydef>
@@ -10531,6 +10532,9 @@ under the License.
   <keydef keys="impala13"><topicmeta><keywords><keyword>Impala 1.3</keyword></keywords></topicmeta></keydef>
 
 <!-- 3-part forms of version numbers, for use in release notes. -->
+<!-- For the 2.11.0 entry, have to space out the digits with underscores to avoid a conflict with the
+     keydef for Impala 2.1.10. -->
+  <keydef keys="impala2_11_0"><topicmeta><keywords><keyword>Impala 2.11.0</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala2100"><topicmeta><keywords><keyword>Impala 2.10.0</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala290"><topicmeta><keywords><keyword>Impala 2.9.0</keyword></keywords></topicmeta></keydef>
   <keydef keys="impala280"><topicmeta><keywords><keyword>Impala 2.8.0</keyword></keywords></topicmeta></keydef>

http://git-wip-us.apache.org/repos/asf/impala/blob/1f4d687a/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index c272893..dc8cdb5 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2787,7 +2787,10 @@ flight_num:           INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301
         each value.
       </p>
 
-      <p rev="2.9.0" id="added_in_2100">
+      <p rev="2.11.0" id="added_in_2110">
+        <b>Added in:</b> <keyword keyref="impala2_11_0"/>
+      </p>
+      <p rev="2.10.0" id="added_in_2100">
         <b>Added in:</b> <keyword keyref="impala2100"/>
       </p>
       <p rev="2.9.0" id="added_in_290">

http://git-wip-us.apache.org/repos/asf/impala/blob/1f4d687a/docs/topics/impala_datetime_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_datetime_functions.xml b/docs/topics/impala_datetime_functions.xml
index de8291b..f4d062a 100644
--- a/docs/topics/impala_datetime_functions.xml
+++ b/docs/topics/impala_datetime_functions.xml
@@ -389,6 +389,102 @@ select date_sub(cast('2016-05-31' as timestamp), interval 1 months) as 'april_31
 
       </dlentry>
 
+      <dlentry rev="2.11.0 IMPALA-5317" id="date_trunc">
+
+        <dt>
+          <codeph>date_trunc(string unit, timestamp)</codeph>
+        </dt>
+
+        <dd>
+          <indexterm audience="hidden">date_trunc() function</indexterm>
+          <b>Purpose:</b> Truncates a <codeph>TIMESTAMP</codeph> value to the specified precision.
+          <p>
+            <b>Unit argument:</b> The <codeph>unit</codeph> argument value for truncating
+            <codeph>TIMESTAMP</codeph> values is not case-sensitive. This argument string
+            can be one of:
+          </p>
+          <ul>
+            <li>microseconds</li>
+            <li>milliseconds</li>
+            <li>second</li>
+            <li>minute</li>
+            <li>hour</li>
+            <li>day</li>
+            <li>week</li>
+            <li>month</li>
+            <li>year</li>
+            <li>decade</li>
+            <li>century</li>
+            <li>millennium</li>
+          </ul>
+          <p>
+            For example, calling <codeph>date_trunc('hour',ts)</codeph> truncates
+            <codeph>ts</codeph> to the beginning of the corresponding hour, with
+            all minutes, seconds, milliseconds, and so on set to zero. Calling
+            <codeph>date_trunc('milliseconds',ts)</codeph> truncates
+            <codeph>ts</codeph> to the beginning of the corresponding millisecond,
+            with all microseconds and nanoseconds set to zero.
+          </p>
+          <note>
+            The sub-second units are specified in plural form. All units representing
+            one second or more are specified in singular form.
+          </note>
+          <p conref="../shared/impala_common.xml#common/added_in_2110"/>
+          <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+          <p>
+            Although this function is similar to calling <codeph>TRUNC()</codeph>
+            with a <codeph>TIMESTAMP</codeph> argument, the order of arguments
+            and the recognized units are different between <codeph>TRUNC()</codeph>
+            and <codeph>DATE_TRUNC()</codeph>. Therefore, these functions are not
+            interchangeable.
+          </p>
+          <p>
+            This function is typically used in <codeph>GROUP BY</codeph>
+            queries to aggregate results from the same hour, day, week, month, year, and so on.
+            You can also use this function in an <codeph>INSERT ... SELECT</codeph> into a
+            partitioned table to divide <codeph>TIMESTAMP</codeph> values into the correct partition.
+          </p>
+          <p>
+            Because the return value is a <codeph>TIMESTAMP</codeph>, if you cast the result of
+            <codeph>DATE_TRUNC()</codeph> to <codeph>STRING</codeph>, you will often see zeroed-out portions such as
+            <codeph>00:00:00</codeph> in the time field. If you only need the individual units such as hour, day,
+            month, or year, use the <codeph>EXTRACT()</codeph> function instead. If you need the individual units
+            from a truncated <codeph>TIMESTAMP</codeph> value, run the <codeph>DATE_TRUNC()</codeph> function on the
+            original value, then run <codeph>EXTRACT()</codeph> on the result.
+          </p>
+          <p>
+            <b>Return type:</b> <codeph>timestamp</codeph>
+          </p>
+          <p conref="../shared/impala_common.xml#common/example_blurb"/>
+          <p>
+            The following examples show how to call <codeph>DATE_TRUNC()</codeph> with different unit values:
+          </p>
+<codeblock>
+select now(), date_trunc('second', now());
++-------------------------------+-----------------------------------+
+| now()                         | date_trunc('second', now())       |
++-------------------------------+-----------------------------------+
+| 2017-12-05 13:58:04.565403000 | 2017-12-05 13:58:04               |
++-------------------------------+-----------------------------------+
+
+select now(), date_trunc('hour', now());
++-------------------------------+---------------------------+
+| now()                         | date_trunc('hour', now()) |
++-------------------------------+---------------------------+
+| 2017-12-05 13:59:01.884459000 | 2017-12-05 13:00:00       |
++-------------------------------+---------------------------+
+
+select now(), date_trunc('millennium', now());
++-------------------------------+---------------------------------+
+| now()                         | date_trunc('millennium', now()) |
++-------------------------------+---------------------------------+
+| 2017-12-05 14:00:30.296812000 | 2000-01-01 00:00:00             |
++-------------------------------+---------------------------------+
+</codeblock>
+        </dd>
+
+      </dlentry>
+
       <dlentry id="datediff">
 
         <dt>

[4/4] impala git commit: [DOCS] Recommend using Kudu Java API for rapid DMLs

Posted by jr...@apache.org.

[DOCS] Recommend using Kudu Java API for rapid DMLs

Change-Id: I0098f0c3d5d07c89e6bb589c4c04edce300c1ad3
Reviewed-on: http://gerrit.cloudera.org:8080/8976
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/31c6a171
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/31c6a171
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/31c6a171

Branch: refs/heads/master
Commit: 31c6a1719a271810f0ec09873a3424311e5627ec
Parents: 1f4d687
Author: John Russell <jr...@cloudera.com>
Authored: Tue Jan 9 11:56:19 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 10 18:42:04 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_jdbc.xml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/31c6a171/docs/topics/impala_jdbc.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_jdbc.xml b/docs/topics/impala_jdbc.xml
index 8d3599f..c920ec1 100644
--- a/docs/topics/impala_jdbc.xml
+++ b/docs/topics/impala_jdbc.xml
@@ -334,4 +334,18 @@ ARRAY<VARCHAR(10)>     becomes  ARRAY<VARCHAR(10)>
     </conbody>
   </concept>
 
+  <concept id="jdbc_kudu">
+    <title>Kudu Considerations for DML Statements</title>
+    <conbody>
+      <p>
+        Currently, Impala <codeph>INSERT</codeph>, <codeph>UPDATE</codeph>, or
+        other DML statements issued through the JDBC interface against a Kudu
+        table do not return JDBC error codes for conditions such as duplicate
+        primary key values. Therefore, for applications that issue a high
+        volume of DML statements, prefer to use the Kudu Java API directly
+        rather than a JDBC application.
+      </p>
+    </conbody>
+  </concept>
+
 </concept>

[2/4] impala git commit: IMPALA-5014: Part 2: Round when casting decimal to timestamp

Posted by jr...@apache.org.

IMPALA-5014: Part 2: Round when casting decimal to timestamp

When there are too many digits to the right of the dot in a decimal, we
would always truncate when casting to timestamp. In this patch we change
the behavior to round instead of truncating when decimal_v2 is enabled.

Testing:
- Added some EE tests, ran BE tests on my machine.

Change-Id: I8fb3a7d976ab980b8572d7e9524850572bad57da
Reviewed-on: http://gerrit.cloudera.org:8080/8969
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c86b0a97
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c86b0a97
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c86b0a97

Branch: refs/heads/master
Commit: c86b0a9736ee1e19b95a2d06771ca2ab8577950f
Parents: 514dfaf
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Thu Dec 21 15:47:06 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 10 05:47:23 2018 +0000

----------------------------------------------------------------------
 be/src/exprs/decimal-operators-ir.cc            | 27 ++++++++-----
 be/src/exprs/decimal-operators.h                | 11 ++++--
 be/src/runtime/timestamp-test.cc                | 16 ++++++++
 be/src/runtime/timestamp-value.h                |  3 +-
 .../queries/QueryTest/decimal-exprs.test        | 41 ++++++++++++++++++++
 5 files changed, 83 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c86b0a97/be/src/exprs/decimal-operators-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/decimal-operators-ir.cc b/be/src/exprs/decimal-operators-ir.cc
index 8612561..fd0c404 100644
--- a/be/src/exprs/decimal-operators-ir.cc
+++ b/be/src/exprs/decimal-operators-ir.cc
@@ -596,7 +596,8 @@ StringVal DecimalOperators::CastToStringVal(
 }
 
 template <typename T>
-IR_ALWAYS_INLINE int32_t DecimalOperators::ConvertToNanoseconds(T val, int scale) {
+IR_ALWAYS_INLINE int32_t DecimalOperators::ConvertToNanoseconds(
+    T val, int scale, bool round) {
   // Nanosecond scale means there should be 9 decimal digits, which is representable
   // with int32_t.
   const int NANOSECOND_SCALE = 9;
@@ -605,10 +606,11 @@ IR_ALWAYS_INLINE int32_t DecimalOperators::ConvertToNanoseconds(T val, int scale
     nanoseconds = val * DecimalUtil::GetScaleMultiplier<T>(
         NANOSECOND_SCALE - scale);
   } else {
-    nanoseconds = val / DecimalUtil::GetScaleMultiplier<T>(
-        scale - NANOSECOND_SCALE);
+    nanoseconds = DecimalUtil::ScaleDownAndRound<T>(
+        val, scale - NANOSECOND_SCALE, round);
+    DCHECK(nanoseconds <= 1000000000);
+    DCHECK(nanoseconds != 1000000000 || round);
   }
-
   DCHECK(nanoseconds >= numeric_limits<int32_t>::min()
       && nanoseconds <= numeric_limits<int32_t>::max());
 
@@ -616,7 +618,8 @@ IR_ALWAYS_INLINE int32_t DecimalOperators::ConvertToNanoseconds(T val, int scale
 }
 
 template <typename T>
-TimestampVal DecimalOperators::ConvertToTimestampVal(const T& decimal_value, int scale) {
+TimestampVal DecimalOperators::ConvertToTimestampVal(
+    const T& decimal_value, int scale, bool round) {
   typename T::StorageType seconds = decimal_value.whole_part(scale);
   if (seconds < numeric_limits<int64_t>::min() ||
       seconds > numeric_limits<int64_t>::max()) {
@@ -624,8 +627,8 @@ TimestampVal DecimalOperators::ConvertToTimestampVal(const T& decimal_value, int
     return TimestampVal::null();
   }
   int32_t nanoseconds =
-      ConvertToNanoseconds(decimal_value.fractional_part(scale), scale);
-  if(decimal_value.is_negative()) nanoseconds *= -1;
+      ConvertToNanoseconds(decimal_value.fractional_part(scale), scale, round);
+  if (decimal_value.is_negative()) nanoseconds *= -1;
   TimestampVal result;
   TimestampValue::FromUnixTimeNanos(seconds, nanoseconds).ToTimestampVal(&result);
   return result;
@@ -637,11 +640,15 @@ TimestampVal DecimalOperators::CastToTimestampVal(
   if (val.is_null) return TimestampVal::null();
   int precision = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_PRECISION, 0);
   int scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 0);
+  bool is_decimal_v2 = ctx->impl()->GetConstFnAttr(FunctionContextImpl::DECIMAL_V2);
   TimestampVal result;
   switch (ColumnType::GetDecimalByteSize(precision)) {
-    case 4: return ConvertToTimestampVal(Decimal4Value(val.val4), scale);
-    case 8: return ConvertToTimestampVal(Decimal8Value(val.val8), scale);
-    case 16: return ConvertToTimestampVal(Decimal16Value(val.val16), scale);
+    case 4:
+      return ConvertToTimestampVal(Decimal4Value(val.val4), scale, is_decimal_v2);
+    case 8:
+      return ConvertToTimestampVal(Decimal8Value(val.val8), scale, is_decimal_v2);
+    case 16:
+      return ConvertToTimestampVal(Decimal16Value(val.val16), scale, is_decimal_v2);
     default:
       DCHECK(false);
       return TimestampVal::null();

http://git-wip-us.apache.org/repos/asf/impala/blob/c86b0a97/be/src/exprs/decimal-operators.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/decimal-operators.h b/be/src/exprs/decimal-operators.h
index c2d8779..e34dbf1 100644
--- a/be/src/exprs/decimal-operators.h
+++ b/be/src/exprs/decimal-operators.h
@@ -163,13 +163,16 @@ class DecimalOperators {
   static T RoundDelta(const DecimalValue<T>& v, int src_scale,
       int target_scale, const DecimalRoundOp& op);
 
-  /// Converts a decimal value (interpreted as unix time) to TimestampVal.
+  /// Converts a decimal value (interpreted as unix time) to TimestampVal. Rounds
+  /// instead of truncating if 'round' is true.
   template <typename T>
-  static TimestampVal ConvertToTimestampVal(const T& decimal_value, int scale);
+  static TimestampVal ConvertToTimestampVal(
+      const T& decimal_value, int scale, bool round);
 
-  /// Converts fractional 'val' with the given 'scale' to nanoseconds.
+  /// Converts fractional 'val' with the given 'scale' to nanoseconds. Rounds
+  /// instead of truncating if 'round' is true.
   template <typename T>
-  static int32_t ConvertToNanoseconds(T val, int scale);
+  static int32_t ConvertToNanoseconds(T val, int scale, bool round);
 };
 
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/c86b0a97/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index 3a54026..66bd896 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -732,6 +732,22 @@ TEST(TimestampTest, Basic) {
   EXPECT_EQ("2038-01-19 03:14:09",
       TimestampValue::FromUnixTime(2147483649).ToString());
 
+  // Test FromUnixTimeNanos(), including the cases where abs(nanoseconds) >= 1e9.
+  EXPECT_EQ("2018-01-10 16:00:00",
+      TimestampValue::FromUnixTimeNanos(1515600000, 0).ToString());
+  EXPECT_EQ("2018-01-10 16:00:00.999999999",
+      TimestampValue::FromUnixTimeNanos(1515600000, 999999999).ToString());
+  EXPECT_EQ("2018-01-10 15:59:59.000000001",
+      TimestampValue::FromUnixTimeNanos(1515600000, -999999999).ToString());
+  EXPECT_EQ("2018-01-10 16:00:01",
+      TimestampValue::FromUnixTimeNanos(1515600000, 1000000000).ToString());
+  EXPECT_EQ("2018-01-10 15:59:59",
+      TimestampValue::FromUnixTimeNanos(1515600000, -1000000000).ToString());
+  EXPECT_EQ("2018-01-10 16:30:00",
+      TimestampValue::FromUnixTimeNanos(1515600000, 1800000000000).ToString());
+  EXPECT_EQ("2018-01-10 15:30:00",
+      TimestampValue::FromUnixTimeNanos(1515600000, -1800000000000).ToString());
+
   // Test FromUnixTime around the boundary of the values that are converted via boost via
   // gmtime (IMPALA-5357). Tests 1 second before and after the values supported by the
   // boost conversion logic.

http://git-wip-us.apache.org/repos/asf/impala/blob/c86b0a97/be/src/runtime/timestamp-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index 445189a..5a5e733 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -100,7 +100,8 @@ class TimestampValue {
   }
 
   /// Same as FromUnixTime() above, but adds the specified number of nanoseconds to the
-  /// resulting TimestampValue. Handles negative nanoseconds too.
+  /// resulting TimestampValue. Handles negative nanoseconds and the case where
+  /// abs(nanos) >= 1e9.
   static TimestampValue FromUnixTimeNanos(time_t unix_time, int64_t nanos) {
     boost::posix_time::ptime temp = UnixTimeToPtime(unix_time);
     temp += boost::posix_time::nanoseconds(nanos);

http://git-wip-us.apache.org/repos/asf/impala/blob/c86b0a97/testdata/workloads/functional-query/queries/QueryTest/decimal-exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/decimal-exprs.test b/testdata/workloads/functional-query/queries/QueryTest/decimal-exprs.test
index 328fbaf..be75c23 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/decimal-exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/decimal-exprs.test
@@ -413,3 +413,44 @@ cast(42608445511 as decimal(18, 0)) % 3
 ---- TYPES
 DECIMAL,DECIMAL,DECIMAL,DECIMAL,DECIMAL,DECIMAL
 ====
+---- QUERY
+# IMPALA-5014: Decimal to timestamp conversion truncates with decimal_v1, rounds with decimal_v2
+set decimal_v2=false;
+select
+cast(cast(12333333333.9999999994 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.9999999995 as decimal(38, 10)) as timestamp),
+cast(cast(333.9999999994 as decimal(13, 10)) as timestamp),
+cast(cast(333.9999999995 as decimal(13, 10)) as timestamp),
+cast(cast(12333333333.1111111114 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.1111111115 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.111111111411111 as decimal(38, 15)) as timestamp),
+cast(cast(12333333333.111111111511111 as decimal(38, 15)) as timestamp),
+cast(cast(12333333333.1111111114 as decimal(38, 27)) as timestamp),
+cast(cast(12333333333.1111111115 as decimal(38, 27)) as timestamp),
+cast(cast(12333333333.111 as decimal(38, 3)) as timestamp),
+cast(cast(12333333333 as decimal(38, 0)) as timestamp);
+---- RESULTS
+2360-10-29 21:55:33.999999999,2360-10-29 21:55:33.999999999,1970-01-01 00:05:33.999999999,1970-01-01 00:05:33.999999999,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111000000,2360-10-29 21:55:33
+---- TYPES
+TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP
+====
+---- QUERY
+set decimal_v2=true;
+select
+cast(cast(12333333333.9999999994 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.9999999995 as decimal(38, 10)) as timestamp),
+cast(cast(333.9999999994 as decimal(13, 10)) as timestamp),
+cast(cast(333.9999999995 as decimal(13, 10)) as timestamp),
+cast(cast(12333333333.1111111114 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.1111111115 as decimal(38, 10)) as timestamp),
+cast(cast(12333333333.111111111411111 as decimal(38, 15)) as timestamp),
+cast(cast(12333333333.111111111511111 as decimal(38, 15)) as timestamp),
+cast(cast(12333333333.1111111114 as decimal(38, 27)) as timestamp),
+cast(cast(12333333333.1111111115 as decimal(38, 27)) as timestamp),
+cast(cast(12333333333.111 as decimal(38, 3)) as timestamp),
+cast(cast(12333333333 as decimal(38, 0)) as timestamp);
+---- RESULTS
+2360-10-29 21:55:33.999999999,2360-10-29 21:55:34,1970-01-01 00:05:33.999999999,1970-01-01 00:05:34,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111112,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111112,2360-10-29 21:55:33.111111111,2360-10-29 21:55:33.111111112,2360-10-29 21:55:33.111000000,2360-10-29 21:55:33
+---- TYPES
+TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP, TIMESTAMP
+====