You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by ga...@apache.org on 2021/09/30 02:35:34 UTC

[orc] branch main updated: ORC-1008: Fix overflow detection in the IntegerColumnStatistics

This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new c69e1fc  ORC-1008: Fix overflow detection in the IntegerColumnStatistics
c69e1fc is described below

commit c69e1fcd91e4c0a65052618b0161e3ab1f3d5443
Author: Guiyanakaung <gu...@gmail.com>
AuthorDate: Thu Sep 30 10:35:30 2021 +0800

    ORC-1008: Fix overflow detection in the IntegerColumnStatistics
    
    - Fix overflow detection code for C++ int64_t / java long in the IntegerColumnStatistics.
    - Merging is also done using the same algorithm for overflow detection, and add test cases.
    - Compatibility with different compilers for overflow detection.
    - Use cmakedefine check builtin overflow function.
    
    This fixes #917
---
 c++/src/Adaptor.hh.in                              | 35 ++++++++++++++++++++++
 c++/src/CMakeLists.txt                             |  8 +++++
 c++/src/Statistics.cc                              | 12 --------
 c++/src/Statistics.hh                              | 26 ++++++++++++----
 c++/test/TestColumnStatistics.cc                   |  8 +++++
 .../org/apache/orc/impl/ColumnStatisticsImpl.java  | 19 +++++++-----
 .../test/org/apache/orc/TestColumnStatistics.java  | 11 +++++++
 7 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in
index a7795be..cd32b16 100644
--- a/c++/src/Adaptor.hh.in
+++ b/c++/src/Adaptor.hh.in
@@ -31,6 +31,7 @@
 #cmakedefine HAS_POST_2038
 #cmakedefine HAS_STD_ISNAN
 #cmakedefine HAS_STD_MUTEX
+#cmakedefine HAS_BUILTIN_OVERFLOW_CHECK
 #cmakedefine NEEDS_REDUNDANT_MOVE
 #cmakedefine NEEDS_Z_PREFIX
 
@@ -173,6 +174,40 @@ namespace orc {
   std::string to_string(int64_t val);
 }
 
+#ifdef HAS_BUILTIN_OVERFLOW_CHECK
+  #define multiplyExact !__builtin_mul_overflow
+  #define addExact !__builtin_add_overflow
+#else
+namespace orc {
+  /**
+   * Compute value * repetitions, return false if overflow, return true otherwise
+   * and save the result at the address pointed to by result
+   * imitates the jdk Math.multiplyExact implementation
+   * but this method makes the assumption that repetitions > 1
+   */
+  static bool multiplyExact(int64_t value, int64_t repetitions, int64_t* result) {
+    int64_t r = value * repetitions;
+    if (((value < 0 ? -value : value) | repetitions) >> 31 != 0 && r / repetitions != value) {
+      return false;
+    }
+    *result = r;
+    return true;
+  }
+
+  /**
+   * imitates the jdk Math.addExact implementation
+   */
+  static bool addExact(int64_t sum, int64_t increment, int64_t* result) {
+    int64_t r = sum + increment;
+    if (((sum ^ r) & (increment ^ r)) < 0) {
+      return false;
+    }
+    *result = r;
+    return true;
+  }
+}
+#endif
+
 #ifndef HAS_CONSTEXPR
 #define constexpr const
 #endif
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 772f4c6..ccb48da 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -43,6 +43,14 @@ CHECK_CXX_SOURCE_COMPILES("
 )
 
 CHECK_CXX_SOURCE_COMPILES("
+    int main(){
+      int a;
+      return __builtin_add_overflow(1, 2, &a);
+    }"
+  HAS_BUILTIN_OVERFLOW_CHECK
+)
+
+CHECK_CXX_SOURCE_COMPILES("
     #include<stdint.h>
     #include<stdio.h>
     int main(int,char*[]){
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 53e0da2..ccc54c2 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -177,18 +177,6 @@ namespace orc {
     // PASS
   }
 
-  void IntegerColumnStatisticsImpl::update(int64_t value, int repetitions) {
-    _stats.updateMinMax(value);
-
-    if (_stats.hasSum()) {
-      bool wasPositive = _stats.getSum() >= 0;
-      _stats.setSum(value * repetitions + _stats.getSum());
-      if ((value >= 0) == wasPositive) {
-        _stats.setHasSum((_stats.getSum() >= 0) == wasPositive);
-      }
-    }
-  }
-
   StringColumnStatisticsImpl::~StringColumnStatisticsImpl() {
     // PASS
   }
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index f056aba..e281bdb 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -964,7 +964,23 @@ namespace orc {
       _stats.setSum(sum);
     }
 
-    void update(int64_t value, int repetitions);
+    void update(int64_t value, int repetitions) {
+      _stats.updateMinMax(value);
+
+      if (_stats.hasSum()) {
+        if (repetitions > 1) {
+          _stats.setHasSum(multiplyExact(value, repetitions, &value));
+        }
+
+        if (_stats.hasSum()) {
+          _stats.setHasSum(addExact(_stats.getSum(), value, &value));
+
+          if (_stats.hasSum()) {
+            _stats.setSum(value);
+          }
+        }
+      }
+    }
 
     void merge(const MutableColumnStatistics& other) override {
       const IntegerColumnStatisticsImpl& intStats =
@@ -975,10 +991,10 @@ namespace orc {
       // update sum and check overflow
       _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
       if (_stats.hasSum()) {
-        bool wasPositive = _stats.getSum() >= 0;
-        _stats.setSum(_stats.getSum() + intStats.getSum());
-        if ((intStats.getSum() >= 0) == wasPositive) {
-          _stats.setHasSum((_stats.getSum() >= 0) == wasPositive);
+        int64_t value;
+        _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value));
+        if (_stats.hasSum()) {
+          _stats.setSum(value);
         }
       }
     }
diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc
index 77f3e2d..11338ad 100644
--- a/c++/test/TestColumnStatistics.cc
+++ b/c++/test/TestColumnStatistics.cc
@@ -106,6 +106,14 @@ namespace orc {
     EXPECT_EQ(std::numeric_limits<int64_t>::min() + 500, other->getSum());
     intStats->merge(*other);
     EXPECT_FALSE(intStats->hasSum());
+
+    std::unique_ptr<IntegerColumnStatisticsImpl> intStats2(
+      new IntegerColumnStatisticsImpl());
+
+    intStats2->update(1, 1);
+    EXPECT_TRUE(intStats2->hasSum());
+    intStats2->update(std::numeric_limits<int64_t>::max(), 3);
+    EXPECT_FALSE(intStats2->hasSum());
   }
 
   TEST(ColumnStatistics, doubleColumnStatistics) {
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 131a658..5573d4b 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -370,10 +370,13 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
         maximum = value;
       }
       if (!overflow) {
-        boolean wasPositive = sum >= 0;
-        sum += value * repetitions;
-        if ((value >= 0) == wasPositive) {
-          overflow = (sum >= 0) != wasPositive;
+        try {
+          long increment = repetitions > 1
+              ? Math.multiplyExact(value, repetitions)
+              : value;
+          sum = Math.addExact(sum, increment);
+        } catch (ArithmeticException e) {
+          overflow = true;
         }
       }
     }
@@ -397,10 +400,10 @@ public class ColumnStatisticsImpl implements ColumnStatistics {
 
         overflow |= otherInt.overflow;
         if (!overflow) {
-          boolean wasPositive = sum >= 0;
-          sum += otherInt.sum;
-          if ((otherInt.sum >= 0) == wasPositive) {
-            overflow = (sum >= 0) != wasPositive;
+          try {
+            sum = Math.addExact(sum, otherInt.sum);
+          } catch (ArithmeticException e) {
+            overflow = true;
           }
         }
       } else {
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index d3372b8..d66e5ee 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -42,6 +42,7 @@ import java.text.SimpleDateFormat;
 import java.util.TimeZone;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -51,6 +52,16 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 public class TestColumnStatistics {
 
   @Test
+  public void testLongSumOverflow() {
+    TypeDescription schema = TypeDescription.createInt();
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(schema);
+    stats.updateInteger(1, 1);
+    assertTrue(((IntegerColumnStatistics) stats).isSumDefined());
+    stats.updateInteger(Long.MAX_VALUE, 3);
+    assertFalse(((IntegerColumnStatistics) stats).isSumDefined());
+  }
+
+  @Test
   public void testLongMerge() throws Exception {
     TypeDescription schema = TypeDescription.createInt();