You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/24 19:07:20 UTC

[GitHub] [arrow] fsaintjacques commented on a change in pull request #6167: PARQUET-1766: [C++] Handle parquet::Statistics NaNs and -0.0f as per upstream parquet-mr

fsaintjacques commented on a change in pull request #6167:
URL: https://github.com/apache/arrow/pull/6167#discussion_r658211389



##########
File path: cpp/src/parquet/statistics.cc
##########
@@ -42,239 +47,325 @@ namespace parquet {
 
 template <typename DType, bool is_signed>
 struct CompareHelper {
-  typedef typename DType::c_type T;
-  static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
-};
+  using T = typename DType::c_type;
 
-template <>
-struct CompareHelper<Int96Type, true> {
-  static inline bool Compare(int type_length, const Int96& a, const Int96& b) {
-    // Only the MSB bit is by Signed comparison
-    // For little-endian, this is the last bit of Int96 type
-    const int32_t amsb = static_cast<const int32_t>(a.value[2]);
-    const int32_t bmsb = static_cast<const int32_t>(b.value[2]);
-    if (amsb != bmsb) {
-      return (amsb < bmsb);
-    } else if (a.value[1] != b.value[1]) {
-      return (a.value[1] < b.value[1]);
-    }
-    return (a.value[0] < b.value[0]);
+  constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
+  constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
+
+  // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11
+  // standard requirements.
+  template <typename T1 = T>
+  static arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
+                                                                           T fallback) {
+    return std::isnan(val) ? fallback : val;
   }
-};
 
-template <>
-struct CompareHelper<ByteArrayType, true> {
-  static inline bool Compare(int type_length, const ByteArray& a, const ByteArray& b) {
-    const int8_t* aptr = reinterpret_cast<const int8_t*>(a.ptr);
-    const int8_t* bptr = reinterpret_cast<const int8_t*>(b.ptr);
-    return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len);
+  template <typename T1 = T>
+  static arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(T val,
+                                                                            T fallback) {
+    return val;
   }
+
+  static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
+
+  static T Min(int type_length, T a, T b) { return a < b ? a : b; }
+  static T Max(int type_length, T a, T b) { return a < b ? b : a; }
 };
 
-template <>
-struct CompareHelper<FLBAType, true> {
-  static inline bool Compare(int type_length, const FLBA& a, const FLBA& b) {
-    const int8_t* aptr = reinterpret_cast<const int8_t*>(a.ptr);
-    const int8_t* bptr = reinterpret_cast<const int8_t*>(b.ptr);
-    return std::lexicographical_compare(aptr, aptr + type_length, bptr,
-                                        bptr + type_length);
+template <typename DType>
+struct UnsignedCompareHelperBase {
+  using T = typename DType::c_type;
+  using UCType = typename std::make_unsigned<T>::type;
+
+  constexpr static T DefaultMin() { return std::numeric_limits<UCType>::max(); }
+  constexpr static T DefaultMax() { return std::numeric_limits<UCType>::lowest(); }
+  static T Coalesce(T val, T fallback) { return val; }
+
+  static inline bool Compare(int type_length, T a, T b) {
+    return arrow::util::SafeCopy<UCType>(a) < arrow::util::SafeCopy<UCType>(b);

Review comment:
       IIRC, when converting from uintX -> intX and the value is greater than what the destination type support (2^X + 1 and so on) the behaviour is *implementation* defined with static_cast, i.e. it depends on the compiler and arch.
   
   See this specific answer (and the whole thread) https://stackoverflow.com/a/7602006. See also the [https://en.cppreference.com/w/cpp/language/implicit_conversion](Integral conversions
   ) section where this gotcha is not implementation defined after C++20 (in 10 years you'll be able to remove the SafeCopy :))
   
   With regards to this code, it seems it shouldn't be needed since the direction is int -> uint. But I somewhat recall adding SafeCopy to appease some compiler warnings (or UBSAN?).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org