You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/08/10 13:25:53 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7507: ARROW-8797: [C++] Read RecordBatch in a different endian

pitrou commented on a change in pull request #7507:
URL: https://github.com/apache/arrow/pull/7507#discussion_r467891282



##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -322,6 +322,31 @@ static inline uint16_t ByteSwap(uint16_t value) {
   return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
 }
 static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+#if defined(__GNUC__)
+  // avoid dereferencing type-punned pointer will break strict-aliasing rules
+  int8_t* in_ptr = reinterpret_cast<int8_t*>(&value);
+  auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast<uint64_t*>(in_ptr));
+  int8_t* out_ptr = reinterpret_cast<int8_t*>(&swapped);
+  return *reinterpret_cast<double*>(out_ptr);
+#else
+  auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast<uint64_t*>(&value));
+  return *reinterpret_cast<double*>(&swapped);
+#endif
+}
+static inline float ByteSwap(float value) {

Review comment:
       Same here.

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -322,6 +322,31 @@ static inline uint16_t ByteSwap(uint16_t value) {
   return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
 }
 static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+#if defined(__GNUC__)
+  // avoid dereferencing type-punned pointer will break strict-aliasing rules

Review comment:
       Instead, you should use the "Safe" functions from `util/ubsan.h` on all compilers. For example:
   ```c++
   static inline double ByteSwap(double value) {
     const uint64_t swapped = ARROW_BYTE_SWAP64(SafeCopy<uint64_t>(value));
     return SafeCopy<double>(swapped);
   }
   ```
   

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -74,6 +74,177 @@ class ArrayDataWrapper {
   std::shared_ptr<Array>* out_;
 };
 
+class ArrayDataEndianSwapper {
+ public:
+  ArrayDataEndianSwapper(std::shared_ptr<ArrayData>& data, int64_t length)
+      : data_(data), length_(length) {}
+
+  Status SwapType(const DataType& type) { return VisitTypeInline(type, this); }
+
+  Status SwapChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    int i = 0;
+    for (const auto& child_field : child_fields) {
+      auto orig_data = data_;
+      auto orig_length = length_;
+      data_ = data_->child_data[i++];

Review comment:
       Rather than patch this instance temporarily, it would seem cleaner to create a new `ArrayDataEndianSwapper` for this child data.

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -74,6 +74,177 @@ class ArrayDataWrapper {
   std::shared_ptr<Array>* out_;
 };
 
+class ArrayDataEndianSwapper {
+ public:
+  ArrayDataEndianSwapper(std::shared_ptr<ArrayData>& data, int64_t length)
+      : data_(data), length_(length) {}
+
+  Status SwapType(const DataType& type) { return VisitTypeInline(type, this); }
+
+  Status SwapChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    int i = 0;
+    for (const auto& child_field : child_fields) {
+      auto orig_data = data_;
+      auto orig_length = length_;
+      data_ = data_->child_data[i++];
+      length_ = data_->length;
+      RETURN_NOT_OK(SwapType(*child_field.get()->type()));
+      length_ = orig_length;
+      data_ = orig_data;
+    }
+    return Status::OK();
+  }
+
+  template <typename VALUE_TYPE>
+  Status SwapOffset(int index) {
+    if (data_->buffers[index] == nullptr) {
+      return Status::OK();
+    }
+    auto buffer = const_cast<VALUE_TYPE*>(
+        reinterpret_cast<const VALUE_TYPE*>(data_->buffers[index]->data()));
+    // offset has one more element rather than data->length
+    int64_t length = length_ + 1;
+    for (int64_t i = 0; i < length; i++) {
+#if ARROW_LITTLE_ENDIAN
+      buffer[i] = BitUtil::FromBigEndian(buffer[i]);
+#else
+      buffer[i] = BitUtil::FromLittleEndian(buffer[i]);
+#endif
+    }
+    return Status::OK();
+  }
+
+  Status SwapSmallOffset(int index = 1) { return SwapOffset<int32_t>(index); }
+
+  Status SwapLargeOffset() { return SwapOffset<int64_t>(1); }
+
+  template <typename T>
+  Status Visit(const T&) {
+    using value_type = typename T::c_type;
+    auto buffer = const_cast<value_type*>(
+        reinterpret_cast<const value_type*>(data_->buffers[1]->data()));
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+#if ARROW_LITTLE_ENDIAN
+      buffer[i] = BitUtil::FromBigEndian(buffer[i]);
+#else
+      buffer[i] = BitUtil::FromLittleEndian(buffer[i]);
+#endif
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Type& type) {
+    auto buffer = const_cast<uint64_t*>(
+        reinterpret_cast<const uint64_t*>(data_->buffers[1]->data()));
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+      uint64_t tmp;
+      auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+      tmp = BitUtil::FromBigEndian(buffer[idx]);
+      buffer[idx] = BitUtil::FromBigEndian(buffer[idx + 1]);
+      buffer[idx + 1] = tmp;
+#else
+      tmp = BitUtil::FromLittleEndian(buffer[idx]);
+      buffer[idx] = BitUtil::FromLittleEndian(buffer[idx + 1]);
+      buffer[idx + 1] = tmp;
+#endif
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DayTimeIntervalType& type) {
+    auto buffer = const_cast<uint32_t*>(
+        reinterpret_cast<const uint32_t*>(data_->buffers[1]->data()));
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+      auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+      buffer[idx] = BitUtil::FromBigEndian(buffer[idx]);
+      buffer[idx + 1] = BitUtil::FromBigEndian(buffer[idx + 1]);
+#else
+      buffer[idx] = BitUtil::FromLittleEndian(buffer[idx]);
+      buffer[idx + 1] = BitUtil::FromLittleEndian(buffer[idx + 1]);
+#endif
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const NullType& type) { return Status::OK(); }
+  Status Visit(const BooleanType& type) { return Status::OK(); }
+  Status Visit(const Int8Type& type) { return Status::OK(); }
+  Status Visit(const UInt8Type& type) { return Status::OK(); }
+  Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
+
+  Status Visit(const StringType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    return Status::OK();
+  }
+  Status Visit(const LargeStringType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    return Status::OK();
+  }
+  Status Visit(const BinaryType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    return Status::OK();
+  }
+  Status Visit(const LargeBinaryType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    return Status::OK();
+  }
+
+  Status Visit(const ListType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    RETURN_NOT_OK(SwapChildren(type.fields()));
+    return Status::OK();
+  }
+  Status Visit(const LargeListType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    RETURN_NOT_OK(SwapChildren(type.fields()));
+    return Status::OK();
+  }
+  Status Visit(const FixedSizeListType& type) {
+    RETURN_NOT_OK(SwapChildren(type.fields()));
+    return Status::OK();
+  }
+
+  Status Visit(const MapType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    RETURN_NOT_OK(SwapChildren(type.fields()));
+    return Status::OK();
+  }
+
+  Status Visit(const StructType& type) {
+    RETURN_NOT_OK(SwapChildren(type.fields()));

Review comment:
       `SwapChildren` can be called in all cases instead of calling it specifically for some types.

##########
File path: cpp/src/arrow/type.h
##########
@@ -1582,13 +1583,26 @@ class ARROW_EXPORT FieldRef {
 // ----------------------------------------------------------------------
 // Schema
 
+enum class Endianness {
+  LITTLE = 0,
+  BIG = 1,
+#if ARROW_LITTLE_ENDIAN
+  NATIVE = LITTLE
+#else
+  NATIVE = BIG
+#endif
+};
+
 /// \class Schema
 /// \brief Sequence of arrow::Field objects describing the columns of a record
 /// batch or table data structure
 class ARROW_EXPORT Schema : public detail::Fingerprintable,
                             public util::EqualityComparable<Schema>,
                             public util::ToStringOstreamable<Schema> {
  public:
+  explicit Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+                  std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);

Review comment:
       I don't know if we want to allow non-native endianness in memory. @wesm What do you think?

##########
File path: cpp/src/arrow/type.h
##########
@@ -29,6 +29,7 @@
 
 #include "arrow/result.h"
 #include "arrow/type_fwd.h"  // IWYU pragma: export
+#include "arrow/util/bit_util.h"

Review comment:
       Is there a way to avoid including `bit_util.h` (which is a non-trivial header) here?

##########
File path: dev/archery/tests/generate_files_for_endian_test.sh
##########
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+: ${ARROW_DIR:=/arrow}
+
+json_file=`archery integration --with-cpp=1 | grep "Testing file" | tail -1 | awk '{ print $3 }'`

Review comment:
       It's not obvious to me what this script is doing precisely. In any case, I don't think we should rely on precise human-readable output of `archery integration`.

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -74,6 +74,177 @@ class ArrayDataWrapper {
   std::shared_ptr<Array>* out_;
 };
 
+class ArrayDataEndianSwapper {
+ public:
+  ArrayDataEndianSwapper(std::shared_ptr<ArrayData>& data, int64_t length)
+      : data_(data), length_(length) {}
+
+  Status SwapType(const DataType& type) { return VisitTypeInline(type, this); }
+
+  Status SwapChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    int i = 0;
+    for (const auto& child_field : child_fields) {
+      auto orig_data = data_;
+      auto orig_length = length_;
+      data_ = data_->child_data[i++];
+      length_ = data_->length;
+      RETURN_NOT_OK(SwapType(*child_field.get()->type()));
+      length_ = orig_length;
+      data_ = orig_data;
+    }
+    return Status::OK();
+  }
+
+  template <typename VALUE_TYPE>
+  Status SwapOffset(int index) {
+    if (data_->buffers[index] == nullptr) {
+      return Status::OK();
+    }
+    auto buffer = const_cast<VALUE_TYPE*>(

Review comment:
       This is definitely not safe at all. Another approach should be found IMHO. Worst case we reallocate, which I agree isn't desirable...




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org