You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/08/03 20:44:04 UTC

[GitHub] [arrow] kou commented on a change in pull request #7507: ARROW-8797: [C++] Read RecordBatch in a different endian

kou commented on a change in pull request #7507:
URL: https://github.com/apache/arrow/pull/7507#discussion_r464651784



##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -74,6 +74,186 @@ class ArrayDataWrapper {
   std::shared_ptr<Array>* out_;
 };
 
+class ArrayDataEndianSwapper {
+ public:
+  ArrayDataEndianSwapper(std::shared_ptr<ArrayData>& data, int64_t length)
+      : data_(data), length_(length) {}
+
+  Status SwapType(const DataType& type) { return VisitTypeInline(type, this); }
+
+  Status SwapChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    int i = 0;
+    for (const auto& child_field : child_fields) {
+      auto orig_data = data_;
+      auto orig_length = length_;
+      data_ = data_->child_data[i++];
+      length_ = data_->length;
+      RETURN_NOT_OK(SwapType(*child_field.get()->type()));
+      length_ = orig_length;
+      data_ = orig_data;
+    }
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status SwapOffset(const T&, int index) {
+    if (data_->buffers[index] == nullptr) {
+      return Status::OK();
+    }
+    using value_type = typename T::c_type;
+    auto buffer = const_cast<value_type*>(
+        reinterpret_cast<const value_type*>(data_->buffers[index]->data()));
+    // offset has one more element rather than data->length
+    int64_t length = length_ + 1;
+    for (int64_t i = 0; i < length; i++) {
+#if ARROW_LITTLE_ENDIAN
+      buffer[i] = BitUtil::FromBigEndian(buffer[i]);
+#else
+      buffer[i] = BitUtil::FromLittleEndian(buffer[i]);
+#endif
+    }
+    return Status::OK();
+  }
+
+  Status SwapSmallOffset(int index = 1) {
+    Int32Type i32;
+    RETURN_NOT_OK(SwapOffset(i32, index));
+    return Status::OK();
+  }
+
+  Status SwapLargeOffset() {
+    Int64Type i64;
+    RETURN_NOT_OK(SwapOffset(i64, 1));
+    return Status::OK();
+  }

Review comment:
       Can we simplify them like the following?
   
   ```diff
   diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
   index e2f88016f..6f6d4bc78 100644
   --- a/cpp/src/arrow/array/util.cc
   +++ b/cpp/src/arrow/array/util.cc
   @@ -95,14 +95,13 @@ class ArrayDataEndianSwapper {
        return Status::OK();
      }
    
   -  template <typename T>
   -  Status SwapOffset(const T&, int index) {
   +  template <typename VALUE_TYPE>
   +  Status SwapOffset(int index) {
        if (data_->buffers[index] == nullptr) {
          return Status::OK();
        }
   -    using value_type = typename T::c_type;
   -    auto buffer = const_cast<value_type*>(
   -        reinterpret_cast<const value_type*>(data_->buffers[index]->data()));
   +    auto buffer = const_cast<VALUE_TYPE*>(
   +        reinterpret_cast<const VALUE_TYPE*>(data_->buffers[index]->data()));
        // offset has one more element rather than data->length
        int64_t length = length_ + 1;
        for (int64_t i = 0; i < length; i++) {
   @@ -116,15 +115,11 @@ class ArrayDataEndianSwapper {
      }
    
      Status SwapSmallOffset(int index = 1) {
   -    Int32Type i32;
   -    RETURN_NOT_OK(SwapOffset(i32, index));
   -    return Status::OK();
   +    return SwapOffset<int32_t>(index);
      }
    
      Status SwapLargeOffset() {
   -    Int64Type i64;
   -    RETURN_NOT_OK(SwapOffset(i64, 1));
   -    return Status::OK();
   +    return SwapOffset<int64_t>(1);
      }
    
      template <typename T>
   ```

##########
File path: cpp/src/arrow/ipc/reader.cc
##########
@@ -449,7 +449,7 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
     const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
     const std::vector<bool>& inclusion_mask, const DictionaryMemo* dictionary_memo,
     const IpcReadOptions& options, MetadataVersion metadata_version,
-    Compression::type compression, io::RandomAccessFile* file) {
+    Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) {

Review comment:
       I'm not sure `swap_endian` is useful.
   `native_endian` may be more useful.

##########
File path: cpp/src/arrow/type.h
##########
@@ -1582,13 +1583,23 @@ class ARROW_EXPORT FieldRef {
 // ----------------------------------------------------------------------
 // Schema
 
+enum class Endianness { LITTLE, BIG };
+#if ARROW_LITTLE_ENDIAN
+#define NATIVE_ENDIANNESS Endianness::LITTLE
+#else
+#define NATIVE_ENDIANNESS Endianness::BIG

Review comment:
       Could you hide this from public API?
   It doesn't have `ARROW_` prefix.

##########
File path: cpp/src/arrow/type.h
##########
@@ -1600,6 +1611,12 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
   bool Equals(const Schema& other, bool check_metadata = false) const;
   bool Equals(const std::shared_ptr<Schema>& other, bool check_metadata = false) const;
 
+  /// \brief Set platform-native endianness in the schema
+  void setNativeEndianness();

Review comment:
       `Schema` is read-only.
   
   How about adding `withNativeEndianness()` or `withEndianness()` like `withMetadata()` instead?

##########
File path: cpp/src/arrow/ipc/reader.cc
##########
@@ -449,7 +449,7 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
     const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
     const std::vector<bool>& inclusion_mask, const DictionaryMemo* dictionary_memo,
     const IpcReadOptions& options, MetadataVersion metadata_version,
-    Compression::type compression, io::RandomAccessFile* file) {
+    Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) {

Review comment:
       Can we put the `swap_endian` to `IpcReadOptions` from function argument?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org