You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/07 19:41:29 UTC

[GitHub] [arrow] kiszk commented on a change in pull request #7507: ARROW-8797: [C++] Read RecordBatch in a different endian

kiszk commented on a change in pull request #7507:
URL: https://github.com/apache/arrow/pull/7507#discussion_r537780408



##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -74,6 +75,204 @@ class ArrayDataWrapper {
   std::shared_ptr<Array>* out_;
 };
 
+class ArrayDataEndianSwapper {
+ public:
+  ArrayDataEndianSwapper(std::shared_ptr<ArrayData>& data, int64_t length)
+      : data_(data), length_(length) {}
+
+  Status SwapType(const DataType& type) {
+    RETURN_NOT_OK(VisitTypeInline(type, this));
+    RETURN_NOT_OK(SwapChildren(type.fields()));
+    return Status::OK();
+  }
+
+  Status SwapChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    int i = 0;
+    for (const auto& child_field : child_fields) {
+      ArrayDataEndianSwapper swapper_child_visitor(data_->child_data[i],
+                                                   data_->child_data[i]->length);
+      RETURN_NOT_OK(VisitTypeInline(*child_field.get()->type(), &swapper_child_visitor));
+      RETURN_NOT_OK(
+          swapper_child_visitor.SwapChildren((*child_field.get()->type()).fields()));
+      i++;
+    }
+    return Status::OK();
+  }
+
+  template <typename VALUE_TYPE>
+  Status SwapOffset(int index) {
+    if (data_->buffers[index] == nullptr) {
+      return Status::OK();
+    }
+    auto data = reinterpret_cast<const VALUE_TYPE*>(data_->buffers[index]->data());
+    ARROW_ASSIGN_OR_RAISE(auto new_buffer,
+                          AllocateBuffer(data_->buffers[index]->size() + 1));
+    auto new_data = reinterpret_cast<VALUE_TYPE*>(new_buffer->mutable_data());
+    // offset has one more element rather than data->length
+    int64_t length = length_ + 1;
+    for (int64_t i = 0; i < length; i++) {
+#if ARROW_LITTLE_ENDIAN
+      new_data[i] = BitUtil::FromBigEndian(data[i]);
+#else
+      new_data[i] = BitUtil::FromLittleEndian(data[i]);
+#endif
+    }
+    data_->buffers[index] = std::move(new_buffer);
+    return Status::OK();
+  }
+
+  Status SwapSmallOffset(int index = 1) { return SwapOffset<int32_t>(index); }
+
+  Status SwapLargeOffset() { return SwapOffset<int64_t>(1); }
+
+  template <typename T>
+  Status Visit(const T&) {
+    using value_type = typename T::c_type;
+    auto data = reinterpret_cast<const value_type*>(data_->buffers[1]->data());
+    ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+    auto new_data = reinterpret_cast<value_type*>(new_buffer->mutable_data());
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+#if ARROW_LITTLE_ENDIAN
+      new_data[i] = BitUtil::FromBigEndian(data[i]);
+#else
+      new_data[i] = BitUtil::FromLittleEndian(data[i]);
+#endif
+    }
+    data_->buffers[1] = std::move(new_buffer);
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal128Type& type) {
+    auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+    ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+    auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+      uint64_t tmp;
+      auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+      tmp = BitUtil::FromBigEndian(data[idx]);
+      new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
+      new_data[idx + 1] = tmp;
+#else
+      tmp = BitUtil::FromLittleEndian(data[idx]);
+      new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
+      new_data[idx + 1] = tmp;
+#endif
+    }
+    data_->buffers[1] = std::move(new_buffer);
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal256Type& type) {
+    auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+    ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+    auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+      uint64_t tmp0, tmp1, tmp2;
+      auto idx = i * 4;
+#if ARROW_LITTLE_ENDIAN
+      tmp0 = BitUtil::FromBigEndian(data[idx]);
+      tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
+      tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
+      new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
+      new_data[idx + 1] = tmp2;
+      new_data[idx + 2] = tmp1;
+      new_data[idx + 3] = tmp0;
+#else
+      tmp0 = BitUtil::FromLittleEndian(data[idx]);
+      tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
+      tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
+      new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
+      new_data[idx + 1] = tmp2;
+      new_data[idx + 2] = tmp1;
+      new_data[idx + 3] = tmp0;
+#endif
+    }
+    data_->buffers[1] = std::move(new_buffer);
+    return Status::OK();
+  }
+
+  Status Visit(const DayTimeIntervalType& type) {
+    auto data = reinterpret_cast<const uint32_t*>(data_->buffers[1]->data());
+    ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+    auto new_data = reinterpret_cast<uint32_t*>(new_buffer->mutable_data());
+    int64_t length = length_;
+    for (int64_t i = 0; i < length; i++) {
+      auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+      new_data[idx] = BitUtil::FromBigEndian(data[idx]);
+      new_data[idx + 1] = BitUtil::FromBigEndian(data[idx + 1]);
+#else
+      new_data[idx] = BitUtil::FromLittleEndian(data[idx]);
+      new_data[idx + 1] = BitUtil::FromLittleEndian(data[idx + 1]);
+#endif
+    }
+    data_->buffers[1] = std::move(new_buffer);
+    return Status::OK();
+  }
+
+  Status Visit(const NullType& type) { return Status::OK(); }
+  Status Visit(const BooleanType& type) { return Status::OK(); }
+  Status Visit(const Int8Type& type) { return Status::OK(); }
+  Status Visit(const UInt8Type& type) { return Status::OK(); }
+  Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
+  Status Visit(const FixedSizeListType& type) { return Status::OK(); }
+  Status Visit(const StructType& type) { return Status::OK(); }
+  Status Visit(const SparseUnionType& type) { return Status::OK(); }
+
+  Status Visit(const StringType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    return Status::OK();
+  }
+  Status Visit(const LargeStringType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    return Status::OK();
+  }
+  Status Visit(const BinaryType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    return Status::OK();
+  }
+  Status Visit(const LargeBinaryType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    return Status::OK();
+  }
+
+  Status Visit(const ListType& type) {
+    RETURN_NOT_OK(SwapSmallOffset());
+    return Status::OK();
+  }
+  Status Visit(const LargeListType& type) {
+    RETURN_NOT_OK(SwapLargeOffset());
+    return Status::OK();
+  }
+
+  Status Visit(const MapType& type) {

Review comment:
       @wesm would it be possible to give me some comments?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org