You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/30 08:58:20 UTC

[arrow] branch master updated: PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 74bf736  PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads
74bf736 is described below

commit 74bf7367669f12cd48627e0848887b504c560fa8
Author: Ted Haining <th...@xcalar.com>
AuthorDate: Sun Sep 30 10:58:11 2018 +0200

    PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads
    
    This change enables support for DECIMALs backed by BYTE_ARRAYs on disk. It does this by creating a TransferFunctor routine that transforms a ByteArrayType to an ::arrow::Decimal128Type.
    
    The routine does this by:
    1. Creating an arrow::BinaryArray from the RecordReader's builder
    2. Allocating a buffer for the arrow::Decimal128Array
    3. Converting the big-endian bytes in each BinaryArray entry to two integers
       representing the high and low bits of each decimal value.
    
    Author: Ted Haining <th...@xcalar.com>
    
    Closes #2646 from thaining/parquet-1160-byte-array-decimals and squashes the following commits:
    
    0bad382e <Ted Haining> Updated parquet-testing to SHA that includes necessary test files.
    30f3a278 <Ted Haining> This change enables support for DECIMALs backed by BYTE_ARRAYs on disk. It does this by creating a TransferFunctor routine that transforms a ByteArrayType to an ::arrow::Decimal128Type.
---
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc |  9 ++--
 cpp/src/parquet/arrow/reader.cc                   | 64 ++++++++++++++++++++++-
 cpp/submodules/parquet-testing                    |  2 +-
 3 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 5f4e123..30dbf4a 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2316,11 +2316,11 @@ TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
   ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table));
 }
 
-class TestArrowReaderAdHocSpark
+class TestArrowReaderAdHocSparkAndHvr
     : public ::testing::TestWithParam<
           std::tuple<std::string, std::shared_ptr<::DataType>>> {};
 
-TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
+TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) {
   std::string path(test::get_data_dir());
 
   std::string filename;
@@ -2364,12 +2364,13 @@ TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
 }
 
 INSTANTIATE_TEST_CASE_P(
-    ReadDecimals, TestArrowReaderAdHocSpark,
+    ReadDecimals, TestArrowReaderAdHocSparkAndHvr,
     ::testing::Values(
         std::make_tuple("int32_decimal.parquet", ::arrow::decimal(4, 2)),
         std::make_tuple("int64_decimal.parquet", ::arrow::decimal(10, 2)),
         std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal(25, 2)),
-        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2))));
+        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
+        std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));
 
 }  // namespace arrow
 
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 11fb20c..da6061d 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -1221,6 +1221,64 @@ struct TransferFunctor<::arrow::Decimal128Type, FLBAType> {
   }
 };
 
+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal128Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+///    representing the high and low bits of each decimal value.
+template <>
+struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> {
+  Status operator()(RecordReader* reader, MemoryPool* pool,
+                    const std::shared_ptr<::arrow::DataType>& type,
+                    std::shared_ptr<Array>* out) {
+    DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL);
+
+    // Finish the built data into a temporary array
+    std::shared_ptr<Array> array;
+    RETURN_NOT_OK(reader->builder()->Finish(&array));
+    const auto& binary_array = static_cast<const ::arrow::BinaryArray&>(*array);
+
+    const int64_t length = binary_array.length();
+
+    const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(*type);
+    const int64_t type_length = decimal_type.byte_width();
+
+    std::shared_ptr<Buffer> data;
+    RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data));
+
+    // raw bytes that we can write to
+    uint8_t* out_ptr = data->mutable_data();
+
+    const int64_t null_count = binary_array.null_count();
+
+    // convert each BinaryArray value to valid decimal bytes
+    for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+      int32_t record_len = 0;
+      const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
+
+      if ((record_len < 0) || (record_len > type_length)) {
+        return Status::Invalid("Invalid BYTE_ARRAY size");
+      }
+
+      auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+      out_ptr_view[0] = 0;
+      out_ptr_view[1] = 0;
+
+      // only convert rows that are not null if there are nulls, or
+      // all rows, if there are not
+      if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) {
+        RawBytesToDecimalBytes(record_loc, record_len, out_ptr);
+      }
+    }
+
+    *out = std::make_shared<::arrow::Decimal128Array>(
+        type, length, data, binary_array.null_bitmap(), null_count);
+
+    return Status::OK();
+  }
+};
+
 /// \brief Convert an Int32 or Int64 array into a Decimal128Array
 /// The parquet spec allows systems to write decimals in int32, int64 if the values are
 /// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
@@ -1353,12 +1411,16 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr<Array>*
         case ::parquet::Type::INT64: {
           TRANSFER_DATA(::arrow::Decimal128Type, Int64Type);
         } break;
+        case ::parquet::Type::BYTE_ARRAY: {
+          TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType);
+        } break;
         case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
           TRANSFER_DATA(::arrow::Decimal128Type, FLBAType);
         } break;
         default:
           return Status::Invalid(
-              "Physical type for decimal must be int32, int64, or fixed length binary");
+              "Physical type for decimal must be int32, int64, byte array, or fixed "
+              "length binary");
       }
     } break;
     case ::arrow::Type::TIMESTAMP: {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 48a657c..46ae260 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 48a657ca05eb308539f3f00c698e8bb5185d9b38
+Subproject commit 46ae2605c2de306f5740587107dcf333a527f2d1