You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/09/27 22:20:00 UTC

[jira] [Commented] (PARQUET-1160) [C++] Implement BYTE_ARRAY-backed Decimal reads

    [ https://issues.apache.org/jira/browse/PARQUET-1160?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16631112#comment-16631112 ] 

ASF GitHub Bot commented on PARQUET-1160:
-----------------------------------------

wesm closed pull request #495: PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads
URL: https://github.com/apache/parquet-cpp/pull/495
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/data/byte_array_decimal.parquet b/data/byte_array_decimal.parquet
new file mode 100644
index 00000000..798cb2aa
Binary files /dev/null and b/data/byte_array_decimal.parquet differ
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 5f4e1234..30dbf4ad 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2316,11 +2316,11 @@ TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
   ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table));
 }
 
-class TestArrowReaderAdHocSpark
+class TestArrowReaderAdHocSparkAndHvr
     : public ::testing::TestWithParam<
           std::tuple<std::string, std::shared_ptr<::DataType>>> {};
 
-TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
+TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) {
   std::string path(test::get_data_dir());
 
   std::string filename;
@@ -2364,12 +2364,13 @@ TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
 }
 
 INSTANTIATE_TEST_CASE_P(
-    ReadDecimals, TestArrowReaderAdHocSpark,
+    ReadDecimals, TestArrowReaderAdHocSparkAndHvr,
     ::testing::Values(
         std::make_tuple("int32_decimal.parquet", ::arrow::decimal(4, 2)),
         std::make_tuple("int64_decimal.parquet", ::arrow::decimal(10, 2)),
         std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal(25, 2)),
-        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2))));
+        std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
+        std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));
 
 }  // namespace arrow
 
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 2e4dc815..de7261ce 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -1220,6 +1220,66 @@ struct TransferFunctor<::arrow::Decimal128Type, FLBAType> {
   }
 };
 
+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal128Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+///    representing the high and low bits of each decimal value.
+template <>
+struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> {
+  Status operator()(RecordReader* reader, MemoryPool* pool,
+                    const std::shared_ptr<::arrow::DataType>& type,
+                    std::shared_ptr<Array>* out) {
+    DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL);
+
+    // Finish the built data into a temporary array
+    std::shared_ptr<Array> array;
+    RETURN_NOT_OK(reader->builder()->Finish(&array));
+    const auto& binary_array = static_cast<const ::arrow::BinaryArray&>(*array);
+
+    const int64_t length = binary_array.length();
+
+    const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(*type);
+    const int64_t type_length = decimal_type.byte_width();
+
+    std::shared_ptr<Buffer> data;
+    RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data));
+
+    // raw bytes that we can write to
+    uint8_t* out_ptr = data->mutable_data();
+
+    const int64_t null_count = binary_array.null_count();
+
+    // convert each BinaryArray value to valid decimal bytes
+    for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+
+      int32_t record_len = 0;
+      const uint8_t *record_loc = binary_array.GetValue(i, &record_len);
+
+      if ((record_len < 0) || (record_len > type_length)) {
+        return Status::Invalid("Invalid BYTE_ARRAY size");
+      }
+
+      auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+      out_ptr_view[0] = 0;
+      out_ptr_view[1] = 0;
+      
+      // only convert rows that are not null if there are nulls, or
+      // all rows, if there are not
+      if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) {
+        RawBytesToDecimalBytes(record_loc, record_len, out_ptr);
+      }
+    }
+
+    *out = std::make_shared<::arrow::Decimal128Array>(
+        type, length, data, binary_array.null_bitmap(), null_count);
+
+    return Status::OK();
+  }
+};
+
+
 /// \brief Convert an Int32 or Int64 array into a Decimal128Array
 /// The parquet spec allows systems to write decimals in int32, int64 if the values are
 /// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
@@ -1352,12 +1412,16 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr<Array>*
         case ::parquet::Type::INT64: {
           TRANSFER_DATA(::arrow::Decimal128Type, Int64Type);
         } break;
+        case ::parquet::Type::BYTE_ARRAY: {
+          TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType);
+        } break;
         case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
           TRANSFER_DATA(::arrow::Decimal128Type, FLBAType);
         } break;
         default:
           return Status::Invalid(
-              "Physical type for decimal must be int32, int64, or fixed length binary");
+              "Physical type for decimal must be int32, int64, byte array, or fixed "
+              "length binary");
       }
     } break;
     case ::arrow::Type::TIMESTAMP: {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [C++] Implement BYTE_ARRAY-backed Decimal reads
> -----------------------------------------------
>
>                 Key: PARQUET-1160
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1160
>             Project: Parquet
>          Issue Type: Task
>          Components: parquet-cpp
>    Affects Versions: cpp-1.3.0
>            Reporter: Phillip Cloud
>            Assignee: Phillip Cloud
>            Priority: Major
>              Labels: pull-request-available
>         Attachments: 20180726193815980.parquet
>
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> These are valid in the parquet spec, but it seems like no system in use today implements a writer for this type.
> What systems support writing Decimals with this underlying type?



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)