You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/30 08:58:20 UTC
[arrow] branch master updated: PARQUET-1160: [C++] Implement
BYTE_ARRAY-backed Decimal reads
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 74bf736 PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads
74bf736 is described below
commit 74bf7367669f12cd48627e0848887b504c560fa8
Author: Ted Haining <th...@xcalar.com>
AuthorDate: Sun Sep 30 10:58:11 2018 +0200
PARQUET-1160: [C++] Implement BYTE_ARRAY-backed Decimal reads
This change enables support for DECIMALs backed by BYTE_ARRAYs on disk. It does this by creating a TransferFunctor routine that transforms a ByteArrayType to an ::arrow::Decimal128Type.
The routine does this by:
1. Creating an arrow::BinaryArray from the RecordReader's builder
2. Allocating a buffer for the arrow::Decimal128Array
3. Converting the big-endian bytes in each BinaryArray entry to two integers
representing the high and low bits of each decimal value.
Author: Ted Haining <th...@xcalar.com>
Closes #2646 from thaining/parquet-1160-byte-array-decimals and squashes the following commits:
0bad382e <Ted Haining> Updated parquet-testing to SHA that includes necessary test files.
30f3a278 <Ted Haining> This change enables support for DECIMALs backed by BYTE_ARRAYs on disk. It does this by creating a TransferFunctor routine that transforms a ByteArrayType to an ::arrow::Decimal128Type.
---
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 9 ++--
cpp/src/parquet/arrow/reader.cc | 64 ++++++++++++++++++++++-
cpp/submodules/parquet-testing | 2 +-
3 files changed, 69 insertions(+), 6 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 5f4e123..30dbf4a 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2316,11 +2316,11 @@ TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table));
}
-class TestArrowReaderAdHocSpark
+class TestArrowReaderAdHocSparkAndHvr
: public ::testing::TestWithParam<
std::tuple<std::string, std::shared_ptr<::DataType>>> {};
-TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
+TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) {
std::string path(test::get_data_dir());
std::string filename;
@@ -2364,12 +2364,13 @@ TEST_P(TestArrowReaderAdHocSpark, ReadDecimals) {
}
INSTANTIATE_TEST_CASE_P(
- ReadDecimals, TestArrowReaderAdHocSpark,
+ ReadDecimals, TestArrowReaderAdHocSparkAndHvr,
::testing::Values(
std::make_tuple("int32_decimal.parquet", ::arrow::decimal(4, 2)),
std::make_tuple("int64_decimal.parquet", ::arrow::decimal(10, 2)),
std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal(25, 2)),
- std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2))));
+ std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal(13, 2)),
+ std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 2))));
} // namespace arrow
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 11fb20c..da6061d 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -1221,6 +1221,64 @@ struct TransferFunctor<::arrow::Decimal128Type, FLBAType> {
}
};
+/// \brief Transfer BYTE_ARRAY-backed decimal values into an arrow::Decimal128Array
+/// Fails with Status::Invalid when an entry exceeds the decimal byte width. We do this by:
+/// 1. Finishing the RecordReader's builder into a temporary arrow::BinaryArray
+/// 2. Allocating a buffer of length * byte_width bytes for the arrow::Decimal128Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+/// representing the high and low bits of each decimal value.
+template <>
+struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> {
+ Status operator()(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<::arrow::DataType>& type,
+ std::shared_ptr<Array>* out) {
+ DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL);
+
+ // Finish the built data into a temporary array; it is read below as a BinaryArray
+ std::shared_ptr<Array> array;
+ RETURN_NOT_OK(reader->builder()->Finish(&array));
+ const auto& binary_array = static_cast<const ::arrow::BinaryArray&>(*array);
+
+ const int64_t length = binary_array.length();
+
+ const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data));
+
+ // write cursor into the output buffer; advanced by type_length after each row
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+
+ // convert each BinaryArray value to decimal bytes, one type_length-sized slot per row
+ for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+ int32_t record_len = 0;
+ const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
+
+ if ((record_len < 0) || (record_len > type_length)) {
+ return Status::Invalid("Invalid BYTE_ARRAY size");
+ }
+
+ auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+ out_ptr_view[0] = 0;
+ out_ptr_view[1] = 0;
+
+ // null rows keep the zeros written just above; every non-null row (or every row,
+ // when the array reports no nulls) is converted
+ if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) {
+ RawBytesToDecimalBytes(record_loc, record_len, out_ptr);
+ }
+ }
+
+ *out = std::make_shared<::arrow::Decimal128Array>(
+ type, length, data, binary_array.null_bitmap(), null_count);
+
+ return Status::OK();
+ }
+};
+
/// \brief Convert an Int32 or Int64 array into a Decimal128Array
/// The parquet spec allows systems to write decimals in int32, int64 if the values are
/// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
@@ -1353,12 +1411,16 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr<Array>*
case ::parquet::Type::INT64: {
TRANSFER_DATA(::arrow::Decimal128Type, Int64Type);
} break;
+ case ::parquet::Type::BYTE_ARRAY: {
+ TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType);
+ } break;
case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
TRANSFER_DATA(::arrow::Decimal128Type, FLBAType);
} break;
default:
return Status::Invalid(
- "Physical type for decimal must be int32, int64, or fixed length binary");
+ "Physical type for decimal must be int32, int64, byte array, or fixed "
+ "length binary");
}
} break;
case ::arrow::Type::TIMESTAMP: {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index 48a657c..46ae260 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit 48a657ca05eb308539f3f00c698e8bb5185d9b38
+Subproject commit 46ae2605c2de306f5740587107dcf333a527f2d1