You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ma...@apache.org on 2024/02/02 19:54:53 UTC
(arrow) branch main updated: GH-39843: [C++][Parquet] Parquet binary length overflow exception should contain the length of binary (#39844)
This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 32bd01fa64 GH-39843: [C++][Parquet] Parquet binary length overflow exception should contain the length of binary (#39844)
32bd01fa64 is described below
commit 32bd01fa64b275937ca90aa50b11f275eeefde94
Author: mwish <ma...@gmail.com>
AuthorDate: Sat Feb 3 03:54:47 2024 +0800
GH-39843: [C++][Parquet] Parquet binary length overflow exception should contain the length of binary (#39844)
### Rationale for this change
See https://github.com/apache/arrow/issues/39843
It would be helpful to include the offending string length in the decoder/encoder error messages.
### What changes are included in this PR?
Change the error messages thrown by the encoders (and the page writer) so they include the actual length that overflowed.
### Are these changes tested?
No; this change only alters error-message text, so no new tests are added.
### Are there any user-facing changes?
Yes: the thrown error messages are more specific — they now report the offending length alongside the overflow notice.
* Closes: #39843
Authored-by: mwish <ma...@gmail.com>
Signed-off-by: mwish <ma...@gmail.com>
---
cpp/src/parquet/column_writer.cc | 3 ++-
cpp/src/parquet/encoding.cc | 18 ++++++++++++------
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 23366b2daa..eae8fc6125 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -442,7 +442,8 @@ class SerializedPageWriter : public PageWriter {
if (offset_index_builder_ != nullptr) {
const int64_t compressed_size = output_data_len + header_size;
if (compressed_size > std::numeric_limits<int32_t>::max()) {
- throw ParquetException("Compressed page size overflows INT32_MAX.");
+ throw ParquetException("Compressed page size ", compressed_size,
+ " overflows INT32_MAX.");
}
if (!page.first_row_index().has_value()) {
throw ParquetException("First row index is not set in data page.");
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 5573f5b9ae..a3d1746536 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -160,7 +160,8 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
*array.data(),
[&](::std::string_view view) {
if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ return Status::Invalid(
+ "Parquet cannot store strings with size 2GB or more, got: ", view.size());
}
UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
return Status::OK();
@@ -571,7 +572,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
*array.data(),
[&](::std::string_view view) {
if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ return Status::Invalid(
+ "Parquet cannot store strings with size 2GB or more, got: ", view.size());
}
PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
return Status::OK();
@@ -585,7 +587,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
for (int64_t i = 0; i < array.length(); i++) {
auto v = array.GetView(i);
if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
- throw ParquetException("Parquet cannot store strings with size 2GB or more");
+ throw ParquetException(
+ "Parquet cannot store strings with size 2GB or more, got: ", v.size());
}
dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
int32_t unused_memo_index;
@@ -2671,7 +2674,8 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl,
*array.data(),
[&](::std::string_view view) {
if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ return Status::Invalid(
+ "Parquet cannot store strings with size 2GB or more, got: ", view.size());
}
length_encoder_.Put({static_cast<int32_t>(view.length())}, 1);
PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length()));
@@ -3200,7 +3204,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder<DT
*array.data(),
[&](::std::string_view view) {
if (ARROW_PREDICT_FALSE(view.size() >= kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ return Status::Invalid(
+ "Parquet cannot store strings with size 2GB or more, got: ", view.size());
}
const ByteArray src{view};
@@ -3246,7 +3251,8 @@ struct ByteArrayVisitor {
std::string_view operator[](int i) const {
if (ARROW_PREDICT_FALSE(src[i].len >= kMaxByteArraySize)) {
- throw ParquetException("Parquet cannot store strings with size 2GB or more");
+ throw ParquetException("Parquet cannot store strings with size 2GB or more, got: ",
+ src[i].len);
}
return std::string_view{src[i]};
}