You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "wjones127 (via GitHub)" <gi...@apache.org> on 2023/04/07 21:37:32 UTC

[GitHub] [arrow] wjones127 commented on a diff in pull request #34955: GH-34335: [C++][Parquet] Optimize Decoding DELTA_LENGTH_BYTE_ARRAY

wjones127 commented on code in PR #34955:
URL: https://github.com/apache/arrow/pull/34955#discussion_r1160976252


##########
cpp/src/parquet/encoding.cc:
##########
@@ -2745,6 +2738,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
     int32_t data_size = 0;
     const int32_t* length_ptr =
         reinterpret_cast<const int32_t*>(buffered_length_->data()) + length_idx_;
+    int bytes_left = decoder_->bytes_left();

Review Comment:
   nit: could we rewrite this as `bytes_offset`? That way it's clearer why we are computing this before calling `decoder_->Advance()`
   
   ```suggestion
       int bytes_offset = len_ - decoder_->bytes_left();
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -2756,13 +2750,10 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
       }
     }
     length_idx_ += max_values;
-
-    PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size));
-    if (decoder_->GetBatch(8, buffered_data_->mutable_data(), data_size) != data_size) {
+    if (ARROW_PREDICT_FALSE(!decoder_->Advance(8 * static_cast<int64_t>(data_size)))) {
       ParquetException::EofException();
     }
-    const uint8_t* data_ptr = buffered_data_->data();
-
+    const uint8_t* data_ptr = data_ + (len_ - bytes_left);

Review Comment:
   ```suggestion
       const uint8_t* data_ptr = data_ + bytes_offset;
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -3071,8 +3061,14 @@ class DeltaByteArrayDecoder : public DecoderImpl,
     prefix_len_offset_ = 0;
     num_valid_values_ = num_prefix;
 
+    int bytes_left = decoder_->bytes_left();
+    int suffix_begins = len - bytes_left;
+    if (suffix_begins < 0) {
+      throw ParquetException(
+          "Wrong data in DeltaByteArrayDecoder: bytes_left more than length");
+    }

Review Comment:
   If this happens, doesn't that mean we have already read past the buffer end (buffer overflow)? Seems like that should be handled in the lengths decoder.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org