You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/28 09:25:16 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12829: ARROW-16116: [C++] Handle non-nullable fields when reading Parquet

pitrou commented on code in PR #12829:
URL: https://github.com/apache/arrow/pull/12829#discussion_r860676066


##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -434,14 +480,26 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool,
   DCHECK(binary_reader);
   auto chunks = binary_reader->GetBuilderChunks();
   for (auto& chunk : chunks) {
-    if (!chunk->type()->Equals(*logical_value_type)) {
+    if (!chunk->type()->Equals(*logical_type_field->type())) {
       // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
       // will be lost because they are first created as int32 and then cast to int64.
       ARROW_ASSIGN_OR_RAISE(
-          chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
+          chunk,
+          ::arrow::compute::Cast(*chunk, logical_type_field->type(), cast_options, &ctx));
     }
   }
-  *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
+  if (!logical_type_field->nullable()) {

Review Comment:
   Perhaps factor this operation out into a helper function and refactor the code to use it?



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -409,22 +441,36 @@ Status TransferDate64(RecordReader* reader, MemoryPool* pool,
 
 Status TransferDictionary(RecordReader* reader,
                           const std::shared_ptr<DataType>& logical_value_type,
-                          std::shared_ptr<ChunkedArray>* out) {
+                          bool nullable, std::shared_ptr<ChunkedArray>* out) {
   auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
   DCHECK(dict_reader);
   *out = dict_reader->GetResult();
   if (!logical_value_type->Equals(*(*out)->type())) {
     ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
   }
+  if (!nullable) {
+    // Reconstruct each chunk without nulls.

Review Comment:
   Is this necessary for correctness, or is it just an optimization that elides the null buffer?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org