Posted to dev@parquet.apache.org by "jiang,longshan (Jira)" <ji...@apache.org> on 2021/09/25 15:59:00 UTC

[jira] [Created] (PARQUET-2095) [C++] Read Parquet file with MapArray

jiang,longshan created PARQUET-2095:
---------------------------------------

             Summary: [C++] Read Parquet file with MapArray
                 Key: PARQUET-2095
                 URL: https://issues.apache.org/jira/browse/PARQUET-2095
             Project: Parquet
          Issue Type: Improvement
          Components: parquet-cpp
    Affects Versions: cpp-4.0.0
         Environment: arrow-apache-arrow-3.0.0
C++ library

Linux operating system
            Reporter: jiang,longshan
             Fix For: cpp-6.0.0


The Parquet format reduces storage space effectively, and we have been using it with HDFS + Hive JNI (calling C++) + Spark JNI (calling C++); that setup works well. We are now starting a new project in pure C++ with higher performance expectations, but we have hit a blocking issue: how to read a Parquet file that contains MapArray columns such as

list<array_element: map<string, list<array_element: int64>>>

list<array_element: map<string, string>>

map<string, list<array_element: int64>>

 

I only know how to handle the types that do not involve a map struct, such as

list<array_element: string>, list<array_element: list<array_element: string>>

Here is a code example. Please give me some advice on how to read a Parquet file with a map type, thanks a lot!

 
{code:java}
#include "gflags/gflags.h"
#include "parquet_file_reader.h"
#include "parquet/arrow/reader.h"

int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    arrow::Status st;
    arrow::MemoryPool* pool = ::arrow::default_memory_pool();

    // Open the Parquet file for random access reads.
    std::shared_ptr<::arrow::io::RandomAccessFile> _infile;
    PARQUET_ASSIGN_OR_THROW(
                    _infile,
                    ::arrow::io::ReadableFile::Open(FLAGS_input_file,
                            ::arrow::default_memory_pool()));
    // Open Parquet file reader
    std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
    st = parquet::arrow::OpenFile(_infile, pool, &arrow_reader);
    if (!st.ok()) {
            LOG(ERROR) << "open file failed " << FLAGS_input_file;
            return 1;
    }


    // Read entire file as a single Arrow table
    std::shared_ptr<arrow::Table> table;
    st = arrow_reader->ReadTable(&table);
    if (!st.ok()) {
            LOG(ERROR) << "read file to table failed " << FLAGS_input_file;
            return 1;
    }
     
    size_t num_cols = table->num_columns();
    for (size_t idx = 0; idx < num_cols; idx++) {
      auto this_field = table->field(idx);
      auto this_column = table->column(idx);
    if (this_field->name() == "lls_column") { // works. LLS of list<array_element: list<array_element: string>>
        for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
          auto row_array =
            std::static_pointer_cast<arrow::ListArray>(this_column->chunk(c_idx));
          auto sample_array =
            std::static_pointer_cast<arrow::ListArray>(row_array->values());
          auto id_array =
            std::static_pointer_cast<arrow::StringArray>(sample_array->values());
          for (int64_t i = 0; i < table->num_rows(); i++) {
            auto offset = row_array->value_offset(i);
            auto count = row_array->value_length(i);
            for (auto x = 0; x < count; x++) {
              std::vector<std::string> result;
              auto sample_offset = sample_array->value_offset(offset+x);
              auto id_count = sample_array->value_length(offset+x);
              for (auto id = 0; id < id_count; id++) {
                int32_t len;
                const uint8_t* addr = id_array->GetValue(sample_offset + id, &len);
                result.push_back(std::string(reinterpret_cast<const char*>(addr), len));
              }
              LOG(INFO) << "LLS " << count << " " << this_field->name() << " " << to_string(result);
            }
          }
        }
      }
      else if (this_field->name() == "ms2li_column") { // MS2LI type: map<string, list<array_element: int64>> 
        LOG(INFO)  << "col name: " << this_field->name() << " type: " << this_field->type()->ToString();
        LOG(INFO)  << "length: " << this_column->length() << " chunk num: " << this_column->num_chunks();
        for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
          auto row_array =
            std::static_pointer_cast<arrow::MapArray>(this_column->chunk(c_idx));
          auto keys_array =
            std::static_pointer_cast<arrow::StringArray>(row_array->keys());
          auto item_array =
            std::static_pointer_cast<arrow::ListArray>(row_array->items());
          // items() is the list<int64> child, so its values() is the flat Int64Array.
          auto id_array =
            std::static_pointer_cast<arrow::Int64Array>(item_array->values());
          // I've no idea how to traverse the map<string, list<array_element: int64>> row by row correctly.
        }
      }
    }

{code}
It seems that arrow::MapArray::keys() and items() lose each map pair's per-row offset, and I cannot find the right key/value pair for the list<array_element: map<string, string>> format either. Can anyone help me with this? Thanks a lot!
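
For reference, here is the kind of per-row traversal I am trying to get to, written as a rough, unvalidated sketch. It assumes that arrow::MapArray in arrow-apache-arrow-3.0.0 derives from arrow::ListArray and therefore exposes value_offset()/value_length() per row; the helper name DumpMs2liChunk is only for illustration.
{code:java}
// Rough sketch (not validated): walk one chunk of a map<string, list<int64>> column,
// assuming arrow::MapArray derives from arrow::ListArray and keeps per-row offsets.
#include <iostream>
#include <memory>
#include <string>

#include "arrow/api.h"

void DumpMs2liChunk(const std::shared_ptr<arrow::Array>& chunk) {
  auto map_array = std::static_pointer_cast<arrow::MapArray>(chunk);
  // keys() and items() are the flattened child arrays shared by all rows.
  auto keys = std::static_pointer_cast<arrow::StringArray>(map_array->keys());
  auto items = std::static_pointer_cast<arrow::ListArray>(map_array->items());
  auto ints = std::static_pointer_cast<arrow::Int64Array>(items->values());

  for (int64_t row = 0; row < map_array->length(); ++row) {
    // Row `row` owns entries [entry_offset, entry_offset + entry_count)
    // of the flattened keys/items children.
    const auto entry_offset = map_array->value_offset(row);
    const auto entry_count = map_array->value_length(row);
    for (auto e = 0; e < entry_count; ++e) {
      const auto entry = entry_offset + e;
      const std::string key = keys->GetString(entry);
      // Each item is itself a list<int64>; slice into the flat int64 child.
      const auto v_offset = items->value_offset(entry);
      const auto v_count = items->value_length(entry);
      std::cout << "row " << row << " key " << key << " values";
      for (auto v = 0; v < v_count; ++v) {
        std::cout << " " << ints->Value(v_offset + v);
      }
      std::cout << std::endl;
    }
  }
}
{code}
If that assumption holds, list<array_element: map<string, string>> should work the same way: slice the outer ListArray first, then slice the MapArray at the resulting entry positions.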



--
This message was sent by Atlassian Jira
(v8.3.4#803005)