You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@arrow.apache.org by Kartik Thakore <ka...@hotg.ai> on 2022/07/19 01:35:24 UTC

[C++][Parquet] How to write List of UTF8 to parquet::StreamWriter

Hello,

I am working on the following schema:

1. Repeat

std::shared_ptr<parquet::schema::GroupNode> GetSchema()
{
parquet::schema::NodeVector fields;
{
auto element = parquet::schema::PrimitiveNode::Make("string", parquet::
Repetition::OPTIONAL,
parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::
REPEATED, {element});
fields.push_back(
parquet::schema::GroupNode::Make("path", parquet::Repetition::REQUIRED, {
list}, parquet::ConvertedType::LIST));

fields.push_back(
parquet::schema::PrimitiveNode::Make(
"vstr", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::
ConvertedType::UTF8));
fields.push_back(
parquet::schema::PrimitiveNode::Make(
"vint", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::
ConvertedType::NONE));
fields.push_back(
parquet::schema::PrimitiveNode::Make(
"vfloat", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::
ConvertedType::NONE));
fields.push_back(
parquet::schema::PrimitiveNode::Make(
"vbool", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::
ConvertedType::NONE));
}

return std::static_pointer_cast<parquet::schema::GroupNode>(
parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED,
fields));
}


...

// Empty node vector; it is not referenced again anywhere in this excerpt.
parquet::schema::NodeVector columnNames_{};

// Root schema group produced by GetSchema().
std::shared_ptr<parquet::schema::GroupNode> schema = GetSchema();

// Open a Parquet file writer for `outfile` with that schema and the
// properties from `builder.build()`, then wrap it in a StreamWriter.
// `outfile` and `builder` are defined outside this excerpt.
this->pq_oss_stream = parquet::StreamWriter{
parquet::ParquetFileWriter::Open(outfile, schema, builder.build())};

...

// Path components to be written into the "path" list column (initialisation
// elided in the original post).
vector<string> current_path ...; //

// Streams one row: current_path, then three optionals (uint32_t, float, bool)
// each constructed with {{}}, then the end-of-row marker.
// NOTE(review): GetSchema() declares five columns (path, vstr, vint as INT64,
// vfloat as DOUBLE, vbool) but only four values are streamed here, and
// uint32_t/float do not match INT64/DOUBLE — confirm against StreamWriter's
// per-column type checks.
// NOTE(review): no operator<< for vector<string> is visible in this excerpt;
// this is the line the question is about.
this->pq_oss_stream << current_path << optional<uint32_t>{{}} << optional<
float>{{}} << optional<bool>{{}} << parquet::EndRow;


How do I write the current_path to pq_oss_stream?

-- 
*Kartik Thakore*

Re: [C++][Parquet] How to write List of UTF8 to parquet::StreamWriter

Posted by Micah Kornfield <em...@gmail.com>.
I don't think the C++ stream API supports repeated or nested fields, so I
don't think this is possible unless you use the lower-level APIs or go
through the Arrow bindings.

On Monday, July 18, 2022, Kartik Thakore <ka...@hotg.ai> wrote:

> Hello,
>
> I am working on the following schema:
>
> 1. Repeat
>
> std::shared_ptr<parquet::schema::GroupNode> GetSchema()
> {
> parquet::schema::NodeVector fields;
> {
> auto element = parquet::schema::PrimitiveNode::Make("string", parquet::
> Repetition::OPTIONAL,
> parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
> auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::
> REPEATED, {element});
> fields.push_back(
> parquet::schema::GroupNode::Make("path", parquet::Repetition::REQUIRED, {
> list}, parquet::ConvertedType::LIST));
>
> fields.push_back(
> parquet::schema::PrimitiveNode::Make(
> "vstr", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet
> ::ConvertedType::UTF8));
> fields.push_back(
> parquet::schema::PrimitiveNode::Make(
> "vint", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::
> ConvertedType::NONE));
> fields.push_back(
> parquet::schema::PrimitiveNode::Make(
> "vfloat", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::
> ConvertedType::NONE));
> fields.push_back(
> parquet::schema::PrimitiveNode::Make(
> "vbool", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::
> ConvertedType::NONE));
> }
>
> return std::static_pointer_cast<parquet::schema::GroupNode>(
> parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED,
> fields));
> }
>
>
> ...
>
> parquet::schema::NodeVector columnNames_{};
>
> std::shared_ptr<parquet::schema::GroupNode> schema = GetSchema();
>
> this->pq_oss_stream = parquet::StreamWriter{
> parquet::ParquetFileWriter::Open(outfile, schema, builder.build())};
>
> ...
>
> vector<string> current_path ...; //
>
> this->pq_oss_stream << current_path << optional<uint32_t>{{}} << optional<
> float>{{}} << optional<bool>{{}} << parquet::EndRow;
>
>
> How do I write the current_path to pq_oss_stream?
>
> --
> *Kartik Thakore*
>