You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Ke Jia (Jira)" <ji...@apache.org> on 2022/10/24 05:28:00 UTC

[jira] [Updated] (ARROW-18140) The metadata info will be lost in parquet file schema after writing the parquet file by calling the FileSystemDataset::Write() method.

     [ https://issues.apache.org/jira/browse/ARROW-18140?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Ke Jia updated ARROW-18140:
---------------------------
    Description: 
This issue can be reproduced by the following code.

auto format = std::make_shared<ParquetFileFormat>();
auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
FileSystemDatasetWriteOptions write_options;
write_options.file_write_options = format->DefaultWriteOptions();
write_options.filesystem = fs;
write_options.base_dir = "root";
write_options.partitioning = std::make_shared<HivePartitioning>(schema({}));
write_options.basename_template = "\{i}.parquet";
auto metadata =
    std::shared_ptr<KeyValueMetadata>(new KeyValueMetadata(\{"foo"}, \{"bar"}));
auto dataset_schema = schema(\{field("a", int64())}, metadata);
RecordBatchVector batches{
    ConstantArrayGenerator::Zeroes(kRowsPerBatch, dataset_schema)};
ASSERT_EQ(0, batches[0]->column(0)->null_count());
auto dataset = std::make_shared<InMemoryDataset>(dataset_schema, batches);
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
ASSERT_OK(scanner_builder->Project(
    \{compute::call("add", {compute::field_ref("a"), compute::literal(1)})},
    \{"a_plus_one"}));
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());

// Before the write, the schema has the metadata info.
ASSERT_EQ(1, dataset_schema->HasMetadata());
ASSERT_OK(FileSystemDataset::Write(write_options, scanner));

ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make(
                                               fs, \{"root/0.parquet"}, format, {}));
ASSERT_OK_AND_ASSIGN(auto written_dataset, dataset_factory->Finish(FinishOptions{}));

// After the write, the schema does not have the metadata info.
ASSERT_EQ(0, written_dataset->schema()->HasMetadata());

> The metadata info will be lost in parquet file schema after writing the parquet file by calling the FileSystemDataset::Write() method.
> --------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-18140
>                 URL: https://issues.apache.org/jira/browse/ARROW-18140
>             Project: Apache Arrow
>          Issue Type: Bug
>            Reporter: Ke Jia
>            Priority: Major
>
> This issue can be reproduced by the following code.
> auto format = std::make_shared<ParquetFileFormat>();
> auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
> FileSystemDatasetWriteOptions write_options;
> write_options.file_write_options = format->DefaultWriteOptions();
> write_options.filesystem = fs;
> write_options.base_dir = "root";
> write_options.partitioning = std::make_shared<HivePartitioning>(schema({}));
> write_options.basename_template = "\{i}.parquet";
> auto metadata =
>     std::shared_ptr<KeyValueMetadata>(new KeyValueMetadata(\{"foo"}, \{"bar"}));
> auto dataset_schema = schema(\{field("a", int64())}, metadata);
> RecordBatchVector batches{
>     ConstantArrayGenerator::Zeroes(kRowsPerBatch, dataset_schema)};
> ASSERT_EQ(0, batches[0]->column(0)->null_count());
> auto dataset = std::make_shared<InMemoryDataset>(dataset_schema, batches);
> ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
> ASSERT_OK(scanner_builder->Project(
>     \{compute::call("add", {compute::field_ref("a"), compute::literal(1)})},
>     \{"a_plus_one"}));
> ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
> // Before the write, the schema has the metadata info.
> ASSERT_EQ(1, dataset_schema->HasMetadata());
> ASSERT_OK(FileSystemDataset::Write(write_options, scanner));
> ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make(
>                                                fs, \{"root/0.parquet"}, format, {}));
> ASSERT_OK_AND_ASSIGN(auto written_dataset, dataset_factory->Finish(FinishOptions{}));
> // After the write, the schema does not have the metadata info.
> ASSERT_EQ(0, written_dataset->schema()->HasMetadata());



--
This message was sent by Atlassian Jira
(v8.20.10#820010)