You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Ke Jia (Jira)" <ji...@apache.org> on 2022/10/24 05:28:00 UTC
[jira] [Updated] (ARROW-18140) The metadata info will be lost in the parquet file schema after writing the parquet file by calling the FileSystemDataset::Write() method.
[ https://issues.apache.org/jira/browse/ARROW-18140?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ke Jia updated ARROW-18140:
---------------------------
Description:
This issue can be reproduced by the following code.
auto format = std::make_shared<ParquetFileFormat>();
auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
FileSystemDatasetWriteOptions write_options;
write_options.file_write_options = format->DefaultWriteOptions();
write_options.filesystem = fs;
write_options.base_dir = "root";
write_options.partitioning = std::make_shared<HivePartitioning>(schema({}));
write_options.basename_template = "\{i}.parquet";
auto metadata =
std::shared_ptr<KeyValueMetadata>(new KeyValueMetadata(\{"foo"}, \{"bar"}));
auto dataset_schema = schema(\{field("a", int64())}, metadata);
RecordBatchVector batches{
ConstantArrayGenerator::Zeroes(kRowsPerBatch, dataset_schema)};
ASSERT_EQ(0, batches[0]->column(0)->null_count());
auto dataset = std::make_shared<InMemoryDataset>(dataset_schema, batches);
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
ASSERT_OK(scanner_builder->Project(
\{compute::call("add", {compute::field_ref("a"), compute::literal(1)})},
\{"a_plus_one"}));
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
// Before the write, the schema has the metadata info.
ASSERT_EQ(1, dataset_schema->HasMetadata());
ASSERT_OK(FileSystemDataset::Write(write_options, scanner));
ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make(
fs, \{"root/0.parquet"}, format, {}));
ASSERT_OK_AND_ASSIGN(auto written_dataset, dataset_factory->Finish(FinishOptions{}));
// After the write, the schema does not have the metadata info.
ASSERT_EQ(0, written_dataset->schema()->HasMetadata());
> The metadata info will be lost in the parquet file schema after writing the parquet file by calling the FileSystemDataset::Write() method.
> -----------------------------------------------------------------------------------------------------------------------------------
>
> Key: ARROW-18140
> URL: https://issues.apache.org/jira/browse/ARROW-18140
> Project: Apache Arrow
> Issue Type: Bug
> Reporter: Ke Jia
> Priority: Major
>
> This issue can be reproduced by the following code.
> auto format = std::make_shared<ParquetFileFormat>();
> auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
> FileSystemDatasetWriteOptions write_options;
> write_options.file_write_options = format->DefaultWriteOptions();
> write_options.filesystem = fs;
> write_options.base_dir = "root";
> write_options.partitioning = std::make_shared<HivePartitioning>(schema({}));
> write_options.basename_template = "\{i}.parquet";
> auto metadata =
> std::shared_ptr<KeyValueMetadata>(new KeyValueMetadata(\{"foo"}, \{"bar"}));
> auto dataset_schema = schema(\{field("a", int64())}, metadata);
> RecordBatchVector batches{
> ConstantArrayGenerator::Zeroes(kRowsPerBatch, dataset_schema)};
> ASSERT_EQ(0, batches[0]->column(0)->null_count());
> auto dataset = std::make_shared<InMemoryDataset>(dataset_schema, batches);
> ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
> ASSERT_OK(scanner_builder->Project(
> \{compute::call("add", {compute::field_ref("a"), compute::literal(1)})},
> \{"a_plus_one"}));
> ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
> // Before the write, the schema has the metadata info.
> ASSERT_EQ(1, dataset_schema->HasMetadata());
> ASSERT_OK(FileSystemDataset::Write(write_options, scanner));
> ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make(
> fs, \{"root/0.parquet"}, format, {}));
> ASSERT_OK_AND_ASSIGN(auto written_dataset, dataset_factory->Finish(FinishOptions{}));
> // After the write, the schema does not have the metadata info.
> ASSERT_EQ(0, written_dataset->schema()->HasMetadata());
--
This message was sent by Atlassian Jira
(v8.20.10#820010)