Posted to jira@arrow.apache.org by "Qingxiang Chen (Jira)" <ji...@apache.org> on 2021/12/06 09:49:00 UTC

[jira] [Updated] (ARROW-14987) [C++] Memory leak while reading parquet file

     [ https://issues.apache.org/jira/browse/ARROW-14987?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Qingxiang Chen updated ARROW-14987:
-----------------------------------
    Description: 
When I used Parquet to read data, I found that memory usage remained high after the function returned. I reproduced the problem with the example code below:

 
{code:c++}
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
#include <unistd.h>
#include <iostream>

std::shared_ptr<arrow::Table> generate_table() {
  arrow::Int64Builder i64builder;
  for (int i = 0; i < 320000; i++) {
    PARQUET_THROW_NOT_OK(i64builder.Append(i));
  }
  std::shared_ptr<arrow::Array> i64array;
  PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));

  std::shared_ptr<arrow::Schema> schema = arrow::schema(
      {arrow::field("int", arrow::int64())});

  return arrow::Table::Make(schema, {i64array});
}

void write_parquet_file(const arrow::Table& table) {
  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  PARQUET_ASSIGN_OR_THROW(
      outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet"));
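  // Note: the last argument to WriteTable below is chunk_size (rows per row
  // group); a chunk size of 3 splits the 320000-row table into >100000 row groups.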
  PARQUET_THROW_NOT_OK(
      parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
}

void read_whole_file() {
  std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile,
                          arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
                                                        arrow::default_memory_pool()));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Table> table;
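  // ReadTable decodes all row groups and materializes the entire file in memory.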
  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
            << " columns." << std::endl;
}

int main(int argc, char** argv) {
  std::shared_ptr<arrow::Table> table = generate_table();
  write_parquet_file(*table);
  std::cout << "start " <<std::endl;
  read_whole_file();
  std::cout << "end " <<std::endl;
  sleep(100);
}

{code}
After read_whole_file() returns, while the program sleeps, memory usage stays above 100 MB and does not drop. When I increase the data volume 5x, memory usage grows to about 500 MB and likewise does not drop.
I would like to know whether this memory is being retained by the memory pool, or whether it is a memory leak.
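One way I can try to tell these apart (a minimal diagnostic sketch; I am assuming the arrow::MemoryPool accessors bytes_allocated(), max_memory(), and backend_name() from the 6.x headers) is to print the default pool's statistics after read_whole_file() returns. If bytes_allocated() is back near zero while resident memory stays high, the pages are presumably being held by the allocator backend rather than leaked:

{code:c++}
#include <arrow/api.h>
#include <iostream>

// Diagnostic sketch: report what the default memory pool still accounts for.
// A near-zero bytes_allocated() combined with high RSS would point at
// allocator-level caching rather than a leak in Arrow itself.
void report_pool_usage() {
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  std::cout << "pool backend: " << pool->backend_name() << std::endl;
  std::cout << "bytes currently allocated: " << pool->bytes_allocated() << std::endl;
  std::cout << "peak bytes allocated: " << pool->max_memory() << std::endl;
}
{code}
If the backend turns out to be jemalloc, I understand Arrow also exposes arrow::jemalloc_set_decay_ms() (declared in arrow/memory_pool.h) to control how quickly unused pages are returned to the OS; calling it with a small value before the read could show whether the retained memory is just allocator caching.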

> [C++] Memory leak while reading parquet file
> --------------------------------------------
>
>                 Key: ARROW-14987
>                 URL: https://issues.apache.org/jira/browse/ARROW-14987
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>    Affects Versions: 6.0.1
>            Reporter: Qingxiang Chen
>            Priority: Major
>



--
This message was sent by Atlassian Jira
(v8.20.1#820001)