You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Andy Grove (Jira)" <ji...@apache.org> on 2020/12/15 15:47:00 UTC

[jira] [Created] (ARROW-10920) [Rust] Segmentation fault in Arrow Parquet writer with huge arrays

Andy Grove created ARROW-10920:
----------------------------------

             Summary: [Rust] Segmentation fault in Arrow Parquet writer with huge arrays
                 Key: ARROW-10920
                 URL: https://issues.apache.org/jira/browse/ARROW-10920
             Project: Apache Arrow
          Issue Type: Bug
          Components: Rust
            Reporter: Andy Grove


I stumbled across this by chance. I am not too surprised that this fails, but I would expect it to fail gracefully and not with a segmentation fault.

 
{code:java}
 use std::fs::File;
use std::sync::Arc;

use arrow::array::StringBuilder;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;

use parquet::arrow::ArrowWriter;

/// Reproducer for the crash reported in this issue.
///
/// Builds one `RecordBatch` with two `Utf8` columns of 2,500,000 rows each and
/// writes it to `/tmp/test.parquet` through `ArrowWriter`. According to the
/// output captured below the program, the last message printed before the
/// segmentation fault is "Writing RecordBatch", i.e. the crash happens during
/// `writer.write(&batch)`.
///
/// Arrow-side failures are propagated with `?`; the Parquet writer calls use
/// `unwrap()`, so a Parquet error would surface as a panic rather than a
/// returned `Err`.
fn main() -> Result<()> {
    // Two string columns: "c0" is declared with `false` and "c1" with `true`
    // as the third `Field::new` argument (the nullable flag).
    let schema = Schema::new(vec![
        Field::new("c0", DataType::Utf8, false),
        Field::new("c1", DataType::Utf8, true),
    ]);
    // Number of rows appended to each column.
    let batch_size = 2500000;
    // Each c1 value is the matching c0 value repeated this many times.
    let repeat_count = 140;
    let file = File::create("/tmp/test.parquet")?;
    // NOTE(review): the trailing `None` is presumably "use default writer
    // properties" -- confirm against the `ArrowWriter::try_new` signature.
    let mut writer = ArrowWriter::try_new(file, Arc::new(schema.clone()), None).unwrap();
    // NOTE(review): `batch_size` is presumably an initial capacity hint for
    // the builders -- confirm against `StringBuilder::new`.
    let mut c0_builder = StringBuilder::new(batch_size);
    let mut c1_builder = StringBuilder::new(batch_size);

    println!("Start of loop");
    for i in 0..batch_size {
        // c0: the row index zero-padded to 32 characters (32 bytes per row,
        // 2,500,000 * 32 = 80,000,000 bytes in total).
        let c0_value = format!("{:032}", i);
        // c1: 32 * 140 = 4,480 bytes per row, i.e. 2,500,000 * 4,480 =
        // 11,200,000,000 bytes (~11.2 GB) of string data in a single array.
        // NOTE(review): that is far beyond what a signed 32-bit offset can
        // address (2^31 - 1 bytes); if `Utf8` arrays use 32-bit offsets this
        // is the likely trigger for the crash -- confirm.
        let c1_value = c0_value.repeat(repeat_count);
        c0_builder.append_value(&c0_value)?;
        c1_builder.append_value(&c1_value)?;
    }

    println!("Finish building c0");
    let c0 = Arc::new(c0_builder.finish());

    println!("Finish building c1");
    let c1 = Arc::new(c1_builder.finish());

    // Both arrays are placed in a single batch; nothing is split into smaller
    // chunks before being handed to the writer.
    println!("Creating RecordBatch");
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![c0, c1])?;

    // write the batch to parquet
    // This is the call after which the reported output ends with
    // "Segmentation fault (core dumped)".
    println!("Writing RecordBatch");
    writer.write(&batch).unwrap();

    // Not reached in the reported run ("Closing writer" never appears in the
    // captured output).
    println!("Closing writer");
    writer.close().unwrap();

    Ok(())
}
{code}
output:
{code:java}
Start of loop
Finish building c0
Finish building c1
Creating RecordBatch
Writing RecordBatch
Segmentation fault (core dumped)
 {code}
 



--
This message was sent by Atlassian Jira
(v8.3.4#803005)