You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/06/16 03:44:18 UTC

[GitHub] [arrow-rs] liyongjing opened a new issue, #1886: how read/write REPEATED

liyongjing opened a new issue, #1886:
URL: https://github.com/apache/arrow-rs/issues/1886

   **Which part is this question about**
   ```
   use std::{fs::File, path::Path, sync::Arc};
   
   use parquet::{
       basic::Compression,
       data_type::{ByteArray, ByteArrayType, Int32Type},
       file::{
           properties::{WriterProperties, WriterVersion},
           reader::FileReader,
           serialized_reader::SerializedFileReader,
           writer::SerializedFileWriter,
       },
       record::{Row, RowAccessor},
       schema::parser::parse_message_type,
   };
   
   /// Parquet message schema for the sample file, in the textual form accepted by
   /// `parse_message_type`: one OPTIONAL INT32 field and one REPEATED BYTE_ARRAY field.
   const MESSAGE_TYPE: &str = "
   message Log {
     OPTIONAL INT32 eventType;
     REPEATED BYTE_ARRAY category;
   }
   ";
   
   /// One logical log record, in row form, before `data()` flattens it into columns.
   pub struct Item {
       /// Event type code; `data()` pushes it onto `Batch::event_types`.
       pub event_type: i32,
       /// Category labels of this record; `data()` converts each one to a `ByteArray`.
       pub category: Vec<String>,
   }
   
   /// Column-oriented view of a set of [`Item`]s, ready to hand to the parquet column writers.
   ///
   /// Besides the flattened values, the `category` column carries definition and repetition
   /// levels: without them the writer cannot tell where one record's list ends and the
   /// next one begins.
   pub struct Batch {
       /// One value per record for the `eventType` column.
       pub event_types: Vec<i32>,
       /// Category values of all records, flattened in record order.
       pub categories: Vec<ByteArray>,
       /// Definition level per `category` slot: 1 = a value is present,
       /// 0 = the record has an empty list (no value is stored for that slot).
       pub category_def_levels: Vec<i16>,
       /// Repetition level per `category` slot: 0 = first slot of a new record,
       /// 1 = another value of the same record.
       pub category_rep_levels: Vec<i16>,
   }
   
   /// Builds the sample [`Batch`] from two hard-coded [`Item`]s.
   ///
   /// Every record contributes one `event_types` entry and at least one level slot for
   /// `category`, so both columns describe the same number of rows.
   fn data() -> Batch {
       let items = vec![
           Item {
               event_type: 1,
               category: vec!["test11".to_string(), "test12".to_string()],
           },
           Item {
               event_type: 2,
               category: vec!["test21".to_string(), "test22".to_string()],
           },
       ];
       let mut b = Batch {
           event_types: vec![],
           categories: vec![],
           category_def_levels: vec![],
           category_rep_levels: vec![],
       };
   
       for item in &items {
           b.event_types.push(item.event_type);
   
           if item.category.is_empty() {
               // An empty list still occupies one slot so the row is counted:
               // definition level 0 (nothing defined), repetition level 0 (new record).
               b.category_def_levels.push(0);
               b.category_rep_levels.push(0);
               continue;
           }
   
           for (pos, cate) in item.category.iter().enumerate() {
               b.categories.push(ByteArray::from(cate.as_str()));
               b.category_def_levels.push(1);
               // Only the first value of a record starts a new row.
               b.category_rep_levels.push(if pos == 0 { 0 } else { 1 });
           }
       }
       b
   }
   
   /// Writes the sample batch to `sample.parquet` as a single row group.
   ///
   /// Neither column is REQUIRED, so each has a maximum definition level of 1 and
   /// `write_batch` needs explicit definition levels; the REPEATED column additionally
   /// needs repetition levels. Passing `None` for them makes `write_batch` return an error.
   ///
   /// # Panics
   ///
   /// Panics if the file cannot be created or any parquet writer call fails.
   fn write() {
       let path = Path::new("sample.parquet");
       let file = File::create(path).unwrap();
       let schema = Arc::new(parse_message_type(MESSAGE_TYPE).unwrap());
   
       let props = Arc::new(
           WriterProperties::builder()
               .set_compression(Compression::SNAPPY)
               .set_writer_version(WriterVersion::PARQUET_2_0)
               .build(),
       );
   
       let mut writer = SerializedFileWriter::new(file, schema, props).unwrap();
       let mut row_group_writer = writer.next_row_group().unwrap();
   
       let batch = data();
   
       // column 0: OPTIONAL INT32 eventType.
       // Every record in `batch` has a value, so every definition level is 1.
       let event_type_def_levels = vec![1i16; batch.event_types.len()];
       let mut col_writer = row_group_writer
           .next_column()
           .expect("next column")
           .unwrap();
       col_writer
           .typed::<Int32Type>()
           .write_batch(
               &batch.event_types,
               Some(event_type_def_levels.as_slice()),
               None,
           )
           .expect("writing column");
       col_writer.close().expect("close column");
   
       // column 1: REPEATED BYTE_ARRAY category.
       // Values are flattened; the levels tell the writer how to split them into rows.
       let mut col_writer = row_group_writer
           .next_column()
           .expect("next column")
           .unwrap();
       col_writer
           .typed::<ByteArrayType>()
           .write_batch(
               &batch.categories,
               Some(batch.category_def_levels.as_slice()),
               Some(batch.category_rep_levels.as_slice()),
           )
           .expect("writing column");
       col_writer.close().expect("close column");
   
       let rg_md = row_group_writer.close().expect("close row group");
       println!("total rows written: {}", rg_md.num_rows());
   
       writer.close().unwrap();
   }
   
   /// Opens `sample.parquet` and walks it row by row, printing the categories of each
   /// row (via `read_category`) followed by its event type.
   ///
   /// # Panics
   ///
   /// Panics if the file cannot be opened or parsed, or if column 0 of a row cannot be
   /// read as an `INT32`.
   fn read() {
       let input = File::open(Path::new("sample.parquet")).expect("Unable to open file");
       let parquet_reader = SerializedFileReader::new(input).expect("Unable to read file");
   
       let rows = parquet_reader.get_row_iter(None).expect("get iterator");
       for row in rows {
           // Column 0 is read first so a bad row fails before anything is printed.
           let event_type = row.get_int(0).unwrap();
           read_category(&row, 1);
           println!("event_type{}", event_type);
       }
   }
   
   // public static List<String> getCategory(Group value) {
   //     List<String> categoryList = new ArrayList<>();
   //     try {
   //       int count = value.getFieldRepetitionCount("category");
   //       if (count > 0) {
   //         int index = 0;
   //         while (index < count) {
   //           categoryList.add(value.getString("category", index++).trim());
   //         }
   //       }
   //     } catch (Exception e) {
   //     }
   //     return categoryList;
   //   }
   fn read_category(record: &Row, i: usize) {
       // where is getFieldRepetitionCount
       match record.get_bytes(i) {
           Ok(v) => println!("{:?}", v.as_utf8()),
           Err(_) => {}
       };
   }
   ```
   
   **Describe your question**
   How do I read and write the REPEATED `category` column using the parquet crate?
   
   **Additional context**
   Add any other context about the problem here.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on issue #1886: how read/write REPEATED

Posted by GitBox <gi...@apache.org>.
tustvold commented on issue #1886:
URL: https://github.com/apache/arrow-rs/issues/1886#issuecomment-1159371207

   Hi, I'm not very familiar with parquet-mr which your example appears to be based on, nor am I hugely knowledgeable about the record APIs for reading parquet, but I'll try to help out here 😅
   
   Perusing the docs it would appear you can use https://docs.rs/parquet/latest/parquet/file/reader/trait.FileReader.html#tymethod.get_row_iter to get a row iterator, and then call https://docs.rs/parquet/latest/parquet/record/trait.RowAccessor.html#tymethod.get_list on the row.
   
   FWIW I would strongly encourage you to consider trying out the arrow interface, it should be faster, better tested and better documented than the record APIs which are somewhat orphaned at the moment...


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] liyongjing closed issue #1886: how read/write REPEATED

Posted by GitBox <gi...@apache.org>.
liyongjing closed issue #1886: how read/write REPEATED
URL: https://github.com/apache/arrow-rs/issues/1886


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org