You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/23 21:07:33 UTC

[GitHub] [arrow-rs] alamb commented on a change in pull request #709: Support arrow readers for strings with DELTA_BYTE_ARRAY encoding

alamb commented on a change in pull request #709:
URL: https://github.com/apache/arrow-rs/pull/709#discussion_r694308857



##########
File path: parquet/src/arrow/arrow_array_reader.rs
##########
@@ -1559,4 +1605,120 @@ mod tests {
             array_reader.get_rep_levels()
         );
     }
+
+    /// In-memory sink for writing parquet data. Intended only for use in tests.
+    #[derive(Clone)]
+    struct VecWriter {
+        data: Arc<Mutex<Cursor<Vec<u8>>>>, // behind an Arc, so every clone of the writer shares this one buffer
+    }
+
+    impl VecWriter { // construction and extraction of the underlying byte buffer
+        pub fn new() -> VecWriter { // creates a writer over an empty Vec<u8>
+            VecWriter {
+                data: Arc::new(Mutex::new(Cursor::new(Vec::new()))),
+            }
+        }
+
+        pub fn consume(self) -> Vec<u8> { // unwraps Arc -> Mutex -> Cursor and returns the inner Vec<u8>
+            Arc::try_unwrap(self.data)
+                .unwrap() // panics if any other clone of this writer is still alive
+                .into_inner()
+                .unwrap() // panics if the mutex was poisoned
+                .into_inner()
+        }
+    }
+
+    impl TryClone for VecWriter { // TryClone is implemented via the derived Clone
+        fn try_clone(&self) -> std::io::Result<Self> {
+            Ok(self.clone()) // always Ok; the clone shares this writer's buffer through the Arc
+        }
+    }
+
+    impl Seek for VecWriter { // forwards seeking to the shared Cursor
+        fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
+            self.data.lock().unwrap().seek(pos) // lock, then delegate; unwrap panics on a poisoned lock
+        }
+
+        fn stream_position(&mut self) -> std::io::Result<u64> {
+            self.data.lock().unwrap().stream_position() // lock, then delegate; unwrap panics on a poisoned lock
+        }
+    }
+
+    impl Write for VecWriter { // forwards writes to the shared Cursor
+        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+            self.data.lock().unwrap().write(buf) // lock, then delegate; unwrap panics on a poisoned lock
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.data.lock().unwrap().flush() // lock, then delegate; unwrap panics on a poisoned lock
+        }
+    }
+
+    #[test]
+    fn test_string_delta_byte_array() {
+        use crate::basic;
+        use crate::schema::types::Type;
+
+        let data = VecWriter::new();
+        let schema = Arc::new(
+            Type::group_type_builder("string_test")
+                .with_fields(&mut vec![Arc::new(
+                    Type::primitive_type_builder("c", basic::Type::BYTE_ARRAY)
+                        .with_converted_type(ConvertedType::UTF8)
+                        .build()
+                        .unwrap(),
+                )])
+                .build()
+                .unwrap(),
+        );
+        // Disable dictionary and use the fallback encoding.
+        let p = Arc::new(
+            WriterProperties::builder()
+                .set_dictionary_enabled(false)
+                .set_encoding(Encoding::DELTA_BYTE_ARRAY)
+                .build(),
+        );
+        // Write a few strings.
+        let mut w = SerializedFileWriter::new(data.clone(), schema, p).unwrap();
+        let mut rg = w.next_row_group().unwrap();
+        let mut c = rg.next_column().unwrap().unwrap();
+        match &mut c {
+            ColumnWriter::ByteArrayColumnWriter(c) => {
+                c.write_batch(
+                    &[ByteArray::from("foo"), ByteArray::from("bar")],

Review comment:
       I recommend also adding a test case here that covers a `Null` / `None` value (i.e., definition level 0).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org