You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ji...@apache.org on 2022/11/13 13:25:15 UTC

[arrow-rs] branch add-bloom-filter-2 created (now d3d407b29)

This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a change to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


      at d3d407b29 add api

This branch includes the following new commits:

     new 5e200d981 add feature flag
     new d3d407b29 add api

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[arrow-rs] 02/02: add api

Posted by ji...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit d3d407b293091bd71c04f865b0c7c896ac52d452
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Sun Nov 13 13:24:10 2022 +0000

    add api
---
 parquet/src/file/reader.rs            |  6 ++++++
 parquet/src/file/serialized_reader.rs | 15 +++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 70ff37a41..325944c21 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -21,6 +21,8 @@
 use bytes::Bytes;
 use std::{boxed::Box, io::Read, sync::Arc};
 
+#[cfg(feature = "bloom")]
+use crate::bloom_filter::Sbbf;
 use crate::column::page::PageIterator;
 use crate::column::{page::PageReader, reader::ColumnReader};
 use crate::errors::{ParquetError, Result};
@@ -143,6 +145,10 @@ pub trait RowGroupReader: Send + Sync {
         Ok(col_reader)
     }
 
+    #[cfg(feature = "bloom")]
+    /// Get bloom filter for the `i`th column chunk, if present.
+    fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>>;
+
     /// Get iterator of `Row`s from this row group.
     ///
     /// Projected schema can be a subset of or equal to the file schema, when it is None,
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index a400d4dab..8cefe1c5e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -22,11 +22,9 @@ use std::collections::VecDeque;
 use std::io::Cursor;
 use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc};
 
-use crate::format::{PageHeader, PageLocation, PageType};
-use bytes::{Buf, Bytes};
-use thrift::protocol::TCompactInputProtocol;
-
 use crate::basic::{Encoding, Type};
+#[cfg(feature = "bloom")]
+use crate::bloom_filter::Sbbf;
 use crate::column::page::{Page, PageMetadata, PageReader};
 use crate::compression::{create_codec, Codec};
 use crate::errors::{ParquetError, Result};
@@ -38,10 +36,13 @@ use crate::file::{
     reader::*,
     statistics,
 };
+use crate::format::{PageHeader, PageLocation, PageType};
 use crate::record::reader::RowIter;
 use crate::record::Row;
 use crate::schema::types::Type as SchemaType;
 use crate::util::{io::TryClone, memory::ByteBufferPtr};
+use bytes::{Buf, Bytes};
+use thrift::protocol::TCompactInputProtocol;
 // export `SliceableCursor` and `FileSource` publically so clients can
 // re-use the logic in their own ParquetFileWriter wrappers
 pub use crate::util::io::FileSource;
@@ -387,6 +388,12 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'
         )?))
     }
 
+    #[cfg(feature = "bloom")]
+    /// Get the bloom filter for the `i`th column chunk, if present.
+    fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>> {
+        todo!()
+    }
+
     fn get_row_iter(&self, projection: Option<SchemaType>) -> Result<RowIter> {
         RowIter::from_row_group(projection, self)
     }


[arrow-rs] 01/02: add feature flag

Posted by ji...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 5e200d9819669175f3ae2a3a3de384541fec9056
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Sun Nov 13 13:13:05 2022 +0000

    add feature flag
---
 .github/workflows/arrow.yml | 2 --
 parquet/README.md           | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index 2e1c64ebe..3e62ed775 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -39,7 +39,6 @@ on:
       - .github/**
 
 jobs:
-
   # test the crate
   linux-test:
     name: Test
@@ -134,7 +133,6 @@ jobs:
       - name: Check compilation --features simd --all-targets
         run: cargo check -p arrow --features simd --all-targets
 
-
   # test the arrow crate builds against wasm32 in nightly rust
   wasm32-build:
     name: Build wasm32
diff --git a/parquet/README.md b/parquet/README.md
index d904fc64e..c9245b082 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -41,6 +41,7 @@ However, for historical reasons, this crate uses versions with major numbers gre
 The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`:
 
 - `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet
+- `bloom` (default) - support for reading / writing [split block bloom filters](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) from / to parquet
 - `async` - support `async` APIs for reading parquet
 - `json` - support for reading / writing `json` data to / from parquet
 - `brotli` (default) - support for parquet using `brotli` compression