You are viewing a plain text version of this content. The canonical link for it was omitted in this plain-text rendering.
Posted to commits@arrow.apache.org by ji...@apache.org on 2022/11/13 13:25:15 UTC
[arrow-rs] branch add-bloom-filter-2 created (now d3d407b29)
This is an automated email from the ASF dual-hosted git repository.
jiayuliu pushed a change to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
at d3d407b29 add api
This branch includes the following new commits:
new 5e200d981 add feature flag
new d3d407b29 add api
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
[arrow-rs] 02/02: add api
Posted by ji...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
jiayuliu pushed a commit to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit d3d407b293091bd71c04f865b0c7c896ac52d452
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Sun Nov 13 13:24:10 2022 +0000
add api
---
parquet/src/file/reader.rs | 6 ++++++
parquet/src/file/serialized_reader.rs | 15 +++++++++++----
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 70ff37a41..325944c21 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -21,6 +21,8 @@
use bytes::Bytes;
use std::{boxed::Box, io::Read, sync::Arc};
+#[cfg(feature = "bloom")]
+use crate::bloom_filter::Sbbf;
use crate::column::page::PageIterator;
use crate::column::{page::PageReader, reader::ColumnReader};
use crate::errors::{ParquetError, Result};
@@ -143,6 +145,10 @@ pub trait RowGroupReader: Send + Sync {
Ok(col_reader)
}
+ #[cfg(feature = "bloom")]
+ /// Get bloom filter for the `i`th column chunk, if present.
+ fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>>;
+
/// Get iterator of `Row`s from this row group.
///
/// Projected schema can be a subset of or equal to the file schema, when it is None,
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index a400d4dab..8cefe1c5e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -22,11 +22,9 @@ use std::collections::VecDeque;
use std::io::Cursor;
use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc};
-use crate::format::{PageHeader, PageLocation, PageType};
-use bytes::{Buf, Bytes};
-use thrift::protocol::TCompactInputProtocol;
-
use crate::basic::{Encoding, Type};
+#[cfg(feature = "bloom")]
+use crate::bloom_filter::Sbbf;
use crate::column::page::{Page, PageMetadata, PageReader};
use crate::compression::{create_codec, Codec};
use crate::errors::{ParquetError, Result};
@@ -38,10 +36,13 @@ use crate::file::{
reader::*,
statistics,
};
+use crate::format::{PageHeader, PageLocation, PageType};
use crate::record::reader::RowIter;
use crate::record::Row;
use crate::schema::types::Type as SchemaType;
use crate::util::{io::TryClone, memory::ByteBufferPtr};
+use bytes::{Buf, Bytes};
+use thrift::protocol::TCompactInputProtocol;
// export `SliceableCursor` and `FileSource` publically so clients can
// re-use the logic in their own ParquetFileWriter wrappers
pub use crate::util::io::FileSource;
@@ -387,6 +388,12 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'
)?))
}
+ #[cfg(feature = "bloom")]
+ /// get bloom filter for the ith column
+ fn get_column_bloom_filter(&self, i: usize) -> Result<Option<Sbbf>> {
+ todo!()
+ }
+
fn get_row_iter(&self, projection: Option<SchemaType>) -> Result<RowIter> {
RowIter::from_row_group(projection, self)
}
[arrow-rs] 01/02: add feature flag
Posted by ji...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
jiayuliu pushed a commit to branch add-bloom-filter-2
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 5e200d9819669175f3ae2a3a3de384541fec9056
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Sun Nov 13 13:13:05 2022 +0000
add feature flag
---
.github/workflows/arrow.yml | 2 --
parquet/README.md | 1 +
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index 2e1c64ebe..3e62ed775 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -39,7 +39,6 @@ on:
- .github/**
jobs:
-
# test the crate
linux-test:
name: Test
@@ -134,7 +133,6 @@ jobs:
- name: Check compilation --features simd --all-targets
run: cargo check -p arrow --features simd --all-targets
-
# test the arrow crate builds against wasm32 in nightly rust
wasm32-build:
name: Build wasm32
diff --git a/parquet/README.md b/parquet/README.md
index d904fc64e..c9245b082 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -41,6 +41,7 @@ However, for historical reasons, this crate uses versions with major numbers gre
The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`:
- `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet
+- `bloom` (default) - support for [split block bloom filter](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) for reading from / writing to parquet
- `async` - support `async` APIs for reading parquet
- `json` - support for reading / writing `json` data to / from parquet
- `brotli` (default) - support for parquet using `brotli` compression