You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/08/02 19:43:37 UTC
[arrow-rs] branch master updated: Improve `object_store crate` documentation (#2260)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3e17891b6 Improve `object_store crate` documentation (#2260)
3e17891b6 is described below
commit 3e17891b6f752d8df4be7c2e586c41c7518f98ac
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Aug 2 15:43:33 2022 -0400
Improve `object_store crate` documentation (#2260)
* Improve crates.io page
* Improve builder doc examples
* Add examples in main library docs
* Apply suggestions from code review
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
object_store/Cargo.toml | 4 +-
object_store/README.md | 17 +++++-
object_store/src/aws.rs | 5 +-
object_store/src/azure.rs | 5 +-
object_store/src/gcp.rs | 5 +-
object_store/src/lib.rs | 128 +++++++++++++++++++++++++++++++++++++++++++---
6 files changed, 147 insertions(+), 17 deletions(-)
diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml
index 741539891..b5b1ae1a3 100644
--- a/object_store/Cargo.toml
+++ b/object_store/Cargo.toml
@@ -21,7 +21,7 @@ version = "0.3.0"
edition = "2021"
license = "MIT/Apache-2.0"
readme = "README.md"
-description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage"
+description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files."
keywords = [
"object",
"storage",
@@ -77,4 +77,4 @@ aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "
[dev-dependencies] # In alphabetical order
dotenv = "0.15.0"
tempfile = "3.1.0"
-futures-test = "0.3"
+futures-test = "0.3"
\ No newline at end of file
diff --git a/object_store/README.md b/object_store/README.md
index 313588b4a..fd10414a9 100644
--- a/object_store/README.md
+++ b/object_store/README.md
@@ -19,8 +19,21 @@
# Rust Object Store
-A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage.
+A focused, easy to use, idiomatic, high performance, `async` object
+store library for interacting with object stores.
-Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow.
+Using this crate, the same binary and code can easily run in multiple
+clouds and local test environments, via a simple runtime configuration
+change. Supported object stores include:
+
+* [AWS S3](https://aws.amazon.com/s3/)
+* [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/)
+* [Google Cloud Storage](https://cloud.google.com/storage)
+* Local files
+* Memory
+* Custom implementations
+
+
+Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/).
See [docs.rs](https://docs.rs/object_store) for usage instructions
diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs
index cedd4651e..d59f48bce 100644
--- a/object_store/src/aws.rs
+++ b/object_store/src/aws.rs
@@ -260,7 +260,7 @@ impl From<Error> for super::Error {
}
}
-/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/).
+/// Interface for [Amazon S3](https://aws.amazon.com/s3/).
pub struct AmazonS3 {
/// S3 client w/o any connection limit.
///
@@ -599,7 +599,8 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result<Object
/// # let BUCKET_NAME = "foo";
/// # let ACCESS_KEY_ID = "foo";
/// # let SECRET_KEY = "foo";
-/// let s3 = object_store::aws::AmazonS3Builder::new()
+/// # use object_store::aws::AmazonS3Builder;
+/// let s3 = AmazonS3Builder::new()
/// .with_region(REGION)
/// .with_bucket_name(BUCKET_NAME)
/// .with_access_key_id(ACCESS_KEY_ID)
diff --git a/object_store/src/azure.rs b/object_store/src/azure.rs
index dca52a356..0d5f2fa7d 100644
--- a/object_store/src/azure.rs
+++ b/object_store/src/azure.rs
@@ -209,7 +209,7 @@ impl From<Error> for super::Error {
}
}
-/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
+/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
#[derive(Debug)]
pub struct MicrosoftAzure {
container_client: Arc<ContainerClient>,
@@ -587,7 +587,8 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result<Url> {
/// # let ACCOUNT = "foo";
/// # let BUCKET_NAME = "foo";
/// # let ACCESS_KEY = "foo";
-/// let azure = object_store::azure::MicrosoftAzureBuilder::new()
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// let azure = MicrosoftAzureBuilder::new()
/// .with_account(ACCOUNT)
/// .with_access_key(ACCESS_KEY)
/// .with_container_name(BUCKET_NAME)
diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs
index dea8769a7..dd9c84498 100644
--- a/object_store/src/gcp.rs
+++ b/object_store/src/gcp.rs
@@ -192,7 +192,7 @@ struct CompleteMultipartUpload {
parts: Vec<MultipartPart>,
}
-/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/).
+/// Interface for [Google Cloud Storage](https://cloud.google.com/storage/).
#[derive(Debug)]
pub struct GoogleCloudStorage {
client: Arc<GoogleCloudStorageClient>,
@@ -792,7 +792,8 @@ fn reader_credentials_file(
/// ```
/// # let BUCKET_NAME = "foo";
/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json";
-/// let gcs = object_store::gcp::GoogleCloudStorageBuilder::new()
+/// # use object_store::gcp::GoogleCloudStorageBuilder;
+/// let gcs = GoogleCloudStorageBuilder::new()
/// .with_service_account_path(SERVICE_ACCOUNT_PATH)
/// .with_bucket_name(BUCKET_NAME)
/// .build();
diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 33e8452d0..c1d7e3ebd 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -28,15 +28,129 @@
//! # object_store
//!
-//! This crate provides APIs for interacting with object storage services.
+//! This crate provides a uniform API for interacting with object storage services and
+//! local files via the [`ObjectStore`] trait.
//!
-//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for:
+//! # Create an [`ObjectStore`] implementation:
//!
-//! * [Google Cloud Storage](https://cloud.google.com/storage/)
-//! * [Amazon S3](https://aws.amazon.com/s3/)
-//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview)
-//! * In-memory
-//! * Local file storage
+//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)
+//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)
+//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)
+//! * In Memory: [`InMemory`](memory::InMemory)
+//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)
+//!
+//! # Adapters
+//!
+//! [`ObjectStore`] instances can be composed with various adapters
+//! which add additional functionality:
+//!
+//! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig)
+//! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore)
+//!
+//!
+//! # Listing objects:
+//!
+//! Use the [`ObjectStore::list`] method to iterate over objects in
+//! remote storage or files in the local filesystem:
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! # LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Recursively list all files below the 'data' path.
+//! // 1. On AWS S3 this would be the 'data/' prefix
+//! // 2. On a local filesystem, this would be the 'data' directory
+//! let prefix: Path = "data".try_into().unwrap();
+//!
+//! // Get an `async` stream of Metadata objects:
+//! let list_stream = object_store
+//! .list(Some(&prefix))
+//! .await
+//! .expect("Error listing files");
+//!
+//! // Print a line about each object based on its metadata
+//! // using for_each from `StreamExt` trait.
+//! list_stream
+//! .for_each(move |meta| {
+//! async {
+//! let meta = meta.expect("Error listing");
+//! println!("Name: {}, size: {}", meta.location, meta.size);
+//! }
+//! })
+//! .await;
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Name: data/file01.parquet, size: 112832
+//! Name: data/file02.parquet, size: 143119
+//! Name: data/child/file03.parquet, size: 100
+//! ...
+//! ```
+//!
+//! # Fetching objects
+//!
+//! Use the [`ObjectStore::get`] method to fetch the data bytes
+//! from remote storage or files in the local filesystem as a stream.
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! # LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Retrieve a specific file
+//! let path: Path = "data/file01.parquet".try_into().unwrap();
+//!
+//! // fetch the bytes from object store
+//! let stream = object_store
+//! .get(&path)
+//! .await
+//! .unwrap()
+//! .into_stream();
+//!
+//! // Count the '0's using `map` from `StreamExt` trait
+//! let num_zeros = stream
+//! .map(|bytes| {
+//! let bytes = bytes.unwrap();
+//! bytes.iter().filter(|b| **b == 0).count()
+//! })
+//! .collect::<Vec<usize>>()
+//! .await
+//! .into_iter()
+//! .sum::<usize>();
+//!
+//! println!("Num zeros in {} is {}", path, num_zeros);
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Num zeros in data/file01.parquet is 657
+//! ```
//!
#[cfg(feature = "aws")]