You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/08/02 19:43:37 UTC

[arrow-rs] branch master updated: Improve `object_store crate` documentation (#2260)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 3e17891b6 Improve `object_store crate` documentation (#2260)
3e17891b6 is described below

commit 3e17891b6f752d8df4be7c2e586c41c7518f98ac
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Aug 2 15:43:33 2022 -0400

    Improve `object_store crate` documentation (#2260)
    
    * Improve crates.io page
    
    * Improve builder doc examples
    
    * Add examples in main library docs
    
    * Apply suggestions from code review
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
 object_store/Cargo.toml   |   4 +-
 object_store/README.md    |  17 +++++-
 object_store/src/aws.rs   |   5 +-
 object_store/src/azure.rs |   5 +-
 object_store/src/gcp.rs   |   5 +-
 object_store/src/lib.rs   | 128 +++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 147 insertions(+), 17 deletions(-)

diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml
index 741539891..b5b1ae1a3 100644
--- a/object_store/Cargo.toml
+++ b/object_store/Cargo.toml
@@ -21,7 +21,7 @@ version = "0.3.0"
 edition = "2021"
 license = "MIT/Apache-2.0"
 readme = "README.md"
-description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage"
+description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files."
 keywords = [
     "object",
     "storage",
@@ -77,4 +77,4 @@ aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "
 [dev-dependencies] # In alphabetical order
 dotenv = "0.15.0"
 tempfile = "3.1.0"
-futures-test = "0.3"
+futures-test = "0.3"
\ No newline at end of file
diff --git a/object_store/README.md b/object_store/README.md
index 313588b4a..fd10414a9 100644
--- a/object_store/README.md
+++ b/object_store/README.md
@@ -19,8 +19,21 @@
 
 # Rust Object Store
 
-A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage.
+A focused, easy to use, idiomatic, high performance, `async` object
+store library interacting with object stores.
 
-Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow.
+Using this crate, the same binary and code can easily run in multiple
+clouds and local test environments, via a simple runtime configuration
+change. Supported object stores include:
+
+* [AWS S3](https://aws.amazon.com/s3/)
+* [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/)
+* [Google Cloud Storage](https://cloud.google.com/storage)
+* Local files
+* Memory
+* Custom implementations
+
+
+Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/).
 
 See [docs.rs](https://docs.rs/object_store) for usage instructions
diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs
index cedd4651e..d59f48bce 100644
--- a/object_store/src/aws.rs
+++ b/object_store/src/aws.rs
@@ -260,7 +260,7 @@ impl From<Error> for super::Error {
     }
 }
 
-/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/).
+/// Interface for [Amazon S3](https://aws.amazon.com/s3/).
 pub struct AmazonS3 {
     /// S3 client w/o any connection limit.
     ///
@@ -599,7 +599,8 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result<Object
 /// # let BUCKET_NAME = "foo";
 /// # let ACCESS_KEY_ID = "foo";
 /// # let SECRET_KEY = "foo";
-/// let s3 = object_store::aws::AmazonS3Builder::new()
+/// # use object_store::aws::AmazonS3Builder;
+/// let s3 = AmazonS3Builder::new()
 ///  .with_region(REGION)
 ///  .with_bucket_name(BUCKET_NAME)
 ///  .with_access_key_id(ACCESS_KEY_ID)
diff --git a/object_store/src/azure.rs b/object_store/src/azure.rs
index dca52a356..0d5f2fa7d 100644
--- a/object_store/src/azure.rs
+++ b/object_store/src/azure.rs
@@ -209,7 +209,7 @@ impl From<Error> for super::Error {
     }
 }
 
-/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
+/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
 #[derive(Debug)]
 pub struct MicrosoftAzure {
     container_client: Arc<ContainerClient>,
@@ -587,7 +587,8 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result<Url> {
 /// # let ACCOUNT = "foo";
 /// # let BUCKET_NAME = "foo";
 /// # let ACCESS_KEY = "foo";
-/// let azure = object_store::azure::MicrosoftAzureBuilder::new()
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// let azure = MicrosoftAzureBuilder::new()
 ///  .with_account(ACCOUNT)
 ///  .with_access_key(ACCESS_KEY)
 ///  .with_container_name(BUCKET_NAME)
diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs
index dea8769a7..dd9c84498 100644
--- a/object_store/src/gcp.rs
+++ b/object_store/src/gcp.rs
@@ -192,7 +192,7 @@ struct CompleteMultipartUpload {
     parts: Vec<MultipartPart>,
 }
 
-/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/).
+/// Interface for [Google Cloud Storage](https://cloud.google.com/storage/).
 #[derive(Debug)]
 pub struct GoogleCloudStorage {
     client: Arc<GoogleCloudStorageClient>,
@@ -792,7 +792,8 @@ fn reader_credentials_file(
 /// ```
 /// # let BUCKET_NAME = "foo";
 /// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json";
-/// let gcs = object_store::gcp::GoogleCloudStorageBuilder::new()
+/// # use object_store::gcp::GoogleCloudStorageBuilder;
+/// let gcs = GoogleCloudStorageBuilder::new()
 ///  .with_service_account_path(SERVICE_ACCOUNT_PATH)
 ///  .with_bucket_name(BUCKET_NAME)
 ///  .build();
diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 33e8452d0..c1d7e3ebd 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -28,15 +28,129 @@
 
 //! # object_store
 //!
-//! This crate provides APIs for interacting with object storage services.
+//! This crate provides a uniform API for interacting with object storage services and
+//! local files via the [`ObjectStore`] trait.
 //!
-//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for:
+//! # Create an [`ObjectStore`] implementation:
 //!
-//! * [Google Cloud Storage](https://cloud.google.com/storage/)
-//! * [Amazon S3](https://aws.amazon.com/s3/)
-//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview)
-//! * In-memory
-//! * Local file storage
+//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)
+//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)
+//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)
+//! * In Memory: [`InMemory`](memory::InMemory)
+//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)
+//!
+//! # Adapters
+//!
+//! [`ObjectStore`] instances can be composed with various adapters
+//! which add additional functionality:
+//!
+//! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig)
+//! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore)
+//!
+//!
+//! # Listing objects:
+//!
+//! Use the [`ObjectStore::list`] method to iterate over objects in
+//! remote storage or files in the local filesystem:
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! #   LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Recursively list all files below the 'data' path.
+//! // 1. On AWS S3 this would be the 'data/' prefix
+//! // 2. On a local filesystem, this would be the 'data' directory
+//! let prefix: Path = "data".try_into().unwrap();
+//!
+//! // Get an `async` stream of Metadata objects:
+//!  let list_stream = object_store
+//!      .list(Some(&prefix))
+//!      .await
+//!      .expect("Error listing files");
+//!
+//!  // Print a line about each object based on its metadata
+//!  // using for_each from `StreamExt` trait.
+//!  list_stream
+//!      .for_each(move |meta|  {
+//!          async {
+//!              let meta = meta.expect("Error listing");
+//!              println!("Name: {}, size: {}", meta.location, meta.size);
+//!          }
+//!      })
+//!      .await;
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Name: data/file01.parquet, size: 112832
+//! Name: data/file02.parquet, size: 143119
+//! Name: data/child/file03.parquet, size: 100
+//! ...
+//! ```
+//!
+//! # Fetching objects
+//!
+//! Use the [`ObjectStore::get`] method to fetch the data bytes
+//! from remote storage or files in the local filesystem as a stream.
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! #   LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Retrieve a specific file
+//! let path: Path = "data/file01.parquet".try_into().unwrap();
+//!
+//! // fetch the bytes from object store
+//! let stream = object_store
+//!     .get(&path)
+//!     .await
+//!     .unwrap()
+//!     .into_stream();
+//!
+//! // Count the '0's using `map` from `StreamExt` trait
+//! let num_zeros = stream
+//!     .map(|bytes| {
+//!         let bytes = bytes.unwrap();
+//!         bytes.iter().filter(|b| **b == 0).count()
+//!     })
+//!     .collect::<Vec<usize>>()
+//!     .await
+//!     .into_iter()
+//!     .sum::<usize>();
+//!
+//! println!("Num zeros in {} is {}", path, num_zeros);
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Num zeros in data/file01.parquet is 657
+//! ```
 //!
 
 #[cfg(feature = "aws")]