You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/10/28 14:19:09 UTC
[arrow-datafusion] branch master updated: Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new ca42f4cdb Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)
ca42f4cdb is described below
commit ca42f4cdb9ce3cddd46447d289b3a89824b7e8d7
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Fri Oct 28 10:19:04 2022 -0400
Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)
* Minor: Add some docstrings to FileScanConfig
* fix grammar
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
datafusion/core/src/execution/runtime_env.rs | 15 +++++++++++----
datafusion/core/src/physical_plan/file_format/mod.rs | 19 +++++++++++++++----
2 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs
index 72afd4327..7c3e9b4e6 100644
--- a/datafusion/core/src/execution/runtime_env.rs
+++ b/datafusion/core/src/execution/runtime_env.rs
@@ -93,10 +93,15 @@ impl RuntimeEnv {
self.memory_manager.shrink_tracker_usage(delta)
}
- /// Registers an object store with scheme using a custom `ObjectStore` so that
- /// an external file system or object storage system could be used against this context.
+ /// Registers a custom `ObjectStore` to be used when accessing a
+ /// specific scheme and host. This allows DataFusion to create
+ /// external tables from URLs that do not have built-in support
+ /// such as `hdfs://...`.
///
- /// Returns the `ObjectStore` previously registered for this scheme, if any
+ /// Returns the [`ObjectStore`] previously registered for this
+ /// scheme, if any.
+ ///
+ /// See [`ObjectStoreRegistry`] for more details
pub fn register_object_store(
&self,
scheme: impl AsRef<str>,
@@ -115,7 +120,9 @@ impl RuntimeEnv {
self.table_factories.extend(table_factories)
}
- /// Retrieves a `ObjectStore` instance for a url
+ /// Retrieves an `ObjectStore` instance for a URL by consulting the
+ /// registry. See [`ObjectStoreRegistry::get_by_url`] for more
+ /// details.
pub fn object_store(&self, url: impl AsRef<Url>) -> Result<Arc<dyn ObjectStore>> {
self.object_store_registry
.get_by_url(url)
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs
index c33e2bc14..dbf348b09 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -74,19 +74,30 @@ lazy_static! {
/// any given file format.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
- /// Object store URL
+ /// Object store URL, used to get an [`ObjectStore`] instance from
+ /// [`RuntimeEnv::object_store`]
pub object_store_url: ObjectStoreUrl,
- /// Schema before projection. It contains the columns that are expected
- /// to be in the files without the table partition columns.
+ /// Schema before `projection` is applied. It contains all columns that may
+ /// appear in the files. It does not include table partition columns
+ /// that may be added.
pub file_schema: SchemaRef,
/// List of files to be processed, grouped into partitions
+ ///
+ /// Each file must have a schema of `file_schema` or a subset. If
+ /// a particular file has a subset, the missing columns are
+ /// padded with NULLs.
+ ///
+ /// DataFusion may attempt to read each partition of files
+ /// concurrently, however files *within* a partition will be read
+ /// sequentially, one after the next.
pub file_groups: Vec<Vec<PartitionedFile>>,
/// Estimated overall statistics of the files, taking `filters` into account.
pub statistics: Statistics,
/// Columns on which to project the data. Indexes that are higher than the
/// number of columns of `file_schema` refer to `table_partition_cols`.
pub projection: Option<Vec<usize>>,
- /// The minimum number of records required from this source plan
+ /// The maximum number of records to read from this plan. If None,
+ /// all records after filtering are returned.
pub limit: Option<usize>,
/// The partitioning column names
pub table_partition_cols: Vec<String>,