You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/10/28 14:19:09 UTC

[arrow-datafusion] branch master updated: Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new ca42f4cdb Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)
ca42f4cdb is described below

commit ca42f4cdb9ce3cddd46447d289b3a89824b7e8d7
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Fri Oct 28 10:19:04 2022 -0400

    Minor: Add some docstrings to `FileScanConfig` and `RuntimeEnv` (#3962)
    
    * Minor: Add some docstrings to FileScanConfig
    
    * fix grammar
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
 datafusion/core/src/execution/runtime_env.rs         | 15 +++++++++++----
 datafusion/core/src/physical_plan/file_format/mod.rs | 19 +++++++++++++++----
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs
index 72afd4327..7c3e9b4e6 100644
--- a/datafusion/core/src/execution/runtime_env.rs
+++ b/datafusion/core/src/execution/runtime_env.rs
@@ -93,10 +93,15 @@ impl RuntimeEnv {
         self.memory_manager.shrink_tracker_usage(delta)
     }
 
-    /// Registers an object store with scheme using a custom `ObjectStore` so that
-    /// an external file system or object storage system could be used against this context.
+    /// Registers a custom `ObjectStore` to be used when accessing a
+    /// specific scheme and host. This allows DataFusion to create
+    /// external tables from urls that do not have built in support
+    /// such as `hdfs://...`.
     ///
-    /// Returns the `ObjectStore` previously registered for this scheme, if any
+    /// Returns the [`ObjectStore`] previously registered for this
+    /// scheme, if any.
+    ///
+    /// See [`ObjectStoreRegistry`] for more details
     pub fn register_object_store(
         &self,
         scheme: impl AsRef<str>,
@@ -115,7 +120,9 @@ impl RuntimeEnv {
         self.table_factories.extend(table_factories)
     }
 
-    /// Retrieves a `ObjectStore` instance for a url
+    /// Retrieves an `ObjectStore` instance for a url by consulting the
+    /// registry. See [`ObjectStoreRegistry::get_by_url`] for more
+    /// details.
     pub fn object_store(&self, url: impl AsRef<Url>) -> Result<Arc<dyn ObjectStore>> {
         self.object_store_registry
             .get_by_url(url)
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs
index c33e2bc14..dbf348b09 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -74,19 +74,30 @@ lazy_static! {
 /// any given file format.
 #[derive(Debug, Clone)]
 pub struct FileScanConfig {
-    /// Object store URL
+    /// Object store URL, used to get an [`ObjectStore`] instance from
+    /// [`RuntimeEnv::object_store`]
     pub object_store_url: ObjectStoreUrl,
-    /// Schema before projection. It contains the columns that are expected
-    /// to be in the files without the table partition columns.
+    /// Schema before `projection` is applied. It contains all the columns that may
+    /// appear in the files. It does not include table partition columns
+    /// that may be added.
     pub file_schema: SchemaRef,
     /// List of files to be processed, grouped into partitions
+    ///
+    /// Each file must have a schema of `file_schema` or a subset. If
+    /// a particular file has a subset, the missing columns are
+    /// padded with NULLs.
+    ///
+    /// DataFusion may attempt to read each partition of files
+    /// concurrently, however files *within* a partition will be read
+    /// sequentially, one after the next.
     pub file_groups: Vec<Vec<PartitionedFile>>,
     /// Estimated overall statistics of the files, taking `filters` into account.
     pub statistics: Statistics,
     /// Columns on which to project the data. Indexes that are higher than the
     /// number of columns of `file_schema` refer to `table_partition_cols`.
     pub projection: Option<Vec<usize>>,
-    /// The minimum number of records required from this source plan
+    /// The maximum number of records to read from this plan. If None,
+    /// all records after filtering are returned.
     pub limit: Option<usize>,
     /// The partitioning column names
     pub table_partition_cols: Vec<String>,