You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/04/26 15:15:37 UTC

[arrow-datafusion] branch main updated: chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a38480951f chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)
a38480951f is described below

commit a38480951f40abce7ee2d5919251a1d1607f1dee
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Apr 26 11:15:31 2023 -0400

    chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)
    
    * chore: Update api docs for `SessionContext`, `TaskContext`, etc
    
    * clarify RuntimeEnv resource enforcement
---
 datafusion/core/src/execution/context.rs        | 63 +++++++++++++++++++------
 datafusion/core/src/execution/mod.rs            | 25 +---------
 datafusion/execution/src/runtime_env.rs         | 14 ++++--
 datafusion/execution/src/task.rs                |  7 ++-
 datafusion/physical-expr/src/execution_props.rs | 10 ++--
 5 files changed, 73 insertions(+), 46 deletions(-)

diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 265595c59d..50aea80fc9 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! SessionContext contains methods for registering data sources and executing queries
+//! [`SessionContext`] contains methods for registering data sources and executing queries
 use crate::{
     catalog::catalog::{CatalogList, MemoryCatalogList},
     datasource::{
@@ -158,11 +158,15 @@ where
     }
 }
 
-/// SessionContext is the main interface for executing queries with DataFusion. It stands for
-/// the connection between user and DataFusion/Ballista cluster.
-/// The context provides the following functionality
+/// Main interface for executing queries with DataFusion. Maintains
+/// the state of the connection between a user and an instance of the
+/// DataFusion engine.
 ///
-/// * Create DataFrame from a CSV or Parquet data source.
+/// # Overview
+///
+/// [`SessionContext`] provides the following functionality:
+///
+/// * Create a DataFrame from a CSV or Parquet data source.
 /// * Register a CSV or Parquet data source as a table that can be referenced from a SQL query.
 /// * Register a custom data source that can be referenced from a SQL query.
 /// * Execution a SQL query
@@ -199,6 +203,20 @@ where
 /// # Ok(())
 /// # }
 /// ```
+///
+/// # `SessionContext`, `SessionState`, and `TaskContext`
+///
+/// A [`SessionContext`] can be created from a [`SessionConfig`] and
+/// stores the state for a particular query session. A single
+/// [`SessionContext`] can run multiple queries.
+///
+/// [`SessionState`] contains information available during query
+/// planning (creating [`LogicalPlan`]s and [`ExecutionPlan`]s).
+///
+/// [`TaskContext`] contains the state available during query
+/// execution [`ExecutionPlan::execute`].  It contains a subset of the
+/// information in [`SessionState`] and is created from a
+/// [`SessionContext`] or a [`SessionState`].
 #[derive(Clone)]
 pub struct SessionContext {
     /// UUID for the session
@@ -216,7 +234,7 @@ impl Default for SessionContext {
 }
 
 impl SessionContext {
-    /// Creates a new execution context using a default session configuration.
+    /// Creates a new `SessionContext` using the default [`SessionConfig`].
     pub fn new() -> Self {
         Self::with_config(SessionConfig::new())
     }
@@ -241,19 +259,35 @@ impl SessionContext {
         Ok(())
     }
 
-    /// Creates a new session context using the provided session configuration.
+    /// Creates a new `SessionContext` using the provided
+    /// [`SessionConfig`] and a new [`RuntimeEnv`].
+    ///
+    /// See [`Self::with_config_rt`] for more details on resource
+    /// limits.
     pub fn with_config(config: SessionConfig) -> Self {
         let runtime = Arc::new(RuntimeEnv::default());
         Self::with_config_rt(config, runtime)
     }
 
-    /// Creates a new session context using the provided configuration and [`RuntimeEnv`].
+    /// Creates a new `SessionContext` using the provided
+    /// [`SessionConfig`] and a [`RuntimeEnv`].
+    ///
+    /// # Resource Limits
+    ///
+    /// By default, each new `SessionContext` creates a new
+    /// `RuntimeEnv`, and therefore will not enforce memory or disk
+    /// limits for queries run on different `SessionContext`s.
+    ///
+    /// To enforce resource limits (e.g. to limit the total amount of
+    /// memory used) across all DataFusion queries in a process,
+    /// all `SessionContext`s should be configured with the
+    /// same `RuntimeEnv`.
     pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
         let state = SessionState::with_config_rt(config, runtime);
         Self::with_state(state)
     }
 
-    /// Creates a new session context using the provided session state.
+    /// Creates a new `SessionContext` using the provided [`SessionState`]
     pub fn with_state(state: SessionState) -> Self {
         Self {
             session_id: state.session_id.clone(),
@@ -262,7 +296,7 @@ impl SessionContext {
         }
     }
 
-    /// Returns the time this session was created
+    /// Returns the time this `SessionContext` was created
     pub fn session_start_time(&self) -> DateTime<Utc> {
         self.session_start_time
     }
@@ -282,12 +316,12 @@ impl SessionContext {
         )
     }
 
-    /// Return the [RuntimeEnv] used to run queries with this [SessionContext]
+    /// Return the [RuntimeEnv] used to run queries with this `SessionContext`
     pub fn runtime_env(&self) -> Arc<RuntimeEnv> {
         self.state.read().runtime_env.clone()
     }
 
-    /// Return the `session_id` of this Session
+    /// Returns an id that uniquely identifies this `SessionContext`.
     pub fn session_id(&self) -> String {
         self.session_id.clone()
     }
@@ -1205,7 +1239,7 @@ impl QueryPlanner for DefaultQueryPlanner {
 /// Execution context for registering data sources and executing queries
 #[derive(Clone)]
 pub struct SessionState {
-    /// UUID for the session
+    /// A unique UUID that identifies the session
     session_id: String,
     /// Responsible for analyzing and rewrite a logical plan before optimization
     analyzer: Analyzer,
@@ -1252,7 +1286,8 @@ pub fn default_session_builder(config: SessionConfig) -> SessionState {
 }
 
 impl SessionState {
-    /// Returns new SessionState using the provided configuration and runtime
+    /// Returns new [`SessionState`] using the provided
+    /// [`SessionConfig`] and [`RuntimeEnv`].
     pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
         let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc<dyn CatalogList>;
         Self::with_config_rt_and_catalog_list(config, runtime, catalog_list)
diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs
index ad9b9ce212..fa6c4e118e 100644
--- a/datafusion/core/src/execution/mod.rs
+++ b/datafusion/core/src/execution/mod.rs
@@ -15,30 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! This module contains the shared state available at different parts
-//! of query planning and execution
-//!
-//! # Runtime Environment
-//!
-//! [`runtime_env::RuntimeEnv`] can be created from a [`runtime_env::RuntimeConfig`] and
-//! stores state to be shared across multiple sessions. In most applications there will
-//! be a single [`runtime_env::RuntimeEnv`] for the entire process
-//!
-//! # Session Context
-//!
-//! [`context::SessionContext`] can be created from a [`context::SessionConfig`] and
-//! an optional [`runtime_env::RuntimeConfig`], and stores the state for a particular
-//! query session.
-//!
-//! In particular [`context::SessionState`] is the information available to query planning
-//!
-//! # Task Context
-//!
-//! [`context::TaskContext`] is typically created from a [`context::SessionContext`] or
-//! [`context::SessionState`], and represents the state available to query execution.
-//!
-//! In particular it is the state passed to [`crate::physical_plan::ExecutionPlan::execute`]
-//!
+//! Shared state for query planning and execution.
 
 pub mod context;
 // backwards compatibility
diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs
index 67736edf68..8f9c594681 100644
--- a/datafusion/execution/src/runtime_env.rs
+++ b/datafusion/execution/src/runtime_env.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Execution runtime environment that holds object Store, memory manager, disk manager
-//! and various system level components that are used during physical plan execution.
+//! Execution [`RuntimeEnv`] environment that manages access to object
+//! store, memory manager, and disk manager.
 
 use crate::{
     disk_manager::{DiskManager, DiskManagerConfig},
@@ -32,7 +32,15 @@ use std::sync::Arc;
 use url::Url;
 
 #[derive(Clone)]
-/// Execution runtime environment.
+/// Execution runtime environment that manages system resources such
+/// as memory, disk and storage.
+///
+/// A [`RuntimeEnv`] is created from a [`RuntimeConfig`] and has the
+/// following resource management functionality:
+///
+/// * [`MemoryPool`]: Manage memory
+/// * [`DiskManager`]: Manage temporary files on local disk
+/// * [`ObjectStoreRegistry`]: Manage mapping URLs to object store instances
 pub struct RuntimeEnv {
     /// Runtime memory management
     pub memory_pool: Arc<dyn MemoryPool>,
diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs
index 9f73f767af..ca1bc9369e 100644
--- a/datafusion/execution/src/task.rs
+++ b/datafusion/execution/src/task.rs
@@ -32,6 +32,11 @@ use crate::{
 };
 
 /// Task Execution Context
+///
+/// A [`TaskContext`] represents the state available during a single query's
+/// execution.
+///
+/// # Task Context
 pub struct TaskContext {
     /// Session Id
     session_id: String,
@@ -98,7 +103,7 @@ impl TaskContext {
         ))
     }
 
-    /// Return the SessionConfig associated with the Task
+    /// Return the SessionConfig associated with this [TaskContext]
     pub fn session_config(&self) -> &SessionConfig {
         &self.session_config
     }
diff --git a/datafusion/physical-expr/src/execution_props.rs b/datafusion/physical-expr/src/execution_props.rs
index ff413be361..5849850031 100644
--- a/datafusion/physical-expr/src/execution_props.rs
+++ b/datafusion/physical-expr/src/execution_props.rs
@@ -20,10 +20,12 @@ use chrono::{DateTime, TimeZone, Utc};
 use std::collections::HashMap;
 use std::sync::Arc;
 
-/// Holds per-execution properties and data (such as starting timestamps, etc).
-/// An instance of this struct is created each time a [`LogicalPlan`] is prepared for
-/// execution (optimized). If the same plan is optimized multiple times, a new
-/// `ExecutionProps` is created each time.
+/// Holds per-query execution properties and data (such as statement
+/// starting timestamps).
+///
+/// An [`ExecutionProps`] is created each time a [`LogicalPlan`] is
+/// prepared for execution (optimized). If the same plan is optimized
+/// multiple times, a new `ExecutionProps` is created each time.
 ///
 /// It is important that this structure be cheap to create as it is
 /// done so during predicate pruning and expression simplification