You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/04/26 15:15:37 UTC
[arrow-datafusion] branch main updated: chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a38480951f chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)
a38480951f is described below
commit a38480951f40abce7ee2d5919251a1d1607f1dee
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Apr 26 11:15:31 2023 -0400
chore: Update api docs for `SessionContext`, `TaskContext`, etc (#6106)
* chore: Update api docs for `SessionContext`, `TaskContext`, etc
* clarify RuntimeEnv resource enforcement
---
datafusion/core/src/execution/context.rs | 63 +++++++++++++++++++------
datafusion/core/src/execution/mod.rs | 25 +---------
datafusion/execution/src/runtime_env.rs | 14 ++++--
datafusion/execution/src/task.rs | 7 ++-
datafusion/physical-expr/src/execution_props.rs | 10 ++--
5 files changed, 73 insertions(+), 46 deletions(-)
diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 265595c59d..50aea80fc9 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! SessionContext contains methods for registering data sources and executing queries
+//! [`SessionContext`] contains methods for registering data sources and executing queries
use crate::{
catalog::catalog::{CatalogList, MemoryCatalogList},
datasource::{
@@ -158,11 +158,15 @@ where
}
}
-/// SessionContext is the main interface for executing queries with DataFusion. It stands for
-/// the connection between user and DataFusion/Ballista cluster.
-/// The context provides the following functionality
+/// Main interface for executing queries with DataFusion. Maintains
+/// the state of the connection between a user and an instance of the
+/// DataFusion engine.
///
-/// * Create DataFrame from a CSV or Parquet data source.
+/// # Overview
+///
+/// [`SessionContext`] provides the following functionality:
+///
+/// * Create a DataFrame from a CSV or Parquet data source.
/// * Register a CSV or Parquet data source as a table that can be referenced from a SQL query.
/// * Register a custom data source that can be referenced from a SQL query.
/// * Execution a SQL query
@@ -199,6 +203,20 @@ where
/// # Ok(())
/// # }
/// ```
+///
+/// # `SessionContext`, `SessionState`, and `TaskContext`
+///
+/// A [`SessionContext`] can be created from a [`SessionConfig`] and
+/// stores the state for a particular query session. A single
+/// [`SessionContext`] can run multiple queries.
+///
+/// [`SessionState`] contains information available during query
+/// planning (creating [`LogicalPlan`]s and [`ExecutionPlan`]s).
+///
+/// [`TaskContext`] contains the state available during query
+/// execution (see [`ExecutionPlan::execute`]). It contains a subset of the
+/// information in [`SessionState`] and is created from a
+/// [`SessionContext`] or a [`SessionState`].
#[derive(Clone)]
pub struct SessionContext {
/// UUID for the session
@@ -216,7 +234,7 @@ impl Default for SessionContext {
}
impl SessionContext {
- /// Creates a new execution context using a default session configuration.
+ /// Creates a new `SessionContext` using the default [`SessionConfig`].
pub fn new() -> Self {
Self::with_config(SessionConfig::new())
}
@@ -241,19 +259,35 @@ impl SessionContext {
Ok(())
}
- /// Creates a new session context using the provided session configuration.
+ /// Creates a new `SessionContext` using the provided
+ /// [`SessionConfig`] and a new [`RuntimeEnv`].
+ ///
+ /// See [`Self::with_config_rt`] for more details on resource
+ /// limits.
pub fn with_config(config: SessionConfig) -> Self {
let runtime = Arc::new(RuntimeEnv::default());
Self::with_config_rt(config, runtime)
}
- /// Creates a new session context using the provided configuration and [`RuntimeEnv`].
+ /// Creates a new `SessionContext` using the provided
+ /// [`SessionConfig`] and a [`RuntimeEnv`].
+ ///
+ /// # Resource Limits
+ ///
+ /// By default, each new `SessionContext` creates a new
+ /// `RuntimeEnv`, and therefore will not enforce memory or disk
+ /// limits for queries run on different `SessionContext`s.
+ ///
+ /// To enforce resource limits (e.g. to limit the total amount of
+ /// memory used) across all DataFusion queries in a process,
+ /// all `SessionContext`s should be configured with the
+ /// same `RuntimeEnv`.
pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
let state = SessionState::with_config_rt(config, runtime);
Self::with_state(state)
}
- /// Creates a new session context using the provided session state.
+ /// Creates a new `SessionContext` using the provided [`SessionState`]
pub fn with_state(state: SessionState) -> Self {
Self {
session_id: state.session_id.clone(),
@@ -262,7 +296,7 @@ impl SessionContext {
}
}
- /// Returns the time this session was created
+ /// Returns the time this `SessionContext` was created
pub fn session_start_time(&self) -> DateTime<Utc> {
self.session_start_time
}
@@ -282,12 +316,12 @@ impl SessionContext {
)
}
- /// Return the [RuntimeEnv] used to run queries with this [SessionContext]
+ /// Return the [RuntimeEnv] used to run queries with this `SessionContext`
pub fn runtime_env(&self) -> Arc<RuntimeEnv> {
self.state.read().runtime_env.clone()
}
- /// Return the `session_id` of this Session
+ /// Returns an id that uniquely identifies this `SessionContext`.
pub fn session_id(&self) -> String {
self.session_id.clone()
}
@@ -1205,7 +1239,7 @@ impl QueryPlanner for DefaultQueryPlanner {
/// Execution context for registering data sources and executing queries
#[derive(Clone)]
pub struct SessionState {
- /// UUID for the session
+ /// A unique UUID that identifies the session
session_id: String,
/// Responsible for analyzing and rewrite a logical plan before optimization
analyzer: Analyzer,
@@ -1252,7 +1286,8 @@ pub fn default_session_builder(config: SessionConfig) -> SessionState {
}
impl SessionState {
- /// Returns new SessionState using the provided configuration and runtime
+ /// Returns new [`SessionState`] using the provided
+ /// [`SessionConfig`] and [`RuntimeEnv`].
pub fn with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc<dyn CatalogList>;
Self::with_config_rt_and_catalog_list(config, runtime, catalog_list)
diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs
index ad9b9ce212..fa6c4e118e 100644
--- a/datafusion/core/src/execution/mod.rs
+++ b/datafusion/core/src/execution/mod.rs
@@ -15,30 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! This module contains the shared state available at different parts
-//! of query planning and execution
-//!
-//! # Runtime Environment
-//!
-//! [`runtime_env::RuntimeEnv`] can be created from a [`runtime_env::RuntimeConfig`] and
-//! stores state to be shared across multiple sessions. In most applications there will
-//! be a single [`runtime_env::RuntimeEnv`] for the entire process
-//!
-//! # Session Context
-//!
-//! [`context::SessionContext`] can be created from a [`context::SessionConfig`] and
-//! an optional [`runtime_env::RuntimeConfig`], and stores the state for a particular
-//! query session.
-//!
-//! In particular [`context::SessionState`] is the information available to query planning
-//!
-//! # Task Context
-//!
-//! [`context::TaskContext`] is typically created from a [`context::SessionContext`] or
-//! [`context::SessionState`], and represents the state available to query execution.
-//!
-//! In particular it is the state passed to [`crate::physical_plan::ExecutionPlan::execute`]
-//!
+//! Shared state for query planning and execution.
pub mod context;
// backwards compatibility
diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs
index 67736edf68..8f9c594681 100644
--- a/datafusion/execution/src/runtime_env.rs
+++ b/datafusion/execution/src/runtime_env.rs
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-//! Execution runtime environment that holds object Store, memory manager, disk manager
-//! and various system level components that are used during physical plan execution.
+//! [`RuntimeEnv`] execution environment that manages access to the object
+//! store, memory manager, and disk manager.
use crate::{
disk_manager::{DiskManager, DiskManagerConfig},
@@ -32,7 +32,15 @@ use std::sync::Arc;
use url::Url;
#[derive(Clone)]
-/// Execution runtime environment.
+/// Execution runtime environment that manages system resources such
+/// as memory, disk and storage.
+///
+/// A [`RuntimeEnv`] is created from a [`RuntimeConfig`] and has the
+/// following resource management functionality:
+///
+/// * [`MemoryPool`]: Manage memory
+/// * [`DiskManager`]: Manage temporary files on local disk
+/// * [`ObjectStoreRegistry`]: Manage mapping URLs to object store instances
pub struct RuntimeEnv {
/// Runtime memory management
pub memory_pool: Arc<dyn MemoryPool>,
diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs
index 9f73f767af..ca1bc9369e 100644
--- a/datafusion/execution/src/task.rs
+++ b/datafusion/execution/src/task.rs
@@ -32,6 +32,11 @@ use crate::{
};
/// Task Execution Context
+///
+/// A [`TaskContext`] represents the state available during a single query's
+/// execution.
+///
+/// # Task Context
pub struct TaskContext {
/// Session Id
session_id: String,
@@ -98,7 +103,7 @@ impl TaskContext {
))
}
- /// Return the SessionConfig associated with the Task
+ /// Return the SessionConfig associated with this [TaskContext]
pub fn session_config(&self) -> &SessionConfig {
&self.session_config
}
diff --git a/datafusion/physical-expr/src/execution_props.rs b/datafusion/physical-expr/src/execution_props.rs
index ff413be361..5849850031 100644
--- a/datafusion/physical-expr/src/execution_props.rs
+++ b/datafusion/physical-expr/src/execution_props.rs
@@ -20,10 +20,12 @@ use chrono::{DateTime, TimeZone, Utc};
use std::collections::HashMap;
use std::sync::Arc;
-/// Holds per-execution properties and data (such as starting timestamps, etc).
-/// An instance of this struct is created each time a [`LogicalPlan`] is prepared for
-/// execution (optimized). If the same plan is optimized multiple times, a new
-/// `ExecutionProps` is created each time.
+/// Holds per-query execution properties and data (such as statement
+/// starting timestamps).
+///
+/// An [`ExecutionProps`] is created each time a [`LogicalPlan`] is
+/// prepared for execution (optimized). If the same plan is optimized
+/// multiple times, a new `ExecutionProps` is created each time.
///
/// It is important that this structure be cheap to create as it is
/// done so during predicate pruning and expression simplification