Posted to commits@arrow.apache.org by wj...@apache.org on 2024/01/11 21:07:47 UTC

(arrow-datafusion) branch main updated: docs: document SessionConfig (#8771)

This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 910cc76f53 docs: document SessionConfig (#8771)
910cc76f53 is described below

commit 910cc76f53dc897708eaab45ed8ce09aac5bc8aa
Author: Will Jones <wi...@gmail.com>
AuthorDate: Thu Jan 11 13:07:42 2024 -0800

    docs: document SessionConfig (#8771)
    
    * docs: document SessionConfig
    
    * make some tests happy
    
    * pr feedback
---
 datafusion/common/src/config.rs                    |  30 +++++-
 datafusion/execution/src/config.rs                 | 103 ++++++++++++++++++---
 .../sqllogictest/test_files/information_schema.slt |   2 +-
 docs/source/user-guide/configs.md                  |   2 +-
 4 files changed, 123 insertions(+), 14 deletions(-)
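
This commit adds rustdoc for `SessionConfig` and cross-links it from the option structs in `config.rs`. As a minimal sketch of the usage pattern the new docs describe (not part of this patch, and assuming the `datafusion` crate re-exports referenced by the doc links below):

```
use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Convenience setter; the new docs describe it as shorthand for the
    // `datafusion.execution.batch_size` key.
    let config = SessionConfig::new()
        .with_batch_size(4096)
        // String-keyed setter routed to ParquetOptions::pushdown_filters.
        .set_bool("datafusion.execution.parquet.pushdown_filters", true);

    // The config is handed to the context at construction time.
    let ctx = SessionContext::new_with_config(config);
    // `copied_config` is assumed here for reading the settings back;
    // it is not part of this commit.
    assert_eq!(ctx.copied_config().batch_size(), 4096);
}
```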

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 9921c446f8..5c051a7dee 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -153,6 +153,10 @@ macro_rules! config_namespace {
 
 config_namespace! {
     /// Options related to catalog and directory scanning
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct CatalogOptions {
         /// Whether the default catalog and schema should be created automatically.
         pub create_default_catalog_and_schema: bool, default = true
@@ -180,6 +184,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to SQL parser
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct SqlParserOptions {
         /// When set to true, SQL parser will parse float as decimal type
         pub parse_float_as_decimal: bool, default = false
@@ -196,6 +204,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to query execution
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ExecutionOptions {
         /// Default batch size while creating new batches, it's especially useful for
         /// buffer-in-memory batches since creating tiny batches would result in too much
@@ -283,6 +295,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to parquet files
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ParquetOptions {
         /// If true, reads the Parquet data page level metadata (the
         /// Page Index), if present, to reduce the I/O and number of
@@ -306,7 +322,7 @@ config_namespace! {
         pub metadata_size_hint: Option<usize>, default = None
 
         /// If true, filter expressions are applied during the parquet decoding operation to
-        /// reduce the number of rows decoded
+        /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
         pub pushdown_filters: bool, default = false
 
         /// If true, filter expressions evaluated during the parquet decoding operation
@@ -416,6 +432,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to aggregate execution
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct AggregateOptions {
         /// Specifies the threshold for using `ScalarValue`s to update
         /// accumulators during high-cardinality aggregations for each input batch.
@@ -433,6 +453,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to query optimization
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct OptimizerOptions {
         /// When set to true, the optimizer will push a limit operation into
         /// grouped aggregations which have no aggregate expressions, as a soft limit,
@@ -541,6 +565,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options controlling explain output
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ExplainOptions {
         /// When set to true, the explain statement will only print logical plans
         pub logical_plan_only: bool, default = false
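
Each `config_namespace` block above maps to a dotted key prefix, so the cross-linked structs can also be driven directly through `ConfigOptions`. A small sketch under that assumption, using the `ConfigOptions::set` call that `SessionConfig::set` wraps (see the next file); the chosen keys are illustrative:

```
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn configure() -> Result<ConfigOptions> {
    let mut options = ConfigOptions::new();
    // Routed to SqlParserOptions::parse_float_as_decimal.
    options.set("datafusion.sql_parser.parse_float_as_decimal", "true")?;
    // Routed to ParquetOptions::pushdown_filters ("late materialization").
    options.set("datafusion.execution.parquet.pushdown_filters", "true")?;
    assert!(options.execution.parquet.pushdown_filters);
    Ok(options)
}
```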
diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs
index 8556335b39..5158e71773 100644
--- a/datafusion/execution/src/config.rs
+++ b/datafusion/execution/src/config.rs
@@ -24,7 +24,69 @@ use std::{
 
 use datafusion_common::{config::ConfigOptions, Result, ScalarValue};
 
-/// Configuration options for Execution context
+/// Configuration options for [`SessionContext`].
+///
+/// Can be passed to [`SessionContext::new_with_config`] to customize the configuration of DataFusion.
+///
+/// Options can be set using namespaced keys with `.` as the separator, where the
+/// namespace determines which configuration struct the value is routed to. All
+/// built-in options are under the `datafusion` namespace.
+///
+/// For example, the key `datafusion.execution.batch_size` will set [ExecutionOptions::batch_size][datafusion_common::config::ExecutionOptions::batch_size],
+/// because [ConfigOptions::execution] is [ExecutionOptions][datafusion_common::config::ExecutionOptions]. Similarly, the key
+/// `datafusion.execution.parquet.pushdown_filters` will set [ParquetOptions::pushdown_filters][datafusion_common::config::ParquetOptions::pushdown_filters],
+/// since [ExecutionOptions::parquet][datafusion_common::config::ExecutionOptions::parquet] is [ParquetOptions][datafusion_common::config::ParquetOptions].
+///
+/// Some options have convenience methods. For example [SessionConfig::with_batch_size] is
+/// shorthand for setting `datafusion.execution.batch_size`.
+///
+/// ```
+/// use datafusion_execution::config::SessionConfig;
+/// use datafusion_common::ScalarValue;
+///
+/// let config = SessionConfig::new()
+///    .set("datafusion.execution.batch_size", ScalarValue::UInt64(Some(1234)))
+///    .set_bool("datafusion.execution.parquet.pushdown_filters", true);
+///
+/// assert_eq!(config.batch_size(), 1234);
+/// assert_eq!(config.options().execution.batch_size, 1234);
+/// assert_eq!(config.options().execution.parquet.pushdown_filters, true);
+/// ```
+///
+/// You can also directly mutate the options via [SessionConfig::options_mut].
+/// So the following is equivalent to the above:
+///
+/// ```
+/// # use datafusion_execution::config::SessionConfig;
+/// # use datafusion_common::ScalarValue;
+/// #
+/// let mut config = SessionConfig::new();
+/// config.options_mut().execution.batch_size = 1234;
+/// config.options_mut().execution.parquet.pushdown_filters = true;
+/// #
+/// # assert_eq!(config.batch_size(), 1234);
+/// # assert_eq!(config.options().execution.batch_size, 1234);
+/// # assert_eq!(config.options().execution.parquet.pushdown_filters, true);
+/// ```
+///
+/// ## Built-in options
+///
+/// | Namespace | Config struct |
+/// | --------- | ------------- |
+/// | `datafusion.catalog` | [CatalogOptions][datafusion_common::config::CatalogOptions] |
+/// | `datafusion.execution` | [ExecutionOptions][datafusion_common::config::ExecutionOptions] |
+/// | `datafusion.execution.aggregate` | [AggregateOptions][datafusion_common::config::AggregateOptions] |
+/// | `datafusion.execution.parquet` | [ParquetOptions][datafusion_common::config::ParquetOptions] |
+/// | `datafusion.optimizer` | [OptimizerOptions][datafusion_common::config::OptimizerOptions] |
+/// | `datafusion.sql_parser` | [SqlParserOptions][datafusion_common::config::SqlParserOptions] |
+/// | `datafusion.explain` | [ExplainOptions][datafusion_common::config::ExplainOptions] |
+///
+/// ## Custom configuration
+///
+/// Configuration options can be extended. See [SessionConfig::with_extension] for details.
+///
+/// [`SessionContext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+/// [`SessionContext::new_with_config`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.new_with_config
 #[derive(Clone, Debug)]
 pub struct SessionConfig {
     /// Configuration options
@@ -62,6 +124,35 @@ impl SessionConfig {
         Ok(ConfigOptions::from_string_hash_map(settings)?.into())
     }
 
+    /// Return a handle to the configuration options.
+    ///
+    /// Can be used to read the current configuration.
+    ///
+    /// ```
+    /// use datafusion_execution::config::SessionConfig;
+    ///
+    /// let config = SessionConfig::new();
+    /// assert!(config.options().execution.batch_size > 0);
+    /// ```
+    pub fn options(&self) -> &ConfigOptions {
+        &self.options
+    }
+
+    /// Return a mutable handle to the configuration options.
+    ///
+    /// Can be used to set configuration options.
+    ///
+    /// ```
+    /// use datafusion_execution::config::SessionConfig;
+    ///
+    /// let mut config = SessionConfig::new();
+    /// config.options_mut().execution.batch_size = 1024;
+    /// assert_eq!(config.options().execution.batch_size, 1024);
+    /// ```
+    pub fn options_mut(&mut self) -> &mut ConfigOptions {
+        &mut self.options
+    }
+
     /// Set a configuration option
     pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
         self.options.set(key, &value.to_string()).unwrap();
@@ -346,16 +437,6 @@ impl SessionConfig {
         &mut self.options
     }
 
-    /// Return a handle to the configuration options.
-    pub fn options(&self) -> &ConfigOptions {
-        &self.options
-    }
-
-    /// Return a mutable handle to the configuration options.
-    pub fn options_mut(&mut self) -> &mut ConfigOptions {
-        &mut self.options
-    }
-
     /// Add extensions.
     ///
     /// Extensions can be used to attach extra data to the session config -- e.g. tracing information or caches.
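
The extension mechanism mentioned above can carry arbitrary application data on the config. A hedged sketch, assuming the `Arc`-based `with_extension`/`get_extension` API that the doc link refers to (the `TenantId` type is hypothetical):

```
use std::sync::Arc;
use datafusion_execution::config::SessionConfig;

// Hypothetical application-specific value attached to the session config.
#[derive(Debug)]
struct TenantId(String);

fn main() {
    let config = SessionConfig::new()
        .with_extension(Arc::new(TenantId("acme".to_string())));

    // Later, e.g. inside a custom planner rule or table provider:
    if let Some(tenant) = config.get_extension::<TenantId>() {
        println!("session belongs to tenant {:?}", tenant.0);
    }
}
```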
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 14ef290294..f8893bf7ae 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -248,7 +248,7 @@ datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 By def
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as [...]
 datafusion.execution.parquet.metadata_size_hint NULL If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
 datafusion.execution.parquet.pruning true If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
-datafusion.execution.parquet.pushdown_filters false If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded
+datafusion.execution.parquet.pushdown_filters false If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
 datafusion.execution.parquet.reorder_filters false If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
 datafusion.execution.parquet.skip_metadata true If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
 datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
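
The settings and descriptions exercised by this test file are also queryable at runtime through `information_schema.df_settings`. A sketch of that, assuming the full `datafusion` crate plus a `tokio` runtime (the query itself is illustrative):

```
use datafusion::error::Result;
use datafusion::prelude::{SessionConfig, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    // df_settings is only available when the information schema is enabled.
    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);

    // Lists the parquet-related settings with their current values.
    ctx.sql(
        "SELECT name, value FROM information_schema.df_settings \
         WHERE name LIKE 'datafusion.execution.parquet.%'",
    )
    .await?
    .show()
    .await?;
    Ok(())
}
```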
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 4a379d374c..7111ea1d0a 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -53,7 +53,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.execution.parquet.pruning                                    | true                      | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file                                                                                                                                                                                                                                      [...]
 | datafusion.execution.parquet.skip_metadata                              | true                      | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata                                                                                                                                                    [...]
 | datafusion.execution.parquet.metadata_size_hint                         | NULL                      | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer                                                                                                                              [...]
-| datafusion.execution.parquet.pushdown_filters                           | false                     | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded                                                                                                                                                                                                                                                                                [...]
+| datafusion.execution.parquet.pushdown_filters                           | false                     | If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".  [...]
 | datafusion.execution.parquet.reorder_filters                            | false                     | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query                                                                                                                                                                          [...]
 | datafusion.execution.parquet.data_pagesize_limit                        | 1048576                   | Sets best effort maximum size of data page in bytes                                                                                                                                                                                                                                                                                                                                                  [...]
 | datafusion.execution.parquet.write_batch_size                           | 1024                      | Sets write_batch_size in bytes                                                                                                                                                                                                                                                                                                                                                                       [...]
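
The configs.md page above also notes that environment variables are read during `SessionConfig` initialisation. A sketch of that path, assuming `SessionConfig::from_env` and the `DATAFUSION_<NAMESPACE>_<KEY>` variable naming described in the user guide (both are assumptions, not part of this commit):

```
use datafusion_common::Result;
use datafusion_execution::config::SessionConfig;

fn main() -> Result<()> {
    // Assumes the variable was exported before the process started, e.g.
    //   DATAFUSION_EXECUTION_BATCH_SIZE=2048 ./my_app
    let config = SessionConfig::from_env()?;
    println!("batch_size = {}", config.options().execution.batch_size);
    Ok(())
}
```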