You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/03 21:58:50 UTC

[arrow-datafusion] branch master updated: Move default catalog and schema onto ConfigOptions (#3887) (#4805)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 5b70e3543 Move default catalog and schema onto ConfigOptions (#3887) (#4805)
5b70e3543 is described below

commit 5b70e3543f5c10832e43ed25e7d4166cf0c1df78
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Jan 3 21:58:45 2023 +0000

    Move default catalog and schema onto ConfigOptions (#3887) (#4805)
---
 datafusion/core/src/config.rs                      |  6 +++
 datafusion/core/src/execution/context.rs           | 61 +++++++++++----------
 .../test_files/information_schema.slt              |  2 +
 docs/source/user-guide/configs.md                  | 62 +++++++++++-----------
 4 files changed, 72 insertions(+), 59 deletions(-)

diff --git a/datafusion/core/src/config.rs b/datafusion/core/src/config.rs
index 4e6cddcd5..4b3df66da 100644
--- a/datafusion/core/src/config.rs
+++ b/datafusion/core/src/config.rs
@@ -158,6 +158,12 @@ config_namespace! {
         /// concurrency. Defaults to the number of cpu cores on the system.
         pub create_default_catalog_and_schema: bool, default = true
 
+        /// The default catalog name - this impacts what SQL queries use if not specified
+        pub default_catalog: String, default = "datafusion".to_string()
+
+        /// The default schema name - this impacts what SQL queries use if not specified
+        pub default_schema: String, default = "public".to_string()
+
         /// Should DataFusion provide access to `information_schema`
         /// virtual tables for displaying schema information
         pub information_schema: bool, default = false
diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 93981e905..70882d8fa 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -100,11 +100,6 @@ use super::options::{
     AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions,
 };
 
-/// The default catalog name - this impacts what SQL queries use if not specified
-const DEFAULT_CATALOG: &str = "datafusion";
-/// The default schema name - this impacts what SQL queries use if not specified
-const DEFAULT_SCHEMA: &str = "public";
-
 /// SessionContext is the main interface for executing queries with DataFusion. It stands for
 /// the connection between user and DataFusion/Ballista cluster.
 /// The context provides the following functionality
@@ -381,18 +376,32 @@ impl SessionContext {
                 // so for now, we default to default catalog
                 let tokens: Vec<&str> = schema_name.split('.').collect();
                 let (catalog, schema_name) = match tokens.len() {
-                    1 => Ok((DEFAULT_CATALOG, schema_name.as_str())),
-                    2 => Ok((tokens[0], tokens[1])),
-                    _ => Err(DataFusionError::Execution(format!(
-                        "Unable to parse catalog from {schema_name}"
-                    ))),
-                }?;
-                let catalog = self.catalog(catalog).ok_or_else(|| {
-                    DataFusionError::Execution(format!(
-                        "Missing '{DEFAULT_CATALOG}' catalog"
-                    ))
-                })?;
-
+                    1 => {
+                        let state = self.state.read();
+                        let name = &state.config.options.catalog.default_catalog;
+                        let catalog =
+                            state.catalog_list.catalog(name).ok_or_else(|| {
+                                DataFusionError::Execution(format!(
+                                    "Missing default catalog '{name}'"
+                                ))
+                            })?;
+                        (catalog, tokens[0])
+                    }
+                    2 => {
+                        let name = &tokens[0];
+                        let catalog = self.catalog(name).ok_or_else(|| {
+                            DataFusionError::Execution(format!(
+                                "Missing catalog '{name}'"
+                            ))
+                        })?;
+                        (catalog, tokens[1])
+                    }
+                    _ => {
+                        return Err(DataFusionError::Execution(format!(
+                            "Unable to parse catalog from {schema_name}"
+                        )))
+                    }
+                };
                 let schema = catalog.schema(schema_name);
 
                 match (if_not_exists, schema) {
@@ -1098,11 +1107,6 @@ impl Hasher for IdHasher {
 /// Configuration options for session context
 #[derive(Clone)]
 pub struct SessionConfig {
-    /// Default catalog name for table resolution
-    default_catalog: String,
-    /// Default schema name for table resolution (not in ConfigOptions
-    /// due to `resolve_table_ref` which passes back references)
-    default_schema: String,
     /// Configuration options
     options: ConfigOptions,
     /// Opaque extensions.
@@ -1112,8 +1116,6 @@ pub struct SessionConfig {
 impl Default for SessionConfig {
     fn default() -> Self {
         Self {
-            default_catalog: DEFAULT_CATALOG.to_owned(),
-            default_schema: DEFAULT_SCHEMA.to_owned(),
             options: ConfigOptions::new(),
             // Assume no extensions by default.
             extensions: HashMap::with_capacity_and_hasher(
@@ -1219,8 +1221,8 @@ impl SessionConfig {
         catalog: impl Into<String>,
         schema: impl Into<String>,
     ) -> Self {
-        self.default_catalog = catalog.into();
-        self.default_schema = schema.into();
+        self.options.catalog.default_catalog = catalog.into();
+        self.options.catalog.default_schema = schema.into();
         self
     }
 
@@ -1435,7 +1437,7 @@ impl SessionState {
 
             default_catalog
                 .register_schema(
-                    &config.default_schema,
+                    &config.config_options().catalog.default_schema,
                     Arc::new(MemorySchemaProvider::new()),
                 )
                 .expect("memory catalog provider can register schema");
@@ -1443,7 +1445,7 @@ impl SessionState {
             Self::register_default_schema(&config, &runtime, &default_catalog);
 
             catalog_list.register_catalog(
-                config.default_catalog.clone(),
+                config.config_options().catalog.default_catalog.clone(),
                 Arc::new(default_catalog),
             );
         }
@@ -1560,9 +1562,10 @@ impl SessionState {
         &'a self,
         table_ref: impl Into<TableReference<'a>>,
     ) -> ResolvedTableReference<'a> {
+        let catalog = &self.config_options().catalog;
         table_ref
             .into()
-            .resolve(&self.config.default_catalog, &self.config.default_schema)
+            .resolve(&catalog.default_catalog, &catalog.default_schema)
     }
 
     fn schema_for_ref<'a>(
diff --git a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt
index 2d663d33b..24b54aa49 100644
--- a/datafusion/core/tests/sqllogictests/test_files/information_schema.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/information_schema.slt
@@ -103,6 +103,8 @@ query R
 SHOW ALL
 ----
 datafusion.catalog.create_default_catalog_and_schema true
+datafusion.catalog.default_catalog datafusion
+datafusion.catalog.default_schema public
 datafusion.catalog.format NULL
 datafusion.catalog.has_header false
 datafusion.catalog.information_schema true
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index cfc2ddc18..57d23ce69 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -35,33 +35,35 @@ Values are parsed according to the [same rules used in casts from Utf8](https://
 If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
 Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
 
-| key                                                       | default | description                                                                                                                                                                                                                                                                                                |
-| --------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| datafusion.catalog.create_default_catalog_and_schema      | true    | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system.                                                                                                                                                               |
-| datafusion.catalog.information_schema                     | false   | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information                                                                                                                                                                                                  |
-| datafusion.catalog.location                               | NULL    | Location scanned to load tables for `default` schema                                                                                                                                                                                                                                                       |
-| datafusion.catalog.format                                 | NULL    | Type of `TableProvider` to use when loading `default` schema                                                                                                                                                                                                                                               |
-| datafusion.catalog.has_header                             | false   | If the file has a header                                                                                                                                                                                                                                                                                   |
-| datafusion.execution.batch_size                           | 8192    | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption                                                                                                                       |
-| datafusion.execution.coalesce_batches                     | true    | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |
-| datafusion.execution.collect_statistics                   | false   | Should DataFusion collect statistics after listing files                                                                                                                                                                                                                                                   |
-| datafusion.execution.target_partitions                    | 0       | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system                                                                                                                                                                |
-| datafusion.execution.time_zone                            | +00:00  | The default time zone Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour                                                                                                                                              |
-| datafusion.execution.parquet.enable_page_index            | false   | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded.                                                                                                                                                                                               |
-| datafusion.execution.parquet.pruning                      | true    | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file                                                                                                                                            |
-| datafusion.execution.parquet.skip_metadata                | true    | If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata                                                          |
-| datafusion.execution.parquet.metadata_size_hint           | NULL    | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two read are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer                                     |
-| datafusion.execution.parquet.pushdown_filters             | false   | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded                                                                                                                                                                                      |
-| datafusion.execution.parquet.reorder_filters              | false   | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query                                                                                |
-| datafusion.optimizer.enable_round_robin_repartition       | true    | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores                                                                                                                                                                   |
-| datafusion.optimizer.filter_null_join_keys                | false   | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.                                            |
-| datafusion.optimizer.repartition_aggregations             | true    | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level"                                                                                                                                                                |
-| datafusion.optimizer.repartition_joins                    | true    | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level"                                                                                                                                                                          |
-| datafusion.optimizer.repartition_windows                  | true    | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level"                                                                                                                                                         |
-| datafusion.optimizer.skip_failed_rules                    | true    | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail                                                                       |
-| datafusion.optimizer.max_passes                           | 3       | Number of times that the optimizer will attempt to optimize the plan                                                                                                                                                                                                                                       |
-| datafusion.optimizer.top_down_join_key_reordering         | true    | When set to true, the physical plan optimizer will run a top down process to reorder the join keys                                                                                                                                                                                                         |
-| datafusion.optimizer.prefer_hash_join                     | true    | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory                                                                                                                                      |
-| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                             |
-| datafusion.explain.logical_plan_only                      | false   | When set to true, the explain statement will only print logical plans                                                                                                                                                                                                                                      |
-| datafusion.explain.physical_plan_only                     | false   | When set to true, the explain statement will only print physical plans                                                                                                                                                                                                                                     |
+| key                                                       | default    | description                                                                                                                                                                                                                                                                                                |
+| --------------------------------------------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| datafusion.catalog.create_default_catalog_and_schema      | true       | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system.                                                                                                                                                               |
+| datafusion.catalog.default_catalog                        | datafusion | The default catalog name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                              |
+| datafusion.catalog.default_schema                         | public     | The default schema name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                               |
+| datafusion.catalog.information_schema                     | false      | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information                                                                                                                                                                                                  |
+| datafusion.catalog.location                               | NULL       | Location scanned to load tables for `default` schema                                                                                                                                                                                                                                                       |
+| datafusion.catalog.format                                 | NULL       | Type of `TableProvider` to use when loading `default` schema                                                                                                                                                                                                                                               |
+| datafusion.catalog.has_header                             | false      | If the file has a header                                                                                                                                                                                                                                                                                   |
+| datafusion.execution.batch_size                           | 8192       | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption                                                                                                                        |
+| datafusion.execution.coalesce_batches                     | true       | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |
+| datafusion.execution.collect_statistics                   | false      | Should DataFusion collect statistics after listing files                                                                                                                                                                                                                                                   |
+| datafusion.execution.target_partitions                    | 0          | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system                                                                                                                                                                |
+| datafusion.execution.time_zone                            | +00:00     | The default time zone. Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour                                                                                                                                             |
+| datafusion.execution.parquet.enable_page_index            | false      | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded.                                                                                                                                                                                               |
+| datafusion.execution.parquet.pruning                      | true       | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file                                                                                                                                            |
+| datafusion.execution.parquet.skip_metadata                | true       | If true, the parquet reader skips the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata                                                         |
+| datafusion.execution.parquet.metadata_size_hint           | NULL       | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer                                    |
+| datafusion.execution.parquet.pushdown_filters             | false      | If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded                                                                                                                                                                                      |
+| datafusion.execution.parquet.reorder_filters              | false      | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query                                                                                |
+| datafusion.optimizer.enable_round_robin_repartition       | true       | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores                                                                                                                                                                   |
+| datafusion.optimizer.filter_null_join_keys                | false      | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.                                            |
+| datafusion.optimizer.repartition_aggregations             | true       | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level                                                                                                                                                                 |
+| datafusion.optimizer.repartition_joins                    | true       | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level                                                                                                                                                                           |
+| datafusion.optimizer.repartition_windows                  | true       | Should DataFusion repartition data using the partition keys to execute window functions in parallel using the provided `target_partitions` level                                                                                                                                                           |
+| datafusion.optimizer.skip_failed_rules                    | true       | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail                                                                       |
+| datafusion.optimizer.max_passes                           | 3          | Number of times that the optimizer will attempt to optimize the plan                                                                                                                                                                                                                                       |
+| datafusion.optimizer.top_down_join_key_reordering         | true       | When set to true, the physical plan optimizer will run a top down process to reorder the join keys                                                                                                                                                                                                         |
+| datafusion.optimizer.prefer_hash_join                     | true       | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory                                                                                                                                      |
+| datafusion.optimizer.hash_join_single_partition_threshold | 1048576    | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                             |
+| datafusion.explain.logical_plan_only                      | false      | When set to true, the explain statement will only print logical plans                                                                                                                                                                                                                                      |
+| datafusion.explain.physical_plan_only                     | false      | When set to true, the explain statement will only print physical plans                                                                                                                                                                                                                                     |