You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/07/01 20:20:11 UTC
[arrow-datafusion] branch master updated: Allow setting of config options via environment variables (#2812)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 88b88d436 Allow setting of config options via environment variables (#2812)
88b88d436 is described below
commit 88b88d4360054a85982987aa07b3f3afd2db7d70
Author: Mike Roberts <42...@users.noreply.github.com>
AuthorDate: Fri Jul 1 21:20:06 2022 +0100
Allow setting of config options via environment variables (#2812)
* Allow setting of config options via environment variables
* Fix clippy warnings in config.rs
* Remove default value check in config.rs, to avoid interference with get_from_env
* Move try_from_string helper from config.rs to ScalarValue::
* Warn on unparseable environmental config option
* Move test for config from environment to integration tests to avoid interfering with others
* Fix clippy warning
* Add ConfigOptions::from_env
* Use default in SessionConfig::from_env rather than duplicating it
Co-authored-by: Andy Grove <an...@gmail.com>
* Use ..Default::default() in SessionConfig::from_env
Co-authored-by: Andy Grove <an...@gmail.com>
---
datafusion-cli/src/main.rs | 2 +-
datafusion/common/src/scalar.rs | 10 ++++++-
datafusion/core/src/config.rs | 30 +++++++++++++++++++
datafusion/core/src/execution/context.rs | 8 ++++++
datafusion/core/tests/config_from_env.rs | 49 ++++++++++++++++++++++++++++++++
dev/update_config_docs.sh | 8 ++++++
docs/source/user-guide/configs.md | 8 ++++++
7 files changed, 113 insertions(+), 2 deletions(-)
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 23d8c74c1..44c5e2ac9 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -92,7 +92,7 @@ pub async fn main() -> Result<()> {
env::set_current_dir(&p).unwrap();
};
- let mut session_config = SessionConfig::new().with_information_schema(true);
+ let mut session_config = SessionConfig::from_env().with_information_schema(true);
if let Some(batch_size) = args.batch_size {
session_config = session_config.with_batch_size(batch_size);
diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs
index f21a6e605..76d94677c 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar.rs
@@ -20,7 +20,7 @@
use crate::error::{DataFusionError, Result};
use arrow::{
array::*,
- compute::kernels::cast::cast,
+ compute::kernels::cast::{cast, cast_with_options, CastOptions},
datatypes::{
ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type,
Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit,
@@ -1442,6 +1442,14 @@ impl ScalarValue {
})
}
+ /// Try to parse `value` into a ScalarValue of type `target_type`, by wrapping it in a `Utf8` scalar, casting that to `target_type` and reading element 0 of the result; a failed cast or extraction is returned as `Err`
+ pub fn try_from_string(value: String, target_type: &DataType) -> Result<Self> {
+ let value = ScalarValue::Utf8(Some(value));
+ let cast_options = CastOptions { safe: false }; // NOTE(review): presumably `safe: false` makes an unparseable string an error instead of a null; confirm against arrow's CastOptions docs
+ let cast_arr = cast_with_options(&value.to_array(), target_type, &cast_options)?;
+ ScalarValue::try_from_array(&cast_arr, 0) // convert element 0 of the cast array back into a scalar
+ }
+
fn eq_array_decimal(
array: &ArrayRef,
index: usize,
diff --git a/datafusion/core/src/config.rs b/datafusion/core/src/config.rs
index 9a0570be2..20aa0055b 100644
--- a/datafusion/core/src/config.rs
+++ b/datafusion/core/src/config.rs
@@ -20,7 +20,9 @@
use arrow::datatypes::DataType;
use datafusion_common::ScalarValue;
use itertools::Itertools;
+use log::warn;
use std::collections::HashMap;
+use std::env;
/// Configuration option "datafusion.optimizer.filter_null_join_keys"
pub const OPT_FILTER_NULL_JOIN_KEYS: &str = "datafusion.optimizer.filter_null_join_keys";
@@ -186,6 +188,34 @@ impl ConfigOptions {
Self { options }
}
+ /// Create new ConfigOptions struct, taking values from environment variables where possible.
+ /// For example, setting DATAFUSION_EXECUTION_BATCH_SIZE to control `datafusion.execution.batch_size`. An option whose variable cannot be read or cannot be parsed to the option's data type keeps its default value.
+ pub fn from_env() -> Self {
+ let mut options = HashMap::new();
+ let built_in = BuiltInConfigs::new();
+ for config_def in &built_in.config_definitions {
+ let config_value = {
+ let mut env_key = config_def.key.replace('.', "_"); // derive the variable name from the option key: '.' becomes '_' ...
+ env_key.make_ascii_uppercase(); // ... and the result is upper-cased
+ match env::var(&env_key) {
+ Ok(value) => match ScalarValue::try_from_string(
+ value.clone(), // cloned because `value` is used again in the warning below
+ &config_def.data_type,
+ ) {
+ Ok(parsed) => parsed,
+ Err(_) => { // a parse failure is not fatal: log it and fall back to the default
+ warn!("Warning: could not parse environment variable {}={} to type {}.", env_key, value, config_def.data_type);
+ config_def.default_value.clone()
+ }
+ },
+ Err(_) => config_def.default_value.clone(), // variable could not be read (e.g. it is not set): use the default
+ }
+ };
+ options.insert(config_def.key.clone(), config_value); // every built-in option gets an entry, overridden or not
+ }
+ Self { options }
+ }
+
/// set a configuration option
pub fn set(&mut self, key: &str, value: ScalarValue) {
self.options.insert(key.to_string(), value);
diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 458b91526..b998ca597 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -1041,6 +1041,14 @@ impl SessionConfig {
Default::default()
}
+ /// Create an execution config with config options read from the environment (via `ConfigOptions::from_env`); every other field takes its `Default` value
+ pub fn from_env() -> Self {
+ Self {
+ config_options: ConfigOptions::from_env(),
+ ..Default::default()
+ }
+ }
+
/// Set a configuration option
pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
self.config_options.set(key, value);
diff --git a/datafusion/core/tests/config_from_env.rs b/datafusion/core/tests/config_from_env.rs
new file mode 100644
index 000000000..8dbbc7b0f
--- /dev/null
+++ b/datafusion/core/tests/config_from_env.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::config::ConfigOptions;
+use std::env;
+
+#[test]
+fn get_config_bool_from_env() { // a boolean option set through its environment variable is picked up by `ConfigOptions::from_env`
+ let config_key = "datafusion.optimizer.filter_null_join_keys";
+ let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS"; // `config_key` upper-cased, with '.' replaced by '_'
+ env::set_var(env_key, "true");
+ let config = ConfigOptions::from_env();
+ env::remove_var(env_key); // removed before the assertion, so a failing assert does not leave the variable set
+ assert!(config.get_bool(config_key));
+}
+
+#[test]
+fn get_config_int_from_env() { // an integer option set through its environment variable is parsed and read back with `get_u64`
+ let config_key = "datafusion.execution.batch_size";
+ let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE"; // `config_key` upper-cased, with '.' replaced by '_'
+ env::set_var(env_key, "4096");
+ let config = ConfigOptions::from_env();
+ env::remove_var(env_key); // removed before the assertion, so a failing assert does not leave the variable set
+ assert_eq!(config.get_u64(config_key), 4096);
+}
+
+#[test]
+fn get_config_int_from_env_invalid() { // an unparseable value must not break construction; the option falls back to its default
+ let config_key = "datafusion.execution.coalesce_target_batch_size";
+ let env_key = "DATAFUSION_EXECUTION_COALESCE_TARGET_BATCH_SIZE"; // `config_key` upper-cased, with '.' replaced by '_'
+ env::set_var(env_key, "abc");
+ let config = ConfigOptions::from_env();
+ env::remove_var(env_key); // removed before the assertion, so a failing assert does not leave the variable set
+ assert_eq!(config.get_u64(config_key), 4096); // "abc" cannot be parsed to the option's type, so the option keeps its default value
+}
diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index 9e955229c..836ba6772 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -57,6 +57,14 @@ Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src
The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
+For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
+To construct a session with options from the environment, use `SessionConfig::from_env`.
+The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
+For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
+Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
+If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
+Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
+
EOF
echo "Running CLI and inserting docs table"
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 1fc0b5fa8..c899bafdb 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -27,6 +27,14 @@ Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src
The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
+For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
+To construct a session with options from the environment, use `SessionConfig::from_env`.
+The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
+For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
+Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
+If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
+Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
+
| key | type | default | description |
| ----------------------------------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| datafusion.execution.batch_size | UInt64 | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption. |