You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/07/01 20:20:11 UTC

[arrow-datafusion] branch master updated: Allow setting of config options via environment variables (#2812)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 88b88d436 Allow setting of config options via environment variables (#2812)
88b88d436 is described below

commit 88b88d4360054a85982987aa07b3f3afd2db7d70
Author: Mike Roberts <42...@users.noreply.github.com>
AuthorDate: Fri Jul 1 21:20:06 2022 +0100

    Allow setting of config options via environment variables (#2812)
    
    * Allow setting of config options via environment variables
    
    * Fix clippy warnings in config.rs
    
    * Remove default value check in config.rs, to avoid interference with get_from_env
    
    * Move try_from_string helper from config.rs to ScalarValue::
    
    * Warn on unparseable environmental config option
    
    * Move test for config from environment to integration tests to avoid interfering with others
    
    * Fix clippy warning
    
    * Add ConfigOptions::from_env
    
    * Use default in SessionConfig::from_env rather than duplicating it
    
    Co-authored-by: Andy Grove <an...@gmail.com>
    
    * Use ..Default::default() in SessionConfig::from_env
    
    Co-authored-by: Andy Grove <an...@gmail.com>
---
 datafusion-cli/src/main.rs               |  2 +-
 datafusion/common/src/scalar.rs          | 10 ++++++-
 datafusion/core/src/config.rs            | 30 +++++++++++++++++++
 datafusion/core/src/execution/context.rs |  8 ++++++
 datafusion/core/tests/config_from_env.rs | 49 ++++++++++++++++++++++++++++++++
 dev/update_config_docs.sh                |  8 ++++++
 docs/source/user-guide/configs.md        |  8 ++++++
 7 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 23d8c74c1..44c5e2ac9 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -92,7 +92,7 @@ pub async fn main() -> Result<()> {
         env::set_current_dir(&p).unwrap();
     };
 
-    let mut session_config = SessionConfig::new().with_information_schema(true);
+    let mut session_config = SessionConfig::from_env().with_information_schema(true);
 
     if let Some(batch_size) = args.batch_size {
         session_config = session_config.with_batch_size(batch_size);
diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs
index f21a6e605..76d94677c 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar.rs
@@ -20,7 +20,7 @@
 use crate::error::{DataFusionError, Result};
 use arrow::{
     array::*,
-    compute::kernels::cast::cast,
+    compute::kernels::cast::{cast, cast_with_options, CastOptions},
     datatypes::{
         ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type,
         Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit,
@@ -1442,6 +1442,14 @@ impl ScalarValue {
         })
     }
 
+    /// Try to parse `value` into a ScalarValue of type `target_type`
+    pub fn try_from_string(value: String, target_type: &DataType) -> Result<Self> {
+        let value = ScalarValue::Utf8(Some(value));
+        let cast_options = CastOptions { safe: false };
+        let cast_arr = cast_with_options(&value.to_array(), target_type, &cast_options)?;
+        ScalarValue::try_from_array(&cast_arr, 0)
+    }
+
     fn eq_array_decimal(
         array: &ArrayRef,
         index: usize,
diff --git a/datafusion/core/src/config.rs b/datafusion/core/src/config.rs
index 9a0570be2..20aa0055b 100644
--- a/datafusion/core/src/config.rs
+++ b/datafusion/core/src/config.rs
@@ -20,7 +20,9 @@
 use arrow::datatypes::DataType;
 use datafusion_common::ScalarValue;
 use itertools::Itertools;
+use log::warn;
 use std::collections::HashMap;
+use std::env;
 
 /// Configuration option "datafusion.optimizer.filter_null_join_keys"
 pub const OPT_FILTER_NULL_JOIN_KEYS: &str = "datafusion.optimizer.filter_null_join_keys";
@@ -186,6 +188,34 @@ impl ConfigOptions {
         Self { options }
     }
 
+    /// Create new ConfigOptions struct, taking values from environment variables where possible.
+    /// For example, setting DATAFUSION_EXECUTION_BATCH_SIZE to control `datafusion.execution.batch_size`.
+    pub fn from_env() -> Self {
+        let mut options = HashMap::new();
+        let built_in = BuiltInConfigs::new();
+        for config_def in &built_in.config_definitions {
+            let config_value = {
+                let mut env_key = config_def.key.replace('.', "_");
+                env_key.make_ascii_uppercase();
+                match env::var(&env_key) {
+                    Ok(value) => match ScalarValue::try_from_string(
+                        value.clone(),
+                        &config_def.data_type,
+                    ) {
+                        Ok(parsed) => parsed,
+                        Err(_) => {
+                            warn!("Warning: could not parse environment variable {}={} to type {}.", env_key, value, config_def.data_type);
+                            config_def.default_value.clone()
+                        }
+                    },
+                    Err(_) => config_def.default_value.clone(),
+                }
+            };
+            options.insert(config_def.key.clone(), config_value);
+        }
+        Self { options }
+    }
+
     /// set a configuration option
     pub fn set(&mut self, key: &str, value: ScalarValue) {
         self.options.insert(key.to_string(), value);
diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs
index 458b91526..b998ca597 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -1041,6 +1041,14 @@ impl SessionConfig {
         Default::default()
     }
 
+    /// Create an execution config with config options read from the environment
+    pub fn from_env() -> Self {
+        Self {
+            config_options: ConfigOptions::from_env(),
+            ..Default::default()
+        }
+    }
+
     /// Set a configuration option
     pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
         self.config_options.set(key, value);
diff --git a/datafusion/core/tests/config_from_env.rs b/datafusion/core/tests/config_from_env.rs
new file mode 100644
index 000000000..8dbbc7b0f
--- /dev/null
+++ b/datafusion/core/tests/config_from_env.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::config::ConfigOptions;
+use std::env;
+
+#[test]
+fn get_config_bool_from_env() {
+    let config_key = "datafusion.optimizer.filter_null_join_keys";
+    let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS";
+    env::set_var(env_key, "true");
+    let config = ConfigOptions::from_env();
+    env::remove_var(env_key);
+    assert!(config.get_bool(config_key));
+}
+
+#[test]
+fn get_config_int_from_env() {
+    let config_key = "datafusion.execution.batch_size";
+    let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";
+    env::set_var(env_key, "4096");
+    let config = ConfigOptions::from_env();
+    env::remove_var(env_key);
+    assert_eq!(config.get_u64(config_key), 4096);
+}
+
+#[test]
+fn get_config_int_from_env_invalid() {
+    let config_key = "datafusion.execution.coalesce_target_batch_size";
+    let env_key = "DATAFUSION_EXECUTION_COALESCE_TARGET_BATCH_SIZE";
+    env::set_var(env_key, "abc");
+    let config = ConfigOptions::from_env();
+    env::remove_var(env_key);
+    assert_eq!(config.get_u64(config_key), 4096); // set to its default value
+}
diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index 9e955229c..836ba6772 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -57,6 +57,14 @@ Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src
 
 The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
 
+For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
+To construct a session with options from the environment, use `SessionConfig::from_env`.
+The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
+For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
+Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
+If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
+Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
+
 EOF
 
 echo "Running CLI and inserting docs table"
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 1fc0b5fa8..c899bafdb 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -27,6 +27,14 @@ Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src
 
 The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
 
+For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
+To construct a session with options from the environment, use `SessionConfig::from_env`.
+The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
+For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
+Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
+If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
+Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
+
 | key                                             | type    | default | description                                                                                                                                                                                                                                                                                                                                                   |
 | ----------------------------------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | datafusion.execution.batch_size                 | UInt64  | 8192    | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption.                                                                                                                                                                         |