Posted to commits@spark.apache.org by sr...@apache.org on 2019/03/17 17:58:32 UTC

[spark] branch master updated: [SPARK-26979][PYTHON] Add missing string column name support for some SQL functions

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new f9180f8  [SPARK-26979][PYTHON] Add missing string column name support for some SQL functions
f9180f8 is described below

commit f9180f8752b7cddec463faf82cdb09af707ae402
Author: André Sá de Mello <am...@palantir.com>
AuthorDate: Sun Mar 17 12:58:16 2019 -0500

    [SPARK-26979][PYTHON] Add missing string column name support for some SQL functions
    
    ## What changes were proposed in this pull request?
    
    Most SQL functions defined in `spark.sql.functions` have two calling patterns: one that takes a Column object as input, and another that takes a string representing a column name, which is then converted into a Column object internally.
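    
    For illustration (not part of this patch), the two patterns look like this, assuming a hypothetical DataFrame `df` with a numeric column `value`:
    
    ```python
    from pyspark.sql.functions import col, sqrt
    
    # Pattern 1: pass a Column object.
    df.select(sqrt(col("value")))
    
    # Pattern 2: pass a column name; it is converted to a Column internally.
    df.select(sqrt("value"))
    ```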
    
    There are, however, a few notable exceptions:
    
    - lower()
    - upper()
    - abs()
    - bitwiseNOT()
    - ltrim()
    - rtrim()
    - trim()
    - ascii()
    - base64()
    - unbase64()
    
    While this doesn't break anything, as you can easily create a Column object yourself before passing it to one of these functions (see the sketch after this list), it has two undesirable consequences:
    
    1. It is surprising: it breaks coders' expectations when they are first starting with Spark. Every API should be as consistent as possible, so as to make the learning curve smoother and to reduce causes for human error;
    
    2. It gets in the way of stylistic conventions. Most of the time it makes Python code more readable to use literal column names, and the API provides ample support for that, but these few exceptions prevent the pattern from being applied universally.
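    
    As a concrete sketch of the workaround (assuming a hypothetical DataFrame `df` with a string column `name`):
    
    ```python
    from pyspark.sql.functions import col, lower
    
    # Works: wrap the name in a Column object explicitly.
    df.select(lower(col("name")))
    
    # Fails before this patch: lower() does not accept a bare column name.
    df.select(lower("name"))
    ```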
    
    This patch is meant to fix the aforementioned problem.
    
    ### Effect
    
    This patch **enables** support for passing column names as input to those functions mentioned above.
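    
    For example, with this patch applied, the string form works for the listed functions (same hypothetical `df` with a string column `name`):
    
    ```python
    from pyspark.sql.functions import ascii, lower, trim
    
    # All of these now accept a column name directly.
    df.select(lower("name"), trim("name"), ascii("name"))
    ```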
    
    ### Side effects
    
    This PR also **fixes** an issue where some functions (`initcap()`, `lower()`, `upper()`) were defined twice via `_create_function()`, with the later definition silently overriding the earlier one.
    
    ### How it works
    
    `_create_function()` was redefined to always convert its argument to a Column object. The old implementation has been kept under `_create_name_function()`, and is still used to generate the following special functions:
    
    - lit()
    - col()
    - column()
    - asc()
    - desc()
    - asc_nulls_first()
    - asc_nulls_last()
    - desc_nulls_first()
    - desc_nulls_last()
    
    This is because these functions take a column name (or, in the case of `lit()`, a literal value) rather than a Column expression as their argument. This is not a problem, as their semantics require it.
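    
    For example, `asc()` and `desc()` build sort expressions from a column name, so the string is the argument itself rather than shorthand for a Column (hypothetical `df` assumed):
    
    ```python
    from pyspark.sql.functions import asc, desc
    
    # The name-based functions keep their original behavior.
    df.orderBy(asc("age"), desc("name"))
    ```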
    
    ## How was this patch tested?
    
    Ran ./dev/run-tests and tested it manually.
    
    Closes #23882 from asmello/col-name-support-pyspark.
    
    Authored-by: André Sá de Mello <am...@palantir.com>
    Signed-off-by: Sean Owen <se...@databricks.com>
---
 python/pyspark/sql/functions.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index a36423e..3ee485c 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -37,8 +37,8 @@ from pyspark.sql.types import StringType, DataType
 from pyspark.sql.udf import UserDefinedFunction, _create_udf
 
 
-def _create_function(name, doc=""):
-    """ Create a function for aggregator by name"""
+def _create_name_function(name, doc=""):
+    """ Create a function that takes a column name argument, by name"""
     def _(col):
         sc = SparkContext._active_spark_context
         jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col)
@@ -48,6 +48,17 @@ def _create_function(name, doc=""):
     return _
 
 
+def _create_function(name, doc=""):
+    """ Create a function that takes a Column object, by name"""
+    def _(col):
+        sc = SparkContext._active_spark_context
+        jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
+        return Column(jc)
+    _.__name__ = name
+    _.__doc__ = doc
+    return _
+
+
 def _wrap_deprecated_function(func, message):
     """ Wrap the deprecated function to print out deprecation warnings"""
     def _(col):
@@ -85,13 +96,16 @@ _lit_doc = """
     >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1)
     [Row(height=5, spark_user=True)]
     """
-_functions = {
+_name_functions = {
+    # name functions take a column name as their argument
     'lit': _lit_doc,
     'col': 'Returns a :class:`Column` based on the given column name.',
     'column': 'Returns a :class:`Column` based on the given column name.',
     'asc': 'Returns a sort expression based on the ascending order of the given column name.',
     'desc': 'Returns a sort expression based on the descending order of the given column name.',
+}
 
+_functions = {
     'upper': 'Converts a string expression to upper case.',
     'lower': 'Converts a string expression to lower case.',
     'sqrt': 'Computes the square root of the specified float value.',
@@ -141,7 +155,7 @@ _functions_1_4 = {
     'bitwiseNOT': 'Computes bitwise not.',
 }
 
-_functions_2_4 = {
+_name_functions_2_4 = {
     'asc_nulls_first': 'Returns a sort expression based on the ascending order of the given' +
                        ' column name, and null values return before non-null values.',
     'asc_nulls_last': 'Returns a sort expression based on the ascending order of the given' +
@@ -254,6 +268,8 @@ _window_functions = {
 _functions_deprecated = {
 }
 
+for _name, _doc in _name_functions.items():
+    globals()[_name] = since(1.3)(_create_name_function(_name, _doc))
 for _name, _doc in _functions.items():
     globals()[_name] = since(1.3)(_create_function(_name, _doc))
 for _name, _doc in _functions_1_4.items():
@@ -268,8 +284,8 @@ for _name, _doc in _functions_2_1.items():
     globals()[_name] = since(2.1)(_create_function(_name, _doc))
 for _name, _message in _functions_deprecated.items():
     globals()[_name] = _wrap_deprecated_function(globals()[_name], _message)
-for _name, _doc in _functions_2_4.items():
-    globals()[_name] = since(2.4)(_create_function(_name, _doc))
+for _name, _doc in _name_functions_2_4.items():
+    globals()[_name] = since(2.4)(_create_name_function(_name, _doc))
 del _name, _doc
 
 
@@ -1437,10 +1453,6 @@ _string_functions = {
     'ascii': 'Computes the numeric value of the first character of the string column.',
     'base64': 'Computes the BASE64 encoding of a binary column and returns it as a string column.',
     'unbase64': 'Decodes a BASE64 encoded string column and returns it as a binary column.',
-    'initcap': 'Returns a new string column by converting the first letter of each word to ' +
-               'uppercase. Words are delimited by whitespace.',
-    'lower': 'Converts a string column to lower case.',
-    'upper': 'Converts a string column to upper case.',
     'ltrim': 'Trim the spaces from left end for the specified string value.',
     'rtrim': 'Trim the spaces from right end for the specified string value.',
     'trim': 'Trim the spaces from both ends for the specified string column.',

