You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2018/01/25 22:50:52 UTC
spark git commit: [SPARK-23081][PYTHON] Add colRegex API to PySpark

Repository: spark
Updated Branches:
  refs/heads/master 8532e26f3 -> 8480c0c57


[SPARK-23081][PYTHON] Add colRegex API to PySpark

## What changes were proposed in this pull request?

Add colRegex API to PySpark

## How was this patch tested?

add a test in sql/tests.py

Author: Huaxin Gao <hu...@us.ibm.com>

Closes #20390 from huaxingao/spark-23081.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8480c0c5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8480c0c5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8480c0c5

Branch: refs/heads/master
Commit: 8480c0c57698b7dcccec5483d67b17cf2c7527ed
Parents: 8532e26
Author: Huaxin Gao <hu...@us.ibm.com>
Authored: Fri Jan 26 07:50:48 2018 +0900
Committer: hyukjinkwon <gu...@gmail.com>
Committed: Fri Jan 26 07:50:48 2018 +0900

----------------------------------------------------------------------
 python/pyspark/sql/dataframe.py                 | 23 ++++++++++++++++++++
 .../scala/org/apache/spark/sql/Dataset.scala    |  8 +++----
 2 files changed, 27 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8480c0c5/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 2d5e9b9..ac40308 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -819,6 +819,29 @@ class DataFrame(object):
         """
         return [f.name for f in self.schema.fields]
 
+    @since(2.3)
+    def colRegex(self, colName):
+        """
+        Selects column based on the column name specified as a regex and returns it
+        as :class:`Column`.
+
+        :param colName: string, column name specified as a regex.
+
+        >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c",  3)], ["Col1", "Col2"])
+        >>> df.select(df.colRegex("`(Col1)?+.+`")).show()
+        +----+
+        |Col2|
+        +----+
+        |   1|
+        |   2|
+        |   3|
+        +----+
+        """
+        if not isinstance(colName, basestring):
+            raise ValueError("colName should be provided as string")
+        jc = self._jdf.colRegex(colName)
+        return Column(jc)
+
     @ignore_unicode_prefix
     @since(1.3)
     def alias(self, alias):

http://git-wip-us.apache.org/repos/asf/spark/blob/8480c0c5/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 912f411..edb6644 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1194,7 +1194,7 @@ class Dataset[T] private[sql](
   def orderBy(sortExprs: Column*): Dataset[T] = sort(sortExprs : _*)
 
   /**
-   * Selects column based on the column name and return it as a [[Column]].
+   * Selects column based on the column name and returns it as a [[Column]].
    *
    * @note The column name can also reference to a nested column like `a.b`.
    *
@@ -1220,7 +1220,7 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * Selects column based on the column name and return it as a [[Column]].
+   * Selects column based on the column name and returns it as a [[Column]].
    *
    * @note The column name can also reference to a nested column like `a.b`.
    *
@@ -1240,7 +1240,7 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * Selects column based on the column name specified as a regex and return it as [[Column]].
+   * Selects column based on the column name specified as a regex and returns it as [[Column]].
    * @group untypedrel
    * @since 2.3.0
    */
@@ -2729,7 +2729,7 @@ class Dataset[T] private[sql](
   }
 
   /**
-   * Return an iterator that contains all rows in this Dataset.
+   * Returns an iterator that contains all rows in this Dataset.
    *
    * The iterator will consume as much memory as the largest partition in this Dataset.
    *


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org