You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2015/02/25 03:59:28 UTC
spark git commit: [SPARK-5985][SQL] DataFrame sortBy -> orderBy in
Python.
Repository: spark
Updated Branches:
refs/heads/master 53a1ebf33 -> fba11c2f5
[SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python.
Also added desc/asc function for constructing sorting expressions more conveniently. And added a small fix to lift alias out of cast expression.
Author: Reynold Xin <rx...@databricks.com>
Closes #4752 from rxin/SPARK-5985 and squashes the following commits:
aeda5ae [Reynold Xin] Added Experimental flag to ColumnName.
047ad03 [Reynold Xin] Lift alias out of cast.
c9cf17c [Reynold Xin] [SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fba11c2f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fba11c2f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fba11c2f
Branch: refs/heads/master
Commit: fba11c2f55dd81e4f6230e7edca3c7b2e01ccd9d
Parents: 53a1ebf
Author: Reynold Xin <rx...@databricks.com>
Authored: Tue Feb 24 18:59:23 2015 -0800
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Tue Feb 24 18:59:23 2015 -0800
----------------------------------------------------------------------
python/pyspark/sql/dataframe.py | 11 ++++++--
python/pyspark/sql/functions.py | 3 ++
.../scala/org/apache/spark/sql/Column.scala | 13 +++++++--
.../scala/org/apache/spark/sql/functions.scala | 29 ++++++++++++++++++++
.../spark/sql/ColumnExpressionSuite.scala | 4 +++
.../org/apache/spark/sql/DataFrameSuite.scala | 4 +++
6 files changed, 59 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 010c38f..6f746d1 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -504,13 +504,18 @@ class DataFrame(object):
return DataFrame(jdf, self.sql_ctx)
def sort(self, *cols):
- """ Return a new :class:`DataFrame` sorted by the specified column.
+ """ Return a new :class:`DataFrame` sorted by the specified column(s).
:param cols: The columns or expressions used for sorting
>>> df.sort(df.age.desc()).collect()
[Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
- >>> df.sortBy(df.age.desc()).collect()
+ >>> df.orderBy(df.age.desc()).collect()
+ [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ >>> from pyspark.sql.functions import *
+ >>> df.sort(asc("age")).collect()
+ [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ >>> df.orderBy(desc("age"), "name").collect()
[Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
"""
if not cols:
@@ -520,7 +525,7 @@ class DataFrame(object):
jdf = self._jdf.sort(self._sc._jvm.PythonUtils.toSeq(jcols))
return DataFrame(jdf, self.sql_ctx)
- sortBy = sort
+ orderBy = sort
def head(self, n=None):
""" Return the first `n` rows or the first row if n is None.
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fc61162..8aa4476 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -48,6 +48,9 @@ _functions = {
'lit': 'Creates a :class:`Column` of literal value.',
'col': 'Returns a :class:`Column` based on the given column name.',
'column': 'Returns a :class:`Column` based on the given column name.',
+ 'asc': 'Returns a sort expression based on the ascending order of the given column name.',
+ 'desc': 'Returns a sort expression based on the descending order of the given column name.',
+
'upper': 'Converts a string expression to upper case.',
'lower': 'Converts a string expression to lower case.',
'sqrt': 'Computes the square root of the specified float value.',
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 9807543..a2cc9a9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -600,7 +600,11 @@ class Column(protected[sql] val expr: Expression) {
*
* @group expr_ops
*/
- def cast(to: DataType): Column = Cast(expr, to)
+ def cast(to: DataType): Column = expr match {
+ // Lift alias out of cast so we can support col.as("name").cast(IntegerType)
+ case Alias(childExpr, name) => Alias(Cast(childExpr, to), name)()
+ case _ => Cast(expr, to)
+ }
/**
* Casts the column to a different data type, using the canonical string representation
@@ -613,7 +617,7 @@ class Column(protected[sql] val expr: Expression) {
*
* @group expr_ops
*/
- def cast(to: String): Column = Cast(expr, to.toLowerCase match {
+ def cast(to: String): Column = cast(to.toLowerCase match {
case "string" | "str" => StringType
case "boolean" => BooleanType
case "byte" => ByteType
@@ -671,6 +675,11 @@ class Column(protected[sql] val expr: Expression) {
}
+/**
+ * :: Experimental ::
+ * A convenient class used for constructing schemas.
+ */
+@Experimental
class ColumnName(name: String) extends Column(name) {
/** Creates a new AttributeReference of type boolean */
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 2a1e086..4fdbfc6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.types._
*
* @groupname udf_funcs UDF functions
* @groupname agg_funcs Aggregate functions
+ * @groupname sort_funcs Sorting functions
* @groupname normal_funcs Non-aggregate functions
* @groupname Ungrouped Support functions for DataFrames.
*/
@@ -96,6 +97,33 @@ object functions {
}
//////////////////////////////////////////////////////////////////////////////////////////////
+ // Sort functions
+ //////////////////////////////////////////////////////////////////////////////////////////////
+
+ /**
+ * Returns a sort expression based on the ascending order of the column.
+ * {{{
+ * // Sort by dept in ascending order, and then age in descending order.
+ * df.sort(asc("dept"), desc("age"))
+ * }}}
+ *
+ * @group sort_funcs
+ */
+ def asc(columnName: String): Column = Column(columnName).asc
+
+ /**
+ * Returns a sort expression based on the descending order of the column.
+ * {{{
+ * // Sort by dept in ascending order, and then age in descending order.
+ * df.sort(asc("dept"), desc("age"))
+ * }}}
+ *
+ * @group sort_funcs
+ */
+ def desc(columnName: String): Column = Column(columnName).desc
+
+ //////////////////////////////////////////////////////////////////////////////////////////////
+ // Aggregate functions
//////////////////////////////////////////////////////////////////////////////////////////////
/**
@@ -263,6 +291,7 @@ object functions {
def max(columnName: String): Column = max(Column(columnName))
//////////////////////////////////////////////////////////////////////////////////////////////
+ // Non-aggregate functions
//////////////////////////////////////////////////////////////////////////////////////////////
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 928b0de..37c02aa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -309,4 +309,8 @@ class ColumnExpressionSuite extends QueryTest {
(1 to 100).map(n => Row(null))
)
}
+
+ test("lift alias out of cast") {
+ assert(col("1234").as("name").cast("int").expr === col("1234").cast("int").as("name").expr)
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/fba11c2f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 30e77e4..c392a55 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -240,6 +240,10 @@ class DataFrameSuite extends QueryTest {
Seq(Row(1,1), Row(1,2), Row(2,1), Row(2,2), Row(3,1), Row(3,2)))
checkAnswer(
+ testData2.orderBy(asc("a"), desc("b")),
+ Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1)))
+
+ checkAnswer(
testData2.orderBy('a.asc, 'b.desc),
Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1)))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org