You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by da...@apache.org on 2015/07/02 01:43:27 UTC
spark git commit: [SPARK-8766] support non-ascii character in column
names
Repository: spark
Updated Branches:
refs/heads/master 1ce642890 -> f958f27e2
[SPARK-8766] support non-ascii character in column names
Use UTF-8 to encode column names in Python 2; otherwise, encoding may fail with the default encoding ('ascii').
This PR also fixes a bug that occurs when a Java exception has no error message.
Author: Davies Liu <da...@databricks.com>
Closes #7165 from davies/non_ascii and squashes the following commits:
02cb61a [Davies Liu] fix tests
3b09d31 [Davies Liu] add encoding in header
867754a [Davies Liu] support non-ascii character in column names
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f958f27e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f958f27e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f958f27e
Branch: refs/heads/master
Commit: f958f27e2056f9e380373c2807d8bb5977ecf269
Parents: 1ce6428
Author: Davies Liu <da...@databricks.com>
Authored: Wed Jul 1 16:43:18 2015 -0700
Committer: Davies Liu <da...@databricks.com>
Committed: Wed Jul 1 16:43:18 2015 -0700
----------------------------------------------------------------------
python/pyspark/sql/dataframe.py | 3 +--
python/pyspark/sql/tests.py | 9 +++++++++
python/pyspark/sql/types.py | 2 ++
python/pyspark/sql/utils.py | 6 +++---
4 files changed, 15 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 4b9efa0..273a40d 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -484,13 +484,12 @@ class DataFrame(object):
return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
@property
- @ignore_unicode_prefix
@since(1.3)
def columns(self):
"""Returns all column names as a list.
>>> df.columns
- [u'age', u'name']
+ ['age', 'name']
"""
return [f.name for f in self.schema.fields]
http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 5af2ce0..333378c 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -628,6 +629,14 @@ class SQLTests(ReusedPySparkTestCase):
self.assertRaises(IndexError, lambda: df["bad_key"])
self.assertRaises(TypeError, lambda: df[{}])
+ def test_column_name_with_non_ascii(self):
+ df = self.sqlCtx.createDataFrame([(1,)], ["数量"])
+ self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema)
+ self.assertEqual("DataFrame[数量: bigint]", str(df))
+ self.assertEqual([("数量", 'bigint')], df.dtypes)
+ self.assertEqual(1, df.select("数量").first()[0])
+ self.assertEqual(1, df.select(df["数量"]).first()[0])
+
def test_access_nested_types(self):
df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF()
self.assertEqual(1, df.select(df.l[0]).first()[0])
http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/types.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index ae9344e..160df40 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -324,6 +324,8 @@ class StructField(DataType):
False
"""
assert isinstance(dataType, DataType), "dataType should be DataType"
+ if not isinstance(name, str):
+ name = name.encode('utf-8')
self.name = name
self.dataType = dataType
self.nullable = nullable
http://git-wip-us.apache.org/repos/asf/spark/blob/f958f27e/python/pyspark/sql/utils.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index 8096802..cc5b2c0 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -29,9 +29,9 @@ def capture_sql_exception(f):
try:
return f(*a, **kw)
except py4j.protocol.Py4JJavaError as e:
- cls, msg = e.java_exception.toString().split(': ', 1)
- if cls == 'org.apache.spark.sql.AnalysisException':
- raise AnalysisException(msg)
+ s = e.java_exception.toString()
+ if s.startswith('org.apache.spark.sql.AnalysisException: '):
+ raise AnalysisException(s.split(': ', 1)[1])
raise
return deco
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org