You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2013/12/03 23:22:19 UTC
git commit: Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error
Updated Branches:
refs/heads/branch-0.8 8b091febd -> daaaee175
Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error
Fix UnicodeEncodeError in PySpark saveAsTextFile() (SPARK-970)
This fixes [SPARK-970](https://spark-project.atlassian.net/browse/SPARK-970), an issue where PySpark's saveAsTextFile() could throw UnicodeEncodeError when called on an RDD of Unicode strings.
Please merge this into master and branch-0.8.
(cherry picked from commit 8a3475aed66617772f4e98e9f774b109756eb391)
Signed-off-by: Reynold Xin <rx...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/daaaee17
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/daaaee17
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/daaaee17
Branch: refs/heads/branch-0.8
Commit: daaaee175a6c07115c8eef85611f3717123c43b3
Parents: 8b091fe
Author: Reynold Xin <rx...@apache.org>
Authored: Tue Dec 3 14:21:40 2013 -0800
Committer: Reynold Xin <rx...@apache.org>
Committed: Tue Dec 3 14:22:05 2013 -0800
----------------------------------------------------------------------
python/pyspark/rdd.py | 5 ++++-
python/pyspark/tests.py | 15 +++++++++++++++
2 files changed, 19 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/daaaee17/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 7019fb8..0c599e0 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -598,7 +598,10 @@ class RDD(object):
'0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'
"""
def func(split, iterator):
- return (str(x).encode("utf-8") for x in iterator)
+ for x in iterator:
+ if not isinstance(x, basestring):
+ x = unicode(x)
+ yield x.encode("utf-8")
keyed = PipelinedRDD(self, func)
keyed._bypass_serializer = True
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/daaaee17/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 29d6a12..d3f6c2b 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -19,6 +19,8 @@
Unit tests for PySpark; additional tests are implemented as doctests in
individual modules.
"""
+from fileinput import input
+from glob import glob
import os
import shutil
import sys
@@ -137,6 +139,19 @@ class TestAddFile(PySparkTestCase):
self.assertEqual("Hello World from inside a package!", UserClass().hello())
+class TestRDDFunctions(PySparkTestCase):
+
+ def test_save_as_textfile_with_unicode(self):
+ # Regression test for SPARK-970
+ x = u"\u00A1Hola, mundo!"
+ data = self.sc.parallelize([x])
+ tempFile = NamedTemporaryFile(delete=True)
+ tempFile.close()
+ data.saveAsTextFile(tempFile.name)
+ raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+ self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
+
class TestIO(PySparkTestCase):
def test_stdout_redirection(self):