You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2013/12/03 23:21:44 UTC

[1/2] git commit: Fix UnicodeEncodeError in PySpark saveAsTextFile().

Updated Branches:
  refs/heads/master 58d9bbcfe -> 8a3475aed


Fix UnicodeEncodeError in PySpark saveAsTextFile().

Fixes SPARK-970.

Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/3787f514
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/3787f514
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/3787f514

Branch: refs/heads/master
Commit: 3787f514d9a8e45d2c257b4696e30bc1a1935748
Parents: 743a31a
Author: Josh Rosen <jo...@apache.org>
Authored: Thu Nov 28 23:44:56 2013 -0800
Committer: Josh Rosen <jo...@apache.org>
Committed: Thu Nov 28 23:44:56 2013 -0800

----------------------------------------------------------------------
 python/pyspark/rdd.py   |  5 ++++-
 python/pyspark/tests.py | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/3787f514/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 957f3f8..d8da020 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -605,7 +605,10 @@ class RDD(object):
         '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'
         """
         def func(split, iterator):
-            return (str(x).encode("utf-8") for x in iterator)
+            for x in iterator:
+                if not isinstance(x, basestring):
+                    x = unicode(x)
+                yield x.encode("utf-8")
         keyed = PipelinedRDD(self, func)
         keyed._bypass_serializer = True
         keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)

http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/3787f514/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 621e1cb..3987642 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -19,6 +19,8 @@
 Unit tests for PySpark; additional tests are implemented as doctests in
 individual modules.
 """
+from fileinput import input
+from glob import glob
 import os
 import shutil
 import sys
@@ -138,6 +140,19 @@ class TestAddFile(PySparkTestCase):
         self.assertEqual("Hello World from inside a package!", UserClass().hello())
 
 
+class TestRDDFunctions(PySparkTestCase):
+
+    def test_save_as_textfile_with_unicode(self):
+        # Regression test for SPARK-970
+        x = u"\u00A1Hola, mundo!"
+        data = self.sc.parallelize([x])
+        tempFile = NamedTemporaryFile(delete=True)
+        tempFile.close()
+        data.saveAsTextFile(tempFile.name)
+        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
+
 class TestIO(PySparkTestCase):
 
     def test_stdout_redirection(self):


[2/2] git commit: Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error

Posted by rx...@apache.org.
Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error

Fix UnicodeEncodeError in PySpark saveAsTextFile() (SPARK-970)

This fixes [SPARK-970](https://spark-project.atlassian.net/browse/SPARK-970), an issue where PySpark's saveAsTextFile() could throw UnicodeEncodeError when called on an RDD of Unicode strings.

Please merge this into master and branch-0.8.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/8a3475ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/8a3475ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/8a3475ae

Branch: refs/heads/master
Commit: 8a3475aed66617772f4e98e9f774b109756eb391
Parents: 58d9bbc 3787f51
Author: Reynold Xin <rx...@apache.org>
Authored: Tue Dec 3 14:21:40 2013 -0800
Committer: Reynold Xin <rx...@apache.org>
Committed: Tue Dec 3 14:21:40 2013 -0800

----------------------------------------------------------------------
 python/pyspark/rdd.py   |  5 ++++-
 python/pyspark/tests.py | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
----------------------------------------------------------------------