You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/04/06 05:52:14 UTC
git commit: SPARK-1421. Make MLlib work on Python 2.6

Repository: spark
Updated Branches:
  refs/heads/master 890d63bd4 -> 0b8551678


SPARK-1421. Make MLlib work on Python 2.6

The reason it wasn't working was passing a bytearray to stream.write(), which is not supported in Python 2.6 but is in 2.7. (This array came from NumPy when we converted data to send it over to Java). Now we just convert those bytearrays to strings of bytes, which preserves nonprintable characters as well.

Author: Matei Zaharia <ma...@databricks.com>

Closes #335 from mateiz/mllib-python-2.6 and squashes the following commits:

f26c59f [Matei Zaharia] Update docs to no longer say we need Python 2.7
a84d6af [Matei Zaharia] SPARK-1421. Make MLlib work on Python 2.6


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b855167
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b855167
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b855167

Branch: refs/heads/master
Commit: 0b855167818b9afd2d2aa9f617b9861d77b2425d
Parents: 890d63b
Author: Matei Zaharia <ma...@databricks.com>
Authored: Sat Apr 5 20:52:05 2014 -0700
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Sat Apr 5 20:52:05 2014 -0700

----------------------------------------------------------------------
 docs/mllib-guide.md              |  3 +--
 docs/python-programming-guide.md |  2 +-
 python/pyspark/mllib/__init__.py |  6 +-----
 python/pyspark/serializers.py    | 11 ++++++++++-
 4 files changed, 13 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/0b855167/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 203d235..a5e0cc5 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -38,6 +38,5 @@ depends on native Fortran routines. You may need to install the
 if it is not already present on your nodes. MLlib will throw a linking error if it cannot 
 detect these libraries automatically.
 
-To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer
-and Python 2.7.
+To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer.
 

http://git-wip-us.apache.org/repos/asf/spark/blob/0b855167/docs/python-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index cbe7d82..c2e5327 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -152,7 +152,7 @@ Many of the methods also contain [doctests](http://docs.python.org/2/library/doc
 # Libraries
 
 [MLlib](mllib-guide.html) is also available in PySpark. To use it, you'll need
-[NumPy](http://www.numpy.org) version 1.7 or newer, and Python 2.7. The [MLlib guide](mllib-guide.html) contains
+[NumPy](http://www.numpy.org) version 1.7 or newer. The [MLlib guide](mllib-guide.html) contains
 some example applications.
 
 # Where to Go from Here

http://git-wip-us.apache.org/repos/asf/spark/blob/0b855167/python/pyspark/mllib/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index b420d7a..538ff26 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -19,11 +19,7 @@
 Python bindings for MLlib.
 """
 
-# MLlib currently needs Python 2.7+ and NumPy 1.7+, so complain if lower
-
-import sys
-if sys.version_info[0:2] < (2, 7):
-    raise Exception("MLlib requires Python 2.7+")
+# MLlib currently needs and NumPy 1.7+, so complain if lower
 
 import numpy
 if numpy.version.version < '1.7':

http://git-wip-us.apache.org/repos/asf/spark/blob/0b855167/python/pyspark/serializers.py
----------------------------------------------------------------------
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 4d80292..b253807 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -64,6 +64,7 @@ import cPickle
 from itertools import chain, izip, product
 import marshal
 import struct
+import sys
 from pyspark import cloudpickle
 
 
@@ -113,6 +114,11 @@ class FramedSerializer(Serializer):
     where C{length} is a 32-bit integer and data is C{length} bytes.
     """
 
+    def __init__(self):
+        # On Python 2.6, we can't write bytearrays to streams, so we need to convert them
+        # to strings first. Check if the version number is that old.
+        self._only_write_strings = sys.version_info[0:2] <= (2, 6)
+
     def dump_stream(self, iterator, stream):
         for obj in iterator:
             self._write_with_length(obj, stream)
@@ -127,7 +133,10 @@ class FramedSerializer(Serializer):
     def _write_with_length(self, obj, stream):
         serialized = self.dumps(obj)
         write_int(len(serialized), stream)
-        stream.write(serialized)
+        if self._only_write_strings:
+            stream.write(str(serialized))
+        else:
+            stream.write(serialized)
 
     def _read_with_length(self, stream):
         length = read_int(stream)