You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yh...@apache.org on 2015/10/28 22:28:49 UTC

spark git commit: [SPARK-11292] [SQL] Python API for text data source

Repository: spark
Updated Branches:
  refs/heads/master 032748bb9 -> 5aa052191


[SPARK-11292] [SQL] Python API for text data source

Adds DataFrameReader.text and DataFrameWriter.text.

Author: Reynold Xin <rx...@databricks.com>

Closes #9259 from rxin/SPARK-11292.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5aa05219
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5aa05219
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5aa05219

Branch: refs/heads/master
Commit: 5aa05219118e3d3525fb703a4716ae8e04f3da72
Parents: 032748b
Author: Reynold Xin <rx...@databricks.com>
Authored: Wed Oct 28 14:28:38 2015 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Wed Oct 28 14:28:38 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/readwriter.py      | 27 +++++++++++++++++++++++++--
 python/test_support/sql/text-test.txt |  2 ++
 2 files changed, 27 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5aa05219/python/pyspark/sql/readwriter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 93832d4..97bd90c 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -23,6 +23,7 @@ if sys.version >= '3':
 from py4j.java_gateway import JavaClass
 
 from pyspark import RDD, since
+from pyspark.rdd import ignore_unicode_prefix
 from pyspark.sql.column import _to_seq
 from pyspark.sql.types import *
 
@@ -193,10 +194,22 @@ class DataFrameReader(object):
         """
         return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
 
+    @ignore_unicode_prefix
+    @since(1.6)
+    def text(self, path):
+        """Loads a text file into a :class:`DataFrame` with a single string column named "text".
+
+        Each line in the text file is a new row in the resulting DataFrame.
+
+        >>> df = sqlContext.read.text('python/test_support/sql/text-test.txt')
+        >>> df.collect()
+        [Row(text=u'hello'), Row(text=u'this')]
+        """
+        return self._df(self._jreader.text(path))
+
     @since(1.5)
     def orc(self, path):
-        """
-        Loads an ORC file, returning the result as a :class:`DataFrame`.
+        """Loads an ORC file, returning the result as a :class:`DataFrame`.
 
         ::Note: Currently ORC support is only available together with
         :class:`HiveContext`.
@@ -432,6 +445,16 @@ class DataFrameWriter(object):
             self.partitionBy(partitionBy)
         self._jwrite.parquet(path)
 
+    @since(1.6)
+    def text(self, path):
+        """Saves the content of the :class:`DataFrame` in a text file at the specified path.
+
+        The DataFrame must have only one column that is of string type.
+        Each row becomes a new line in the output file.
+        """
+        self._jwrite.text(path)
+
+    @since(1.5)
     def orc(self, path, mode=None, partitionBy=None):
         """Saves the content of the :class:`DataFrame` in ORC format at the specified path.
 

http://git-wip-us.apache.org/repos/asf/spark/blob/5aa05219/python/test_support/sql/text-test.txt
----------------------------------------------------------------------
diff --git a/python/test_support/sql/text-test.txt b/python/test_support/sql/text-test.txt
new file mode 100644
index 0000000..ae1e76c
--- /dev/null
+++ b/python/test_support/sql/text-test.txt
@@ -0,0 +1,2 @@
+hello
+this
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org