Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/10 00:18:25 UTC

[spark] branch master updated: [SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 934b74d226a [SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark
934b74d226a is described below

commit 934b74d226a3d856330709a7ee3e8e68a71b736e
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Wed Aug 10 09:18:15 2022 +0900

    [SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark
    
    ### What changes were proposed in this pull request?
    Add `sc.listArchives` and `sc.listFiles` to PySpark (a short usage sketch is included after this commit message).
    
    ### Why are the changes needed?
    For API parity with the Scala `SparkContext`, which already exposes `listFiles` and `listArchives`.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, this adds two new user-facing properties: `SparkContext.listFiles` and `SparkContext.listArchives`.
    
    ### How was this patch tested?
    Added doctests to the `addFile` and `addArchive` docstrings.
    
    Closes #37445 from zhengruifeng/py_add_files.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
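
As a quick illustration of the file-side property introduced here, a minimal usage sketch (hypothetical file names; assumes a local SparkContext on a build that includes this patch, i.e. 3.4.0+, and the exact file: URIs in the output depend on the environment):

    import os
    import tempfile
    from pyspark import SparkContext

    sc = SparkContext(master="local[1]", appName="list-files-demo")

    # Create a small temporary file and register it with the context.
    path = os.path.join(tempfile.mkdtemp(), "data.txt")
    with open(path, "w") as f:
        _ = f.write("100")
    sc.addFile(path)

    # listFiles returns the file: URIs of everything added so far,
    # e.g. ['file:/.../data.txt'].
    print(sc.listFiles)

    sc.stop()

Both new properties return plain Python lists, so they can be sorted or filtered directly, as the sorted(sc.listFiles) call in the doctest below does.
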
---
 python/docs/source/reference/pyspark.rst |  2 ++
 python/pyspark/context.py                | 50 ++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/python/docs/source/reference/pyspark.rst b/python/docs/source/reference/pyspark.rst
index f0997255bb9..c3afae10ddb 100644
--- a/python/docs/source/reference/pyspark.rst
+++ b/python/docs/source/reference/pyspark.rst
@@ -72,6 +72,8 @@ Spark Context APIs
     SparkContext.getOrCreate
     SparkContext.hadoopFile
     SparkContext.hadoopRDD
+    SparkContext.listArchives
+    SparkContext.listFiles
     SparkContext.newAPIHadoopFile
     SparkContext.newAPIHadoopRDD
     SparkContext.parallelize
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index adfdea51c9c..032efaef492 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -1249,9 +1249,33 @@ class SparkContext:
         ...        return [x * fileVal for x in iterator]
         >>> sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
         [100, 200, 300, 400]
+        >>> sc.listFiles
+        ['file:/.../test.txt']
+        >>> path2 = os.path.join(tempdir, "test2.txt")
+        >>> with open(path2, "w") as testFile:
+        ...    _ = testFile.write("100")
+        >>> sc.addFile(path2)
+        >>> sorted(sc.listFiles)
+        ['file:/.../test.txt', 'file:/.../test2.txt']
         """
         self._jsc.sc().addFile(path, recursive)
 
+    @property
+    def listFiles(self) -> List[str]:
+        """Returns a list of file paths that are added to resources.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        SparkContext.addFile
+        """
+        return list(
+            self._jvm.scala.collection.JavaConverters.seqAsJavaList(  # type: ignore[union-attr]
+                self._jsc.sc().listFiles()
+            )
+        )
+
     def addPyFile(self, path: str) -> None:
         """
         Add a .py or .zip dependency for all tasks to be executed on this
@@ -1304,6 +1328,16 @@ class SparkContext:
         ...         _ = f.write("100")
         ...     zipped.write(path, os.path.basename(path))
         >>> sc.addArchive(zip_path)
+        >>> sc.listArchives
+        ['file:/.../test.zip']
+        >>> zip_path2 = os.path.join(tempdir, "test2.zip")
+        >>> with zipfile.ZipFile(zip_path2, "w", zipfile.ZIP_DEFLATED) as zipped:
+        ...     with open(path, "w") as f:
+        ...         _ = f.write("100")
+        ...     zipped.write(path, os.path.basename(path))
+        >>> sc.addArchive(zip_path2)
+        >>> sorted(sc.listArchives)
+        ['file:/.../test.zip', 'file:/.../test2.zip']
 
         Reads the '100' as an integer in the zipped file, and processes
         it with the data in the RDD.
@@ -1317,6 +1351,22 @@ class SparkContext:
         """
         self._jsc.sc().addArchive(path)
 
+    @property
+    def listArchives(self) -> List[str]:
+        """Returns a list of archive paths that are added to resources.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        SparkContext.addArchive
+        """
+        return list(
+            self._jvm.scala.collection.JavaConverters.seqAsJavaList(  # type: ignore[union-attr]
+                self._jsc.sc().listArchives()
+            )
+        )
+
     def setCheckpointDir(self, dirName: str) -> None:
         """
         Set the directory under which RDDs are going to be checkpointed. The
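
For completeness, a similar sketch for the archive-side property added in the same patch, mirroring the addArchive doctest above (hypothetical paths; again assumes a local SparkContext on 3.4.0+):

    import os
    import tempfile
    import zipfile
    from pyspark import SparkContext

    sc = SparkContext(master="local[1]", appName="list-archives-demo")

    # Build a small zip archive and register it with the context.
    tmp_dir = tempfile.mkdtemp()
    txt_path = os.path.join(tmp_dir, "value.txt")
    with open(txt_path, "w") as f:
        _ = f.write("100")
    zip_path = os.path.join(tmp_dir, "value.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipped:
        zipped.write(txt_path, os.path.basename(txt_path))
    sc.addArchive(zip_path)

    # listArchives reports every added archive as a file: URI,
    # e.g. ['file:/.../value.zip'].
    print(sc.listArchives)

    sc.stop()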

