You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/10 00:18:25 UTC
[spark] branch master updated: [SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 934b74d226a [SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark
934b74d226a is described below
commit 934b74d226a3d856330709a7ee3e8e68a71b736e
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Wed Aug 10 09:18:15 2022 +0900
[SPARK-40015][PYTHON] Add sc.listArchives and sc.listFiles to PySpark
### What changes were proposed in this pull request?
Add `sc.listArchives` and `sc.listFiles` to PySpark
### Why are the changes needed?
For function parity with the existing Scala/Java `SparkContext` API, which already provides `listFiles` and `listArchives`.
### Does this PR introduce _any_ user-facing change?
Yes, this adds two new user-facing APIs: `SparkContext.listFiles` and `SparkContext.listArchives`.
### How was this patch tested?
Added doctests covering both new properties.
Closes #37445 from zhengruifeng/py_add_files.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/docs/source/reference/pyspark.rst | 2 ++
python/pyspark/context.py | 50 ++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/python/docs/source/reference/pyspark.rst b/python/docs/source/reference/pyspark.rst
index f0997255bb9..c3afae10ddb 100644
--- a/python/docs/source/reference/pyspark.rst
+++ b/python/docs/source/reference/pyspark.rst
@@ -72,6 +72,8 @@ Spark Context APIs
SparkContext.getOrCreate
SparkContext.hadoopFile
SparkContext.hadoopRDD
+ SparkContext.listArchives
+ SparkContext.listFiles
SparkContext.newAPIHadoopFile
SparkContext.newAPIHadoopRDD
SparkContext.parallelize
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index adfdea51c9c..032efaef492 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -1249,9 +1249,33 @@ class SparkContext:
... return [x * fileVal for x in iterator]
>>> sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
[100, 200, 300, 400]
+ >>> sc.listFiles
+ ['file:/.../test.txt']
+ >>> path2 = os.path.join(tempdir, "test2.txt")
+ >>> with open(path2, "w") as testFile:
+ ... _ = testFile.write("100")
+ >>> sc.addFile(path2)
+ >>> sorted(sc.listFiles)
+ ['file:/.../test.txt', 'file:/.../test2.txt']
"""
self._jsc.sc().addFile(path, recursive)
+ @property
+ def listFiles(self) -> List[str]:
+ """Returns a list of file paths that are added to resources.
+
+ .. versionadded:: 3.4.0
+
+ See Also
+ --------
+ SparkContext.addFile
+ """
+ return list(
+ self._jvm.scala.collection.JavaConverters.seqAsJavaList( # type: ignore[union-attr]
+ self._jsc.sc().listFiles()
+ )
+ )
+
def addPyFile(self, path: str) -> None:
"""
Add a .py or .zip dependency for all tasks to be executed on this
@@ -1304,6 +1328,16 @@ class SparkContext:
... _ = f.write("100")
... zipped.write(path, os.path.basename(path))
>>> sc.addArchive(zip_path)
+ >>> sc.listArchives
+ ['file:/.../test.zip']
+ >>> zip_path2 = os.path.join(tempdir, "test2.zip")
+ >>> with zipfile.ZipFile(zip_path2, "w", zipfile.ZIP_DEFLATED) as zipped:
+ ... with open(path, "w") as f:
+ ... _ = f.write("100")
+ ... zipped.write(path, os.path.basename(path))
+ >>> sc.addArchive(zip_path2)
+ >>> sorted(sc.listArchives)
+ ['file:/.../test.zip', 'file:/.../test2.zip']
Reads the '100' as an integer in the zipped file, and processes
it with the data in the RDD.
@@ -1317,6 +1351,22 @@ class SparkContext:
"""
self._jsc.sc().addArchive(path)
+ @property
+ def listArchives(self) -> List[str]:
+ """Returns a list of archive paths that are added to resources.
+
+ .. versionadded:: 3.4.0
+
+ See Also
+ --------
+ SparkContext.addArchive
+ """
+ return list(
+ self._jvm.scala.collection.JavaConverters.seqAsJavaList( # type: ignore[union-attr]
+ self._jsc.sc().listArchives()
+ )
+ )
+
def setCheckpointDir(self, dirName: str) -> None:
"""
Set the directory under which RDDs are going to be checkpointed. The
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org