You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by an...@apache.org on 2015/06/30 02:21:42 UTC
spark git commit: [SPARK-8437] [DOCS] Using directory path without
wildcard for filename slow for large number of files with wholeTextFiles and
binaryFiles
Repository: spark
Updated Branches:
refs/heads/master fbf75738f -> 5d30eae56
[SPARK-8437] [DOCS] Using directory path without wildcard for filename slow for large number of files with wholeTextFiles and binaryFiles
Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/'
Author: Sean Owen <so...@cloudera.com>
Closes #7036 from srowen/SPARK-8437 and squashes the following commits:
0e813ae [Sean Owen] Note that 'dir/*' can be more efficient in some Hadoop FS implementations than 'dir/'
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d30eae5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d30eae5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d30eae5
Branch: refs/heads/master
Commit: 5d30eae56051c563a8427f330b09ef66db0a0d21
Parents: fbf7573
Author: Sean Owen <so...@cloudera.com>
Authored: Mon Jun 29 17:21:35 2015 -0700
Committer: Andrew Or <an...@databricks.com>
Committed: Mon Jun 29 17:21:35 2015 -0700
----------------------------------------------------------------------
core/src/main/scala/org/apache/spark/SparkContext.scala | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/5d30eae5/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index b3c3bf3..cb7e24c 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -831,6 +831,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
* }}}
*
* @note Small files are preferred; large files are also allowed, but may cause bad performance.
+ * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a directory
+ * than `.../path/` or `.../path`
*
* @param minPartitions A suggestion value of the minimal splitting number for input data.
*/
@@ -878,9 +880,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
* (a-hdfs-path/part-nnnnn, its content)
* }}}
*
- * @param minPartitions A suggestion value of the minimal splitting number for input data.
- *
* @note Small files are preferred; very large files may cause bad performance.
+ * @note On some filesystems, `.../path/*` can be a more efficient way to read all files in a directory
+ * than `.../path/` or `.../path`
+ *
+ * @param minPartitions A suggestion value of the minimal splitting number for input data.
*/
@Experimental
def binaryFiles(
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org