Posted to commits@spark.apache.org by we...@apache.org on 2020/02/05 10:09:21 UTC

[spark] branch branch-3.0 updated: [SPARK-30506][SQL][DOC] Document for generic file source options/configs

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 3ccebcd  [SPARK-30506][SQL][DOC] Document for generic file source options/configs
3ccebcd is described below

commit 3ccebcdd45a0def4853c1175c18d8538602d571d
Author: yi.wu <yi...@databricks.com>
AuthorDate: Wed Feb 5 17:16:38 2020 +0800

    [SPARK-30506][SQL][DOC] Document for generic file source options/configs
    
    ### What changes were proposed in this pull request?
    
    Add a new documentation page named *Generic File Source Options* under the *Data Sources* menu, with the following sub-items:
    
    * spark.sql.files.ignoreCorruptFiles
    * spark.sql.files.ignoreMissingFiles
    * pathGlobFilter
    * recursiveFileLookup
    
    And here are snapshots of the generated document:
    <img width="1080" alt="doc-1" src="https://user-images.githubusercontent.com/16397174/73816825-87a54800-4824-11ea-97da-e5c40c59a7d4.png">
    <img width="1081" alt="doc-2" src="https://user-images.githubusercontent.com/16397174/73816827-8a07a200-4824-11ea-99ec-9c8b0286625e.png">
    <img width="1080" alt="doc-3" src="https://user-images.githubusercontent.com/16397174/73816831-8c69fc00-4824-11ea-84f0-6c9e94c2f0e2.png">
    <img width="1081" alt="doc-4" src="https://user-images.githubusercontent.com/16397174/73816834-8f64ec80-4824-11ea-9355-76ad45476634.png">
    
    ### Why are the changes needed?
    
    Better guidance for end users.
    
    ### Does this PR introduce any user-facing change?
    
    No, this documentation is newly added in Spark 3.0.
    
    ### How was this patch tested?
    
    Pass Jenkins.
    
    Closes #27302 from Ngone51/doc-generic-file-source-option.
    
    Lead-authored-by: yi.wu <yi...@databricks.com>
    Co-authored-by: Yuanjian Li <xy...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 5983ad9cc4481e224a7e094de830ef2e816c1fe6)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 docs/_data/menu-sql.yaml                           |   2 +
 docs/sql-data-sources-avro.md                      |   2 +-
 docs/sql-data-sources-generic-options.md           | 121 +++++++++++++++++++++
 docs/sql-data-sources-load-save-functions.md       |  21 ----
 docs/sql-data-sources.md                           |   5 +
 .../examples/sql/JavaSQLDataSourceExample.java     |  48 +++++++-
 examples/src/main/python/sql/datasource.py         |  48 +++++++-
 examples/src/main/r/RSparkSQLExample.R             |  24 +++-
 .../src/main/resources/dir1/dir2/file2.parquet     | Bin 0 -> 520 bytes
 examples/src/main/resources/dir1/file1.parquet     | Bin 0 -> 520 bytes
 examples/src/main/resources/dir1/file3.json        |   1 +
 .../partitioned_users.orc/do_not_read_this.txt     |   1 -
 .../users.orc                                      | Bin 448 -> 0 bytes
 .../favorite_color=red/users.orc                   | Bin 402 -> 0 bytes
 .../spark/examples/sql/SQLDataSourceExample.scala  |  48 +++++++-
 15 files changed, 282 insertions(+), 39 deletions(-)

diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml
index 3e4db71..241ec39 100644
--- a/docs/_data/menu-sql.yaml
+++ b/docs/_data/menu-sql.yaml
@@ -24,6 +24,8 @@
   subitems:
     - text: "Generic Load/Save Functions"
       url: sql-data-sources-load-save-functions.html
+    - text: "Generic File Source Options"
+      url: sql-data-sources-generic-options.html
     - text: Parquet Files
       url: sql-data-sources-parquet.html
     - text: ORC Files
diff --git a/docs/sql-data-sources-avro.md b/docs/sql-data-sources-avro.md
index b007687..8e6a407 100644
--- a/docs/sql-data-sources-avro.md
+++ b/docs/sql-data-sources-avro.md
@@ -230,7 +230,7 @@ Data source options of Avro can be set via:
   <tr>
     <td><code>ignoreExtension</code></td>
     <td>true</td>
-    <td>The option controls ignoring of files without <code>.avro</code> extensions in read.<br> If the option is enabled, all files (with and without <code>.avro</code> extension) are loaded.<br> The option has been deprecated, and it will be removed in the future releases. Please use the general data source option <code>pathGlobFilter</code> for filtering file names.</td>
+    <td>The option controls ignoring of files without <code>.avro</code> extensions in read.<br> If the option is enabled, all files (with and without <code>.avro</code> extension) are loaded.<br> The option has been deprecated, and it will be removed in the future releases. Please use the general data source option <a href="./sql-data-sources-generic-options.html#path-glob-filter">pathGlobFilter</a> for filtering file names.</td>
     <td>read</td>
   </tr>
   <tr>
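
For Avro sources, the replacement suggested above can be expressed with the generic option directly. A minimal sketch (it assumes the external spark-avro module is on the classpath and uses a hypothetical path):

{% highlight scala %}
// Read only files ending in .avro, replacing the deprecated ignoreExtension option.
val avroDF = spark.read.format("avro")
  .option("pathGlobFilter", "*.avro")
  .load("/data/avro-dir")  // hypothetical path
{% endhighlight %}
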
diff --git a/docs/sql-data-sources-generic-options.md b/docs/sql-data-sources-generic-options.md
new file mode 100644
index 0000000..0cfe2ed
--- /dev/null
+++ b/docs/sql-data-sources-generic-options.md
@@ -0,0 +1,121 @@
+---
+layout: global
+title: Generic File Source Options
+displayTitle: Generic File Source Options
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+* Table of contents
+{:toc}
+
+These generic options/configurations are effective only when using file-based sources: parquet, orc, avro, json, csv, text.
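+
+Note that the entries prefixed with `spark.sql.files.` are session configurations, while `pathGlobFilter` and `recursiveFileLookup` are per-read data source options. As a minimal sketch (the load path below is hypothetical), they can be set as follows:
+
+{% highlight scala %}
+// Session configuration: applies to all subsequent file-based reads.
+spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
+
+// Per-read data source option: applies only to this read.
+val df = spark.read.format("parquet")
+  .option("recursiveFileLookup", "true")
+  .load("/path/to/dir")  // hypothetical path
+{% endhighlight %}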
+
+Please note that the hierarchy of directories used in the examples below is as follows:
+
+{% highlight text %}
+
+dir1/
+ ├── dir2/
+ │    └── file2.parquet (schema: <file: string>, content: "file2.parquet")
+ ├── file1.parquet (schema: <file: string>, content: "file1.parquet")
+ └── file3.json (schema: <file: string>, content: "{'file':'corrupt.json'}")
+
+{% endhighlight %}
+
+### Ignore Corrupt Files
+
+Spark allows you to use `spark.sql.files.ignoreCorruptFiles` to ignore corrupt files while reading data
+from files. When set to true, Spark jobs will continue to run when they encounter corrupted files, and
+the contents that have been read will still be returned.
+
+To ignore corrupt files while reading data files, you can use:
+
+<div class="codetabs">
+<div data-lang="scala"  markdown="1">
+{% include_example ignore_corrupt_files scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java"  markdown="1">
+{% include_example ignore_corrupt_files java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python"  markdown="1">
+{% include_example ignore_corrupt_files python/sql/datasource.py %}
+</div>
+
+<div data-lang="r"  markdown="1">
+{% include_example ignore_corrupt_files r/RSparkSQLExample.R %}
+</div>
+</div>
+
+### Ignore Missing Files
+
+Spark allows you to use `spark.sql.files.ignoreMissingFiles` to ignore missing files while reading data
+from files. Here, a missing file is a file that was deleted from the directory after you constructed the
+`DataFrame`. When set to true, Spark jobs will continue to run when they encounter missing files, and
+the contents that have been read will still be returned.
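+
+Since this is a session configuration as well, a minimal sketch (with hypothetical paths) might look like:
+
+{% highlight scala %}
+// Enable skipping of files that disappear after the DataFrame is constructed.
+spark.sql("set spark.sql.files.ignoreMissingFiles=true")
+
+val df = spark.read.parquet("/data/events")  // hypothetical path
+// ... some files under /data/events are deleted by another process ...
+df.count()  // the job keeps running and returns the contents that could still be read
+{% endhighlight %}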
+
+### Path Glob Filter
+
+`pathGlobFilter` is used to include only files whose file names match the given pattern.
+The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
+It does not change the behavior of partition discovery.
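+
+For illustration, a few patterns accepted by the glob syntax (shown with a hypothetical path): `*` matches any sequence of characters, `?` a single character, `[abc]` a character class and `{ab,cd}` a set of alternatives.
+
+{% highlight scala %}
+// Keep only part files numbered 0 or 1, e.g. part-0-xyz.parquet.
+val df = spark.read.format("parquet")
+  .option("pathGlobFilter", "part-{0,1}-*.parquet")
+  .load("/data/events")  // hypothetical path
+{% endhighlight %}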
+
+To load files with paths matching a given glob pattern while keeping the behavior of partition discovery,
+you can use:
+
+<div class="codetabs">
+<div data-lang="scala"  markdown="1">
+{% include_example load_with_path_glob_filter scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java"  markdown="1">
+{% include_example load_with_path_glob_filter java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python"  markdown="1">
+{% include_example load_with_path_glob_filter python/sql/datasource.py %}
+</div>
+
+<div data-lang="r"  markdown="1">
+{% include_example load_with_path_glob_filter r/RSparkSQLExample.R %}
+</div>
+</div>
+
+### Recursive File Lookup
+`recursiveFileLookup` is used to recursively load files, and it disables partition inferring. Its default value is `false`.
+If the data source explicitly specifies the `partitionSpec` when `recursiveFileLookup` is true, an exception will be thrown.
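+
+To illustrate the difference from partition discovery, a minimal sketch with a hypothetical partitioned layout:
+
+{% highlight scala %}
+// Layout: /data/t/year=2020/part-0.parquet, /data/t/year=2021/part-0.parquet
+// Default read performs partition discovery and adds a "year" column:
+val partitioned = spark.read.format("parquet").load("/data/t")
+
+// With recursiveFileLookup the files are loaded as-is and no partition
+// columns are inferred from the directory names:
+val flat = spark.read.format("parquet")
+  .option("recursiveFileLookup", "true")
+  .load("/data/t")
+{% endhighlight %}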
+
+To load all files recursively, you can use:
+
+<div class="codetabs">
+<div data-lang="scala"  markdown="1">
+{% include_example recursive_file_lookup scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java"  markdown="1">
+{% include_example recursive_file_lookup java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python"  markdown="1">
+{% include_example recursive_file_lookup python/sql/datasource.py %}
+</div>
+
+<div data-lang="r"  markdown="1">
+{% include_example recursive_file_lookup r/RSparkSQLExample.R %}
+</div>
+</div>
\ No newline at end of file
diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md
index 0748213..a7efb93 100644
--- a/docs/sql-data-sources-load-save-functions.md
+++ b/docs/sql-data-sources-load-save-functions.md
@@ -102,27 +102,6 @@ To load a CSV file you can use:
 </div>
 </div>
 
-To load files with paths matching a given glob pattern while keeping the behavior of partition discovery,
-you can use:
-
-<div class="codetabs">
-<div data-lang="scala"  markdown="1">
-{% include_example load_with_path_glob_filter scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
-</div>
-
-<div data-lang="java"  markdown="1">
-{% include_example load_with_path_glob_filter java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
-</div>
-
-<div data-lang="python"  markdown="1">
-{% include_example load_with_path_glob_filter python/sql/datasource.py %}
-</div>
-
-<div data-lang="r"  markdown="1">
-{% include_example load_with_path_glob_filter r/RSparkSQLExample.R %}
-</div>
-</div>
-
 The extra options are also used during write operation.
 For example, you can control bloom filters and dictionary encodings for ORC data sources.
 The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`.
diff --git a/docs/sql-data-sources.md b/docs/sql-data-sources.md
index 079c540..9396846 100644
--- a/docs/sql-data-sources.md
+++ b/docs/sql-data-sources.md
@@ -33,6 +33,11 @@ goes into specific options that are available for the built-in data sources.
   * [Save Modes](sql-data-sources-load-save-functions.html#save-modes)
   * [Saving to Persistent Tables](sql-data-sources-load-save-functions.html#saving-to-persistent-tables)
   * [Bucketing, Sorting and Partitioning](sql-data-sources-load-save-functions.html#bucketing-sorting-and-partitioning)
+* [Generic File Source Options](sql-data-sources-generic-options.html)
+  * [Ignore Corrupt Files](sql-data-sources-generic-options.html#ignore-corrupt-files)
+  * [Ignore Missing Files](sql-data-sources-generic-options.html#ignore-missing-files)
+  * [Path Glob Filter](sql-data-sources-generic-options.html#path-glob-filter)
+  * [Recursive File Lookup](sql-data-sources-generic-options.html#recursive-file-lookup)
 * [Parquet Files](sql-data-sources-parquet.html)
   * [Loading Data Programmatically](sql-data-sources-parquet.html#loading-data-programmatically)
   * [Partition Discovery](sql-data-sources-parquet.html#partition-discovery)
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index b2ce0bc..2295225 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -98,6 +98,7 @@ public class JavaSQLDataSourceExample {
       .getOrCreate();
 
     runBasicDataSourceExample(spark);
+    runGenericFileSourceOptionsExample(spark);
     runBasicParquetExample(spark);
     runParquetSchemaMergingExample(spark);
     runJsonDatasetExample(spark);
@@ -106,6 +107,48 @@ public class JavaSQLDataSourceExample {
     spark.stop();
   }
 
+  private static void runGenericFileSourceOptionsExample(SparkSession spark) {
+    // $example on:ignore_corrupt_files$
+    // enable ignore corrupt files
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=true");
+    // dir1/file3.json is corrupt from parquet's view
+    Dataset<Row> testCorruptDF = spark.read().parquet(
+            "examples/src/main/resources/dir1/",
+            "examples/src/main/resources/dir1/dir2/");
+    testCorruptDF.show();
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // |file2.parquet|
+    // +-------------+
+    // $example off:ignore_corrupt_files$
+    // $example on:recursive_file_lookup$
+    Dataset<Row> recursiveLoadedDF = spark.read().format("parquet")
+            .option("recursiveFileLookup", "true")
+            .load("examples/src/main/resources/dir1");
+    recursiveLoadedDF.show();
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // |file2.parquet|
+    // +-------------+
+    // $example off:recursive_file_lookup$
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=false");
+    // $example on:load_with_path_glob_filter$
+    Dataset<Row> testGlobFilterDF = spark.read().format("parquet")
+            .option("pathGlobFilter", "*.parquet") // json file should be filtered out
+            .load("examples/src/main/resources/dir1");
+    testGlobFilterDF.show();
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // +-------------+
+    // $example off:load_with_path_glob_filter$
+  }
+
   private static void runBasicDataSourceExample(SparkSession spark) {
     // $example on:generic_load_save_functions$
     Dataset<Row> usersDF = spark.read().load("examples/src/main/resources/users.parquet");
@@ -123,11 +166,6 @@ public class JavaSQLDataSourceExample {
       .option("header", "true")
       .load("examples/src/main/resources/people.csv");
     // $example off:manual_load_options_csv$
-    // $example on:load_with_path_glob_filter$
-    Dataset<Row> partitionedUsersDF = spark.read().format("orc")
-      .option("pathGlobFilter", "*.orc")
-      .load("examples/src/main/resources/partitioned_users.orc");
-    // $example off:load_with_path_glob_filter$
     // $example on:manual_save_options_orc$
     usersDF.write().format("orc")
       .option("orc.bloom.filter.columns", "favorite_color")
diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index 0d78097..265f135 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -28,6 +28,48 @@ from pyspark.sql import Row
 # $example off:schema_merging$
 
 
+def generic_file_source_options_example(spark):
+    # $example on:ignore_corrupt_files$
+    # enable ignore corrupt files
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=true")
+    # dir1/file3.json is corrupt from parquet's view
+    test_corrupt_df = spark.read.parquet("examples/src/main/resources/dir1/",
+                                         "examples/src/main/resources/dir1/dir2/")
+    test_corrupt_df.show()
+    # +-------------+
+    # |         file|
+    # +-------------+
+    # |file1.parquet|
+    # |file2.parquet|
+    # +-------------+
+    # $example off:ignore_corrupt_files$
+
+    # $example on:recursive_file_lookup$
+    recursive_loaded_df = spark.read.format("parquet")\
+        .option("recursiveFileLookup", "true")\
+        .load("examples/src/main/resources/dir1")
+    recursive_loaded_df.show()
+    # +-------------+
+    # |         file|
+    # +-------------+
+    # |file1.parquet|
+    # |file2.parquet|
+    # +-------------+
+    # $example off:recursive_file_lookup$
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=false")
+
+    # $example on:load_with_path_glob_filter$
+    df = spark.read.load("examples/src/main/resources/dir1",
+                         format="parquet", pathGlobFilter="*.parquet")
+    df.show()
+    # +-------------+
+    # |         file|
+    # +-------------+
+    # |file1.parquet|
+    # +-------------+
+    # $example off:load_with_path_glob_filter$
+
+
 def basic_datasource_example(spark):
     # $example on:generic_load_save_functions$
     df = spark.read.load("examples/src/main/resources/users.parquet")
@@ -57,11 +99,6 @@ def basic_datasource_example(spark):
                          format="csv", sep=":", inferSchema="true", header="true")
     # $example off:manual_load_options_csv$
 
-    # $example on:load_with_path_glob_filter$
-    df = spark.read.load("examples/src/main/resources/partitioned_users.orc",
-                         format="orc", pathGlobFilter="*.orc")
-    # $example off:load_with_path_glob_filter$
-
     # $example on:manual_save_options_orc$
     df = spark.read.orc("examples/src/main/resources/users.orc")
     (df.write.format("orc")
@@ -233,6 +270,7 @@ if __name__ == "__main__":
         .getOrCreate()
 
     basic_datasource_example(spark)
+    generic_file_source_options_example(spark)
     parquet_example(spark)
     parquet_schema_merging_example(spark)
     json_dataset_example(spark)
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index fa083d5..8685cfb 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -99,6 +99,26 @@ createOrReplaceTempView(df, "table")
 df <- sql("SELECT * FROM table")
 # $example off:run_sql$
 
+# Ignore corrupt files
+# $example on:ignore_corrupt_files$
+# enable ignore corrupt files
+sql("set spark.sql.files.ignoreCorruptFiles=true")
+# dir1/file3.json is corrupt from parquet's view
+testCorruptDF <- read.parquet(c("examples/src/main/resources/dir1/", "examples/src/main/resources/dir1/dir2/"))
+head(testCorruptDF)
+#            file
+# 1 file1.parquet
+# 2 file2.parquet
+# $example off:ignore_corrupt_files$
+
+# $example on:recursive_file_lookup$
+recursiveLoadedDF <- read.df("examples/src/main/resources/dir1", "parquet", recursiveFileLookup = "true")
+head(recursiveLoadedDF)
+#            file
+# 1 file1.parquet
+# 2 file2.parquet
+# $example off:recursive_file_lookup$
+sql("set spark.sql.files.ignoreCorruptFiles=false")
 
 # $example on:generic_load_save_functions$
 df <- read.df("examples/src/main/resources/users.parquet")
@@ -119,7 +139,9 @@ namesAndAges <- select(df, "name", "age")
 # $example off:manual_load_options_csv$
 
 # $example on:load_with_path_glob_filter$
-df <- read.df("examples/src/main/resources/partitioned_users.orc", "orc", pathGlobFilter = "*.orc")
+df <- read.df("examples/src/main/resources/dir1", "parquet", pathGlobFilter = "*.parquet")
+#            file
+# 1 file1.parquet
 # $example off:load_with_path_glob_filter$
 
 # $example on:manual_save_options_orc$
diff --git a/examples/src/main/resources/dir1/dir2/file2.parquet b/examples/src/main/resources/dir1/dir2/file2.parquet
new file mode 100644
index 0000000..d1895bf
Binary files /dev/null and b/examples/src/main/resources/dir1/dir2/file2.parquet differ
diff --git a/examples/src/main/resources/dir1/file1.parquet b/examples/src/main/resources/dir1/file1.parquet
new file mode 100644
index 0000000..ad360b1
Binary files /dev/null and b/examples/src/main/resources/dir1/file1.parquet differ
diff --git a/examples/src/main/resources/dir1/file3.json b/examples/src/main/resources/dir1/file3.json
new file mode 100644
index 0000000..0490f92
--- /dev/null
+++ b/examples/src/main/resources/dir1/file3.json
@@ -0,0 +1 @@
+{"file":"corrupt.json"}
diff --git a/examples/src/main/resources/partitioned_users.orc/do_not_read_this.txt b/examples/src/main/resources/partitioned_users.orc/do_not_read_this.txt
deleted file mode 100644
index 9c19f2a..0000000
--- a/examples/src/main/resources/partitioned_users.orc/do_not_read_this.txt
+++ /dev/null
@@ -1 +0,0 @@
-do not read this
diff --git a/examples/src/main/resources/partitioned_users.orc/favorite_color=__HIVE_DEFAULT_PARTITION__/users.orc b/examples/src/main/resources/partitioned_users.orc/favorite_color=__HIVE_DEFAULT_PARTITION__/users.orc
deleted file mode 100644
index 890395a..0000000
Binary files a/examples/src/main/resources/partitioned_users.orc/favorite_color=__HIVE_DEFAULT_PARTITION__/users.orc and /dev/null differ
diff --git a/examples/src/main/resources/partitioned_users.orc/favorite_color=red/users.orc b/examples/src/main/resources/partitioned_users.orc/favorite_color=red/users.orc
deleted file mode 100644
index 150615a6..0000000
Binary files a/examples/src/main/resources/partitioned_users.orc/favorite_color=red/users.orc and /dev/null differ
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index d4c05e5..2c7abfc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -32,6 +32,7 @@ object SQLDataSourceExample {
       .getOrCreate()
 
     runBasicDataSourceExample(spark)
+    runGenericFileSourceOptionsExample(spark)
     runBasicParquetExample(spark)
     runParquetSchemaMergingExample(spark)
     runJsonDatasetExample(spark)
@@ -40,6 +41,48 @@ object SQLDataSourceExample {
     spark.stop()
   }
 
+  private def runGenericFileSourceOptionsExample(spark: SparkSession): Unit = {
+    // $example on:ignore_corrupt_files$
+    // enable ignore corrupt files
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=true")
+    // dir1/file3.json is corrupt from parquet's view
+    val testCorruptDF = spark.read.parquet(
+      "examples/src/main/resources/dir1/",
+      "examples/src/main/resources/dir1/dir2/")
+    testCorruptDF.show()
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // |file2.parquet|
+    // +-------------+
+    // $example off:ignore_corrupt_files$
+    // $example on:recursive_file_lookup$
+    val recursiveLoadedDF = spark.read.format("parquet")
+      .option("recursiveFileLookup", "true")
+      .load("examples/src/main/resources/dir1")
+    recursiveLoadedDF.show()
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // |file2.parquet|
+    // +-------------+
+    // $example off:recursive_file_lookup$
+    spark.sql("set spark.sql.files.ignoreCorruptFiles=false")
+    // $example on:load_with_path_glob_filter$
+    val testGlobFilterDF = spark.read.format("parquet")
+      .option("pathGlobFilter", "*.parquet") // json file should be filtered out
+      .load("examples/src/main/resources/dir1")
+    testGlobFilterDF.show()
+    // +-------------+
+    // |         file|
+    // +-------------+
+    // |file1.parquet|
+    // +-------------+
+    // $example off:load_with_path_glob_filter$
+  }
+
   private def runBasicDataSourceExample(spark: SparkSession): Unit = {
     // $example on:generic_load_save_functions$
     val usersDF = spark.read.load("examples/src/main/resources/users.parquet")
@@ -56,11 +99,6 @@ object SQLDataSourceExample {
       .option("header", "true")
       .load("examples/src/main/resources/people.csv")
     // $example off:manual_load_options_csv$
-    // $example on:load_with_path_glob_filter$
-    val partitionedUsersDF = spark.read.format("orc")
-      .option("pathGlobFilter", "*.orc")
-      .load("examples/src/main/resources/partitioned_users.orc")
-    // $example off:load_with_path_glob_filter$
     // $example on:manual_save_options_orc$
     usersDF.write.format("orc")
       .option("orc.bloom.filter.columns", "favorite_color")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org