You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2018/04/08 04:44:35 UTC
spark git commit: [SPARK-23849][SQL] Tests for the samplingRatio
option of JSON datasource
Repository: spark
Updated Branches:
refs/heads/master 2c1fe6475 -> 6a734575a
[SPARK-23849][SQL] Tests for the samplingRatio option of JSON datasource
## What changes were proposed in this pull request?
The proposed tests check that only a subset of the input dataset is touched during schema inference.
Author: Maxim Gekk <ma...@databricks.com>
Closes #20963 from MaxGekk/json-sampling-tests.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a734575
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a734575
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a734575
Branch: refs/heads/master
Commit: 6a734575a80e6b4ec4963206254451f05d64b742
Parents: 2c1fe64
Author: Maxim Gekk <ma...@databricks.com>
Authored: Sat Apr 7 21:44:32 2018 -0700
Committer: gatorsmile <ga...@gmail.com>
Committed: Sat Apr 7 21:44:32 2018 -0700
----------------------------------------------------------------------
.../execution/datasources/json/JsonSuite.scala | 37 +++++++++++++++++++-
1 file changed, 36 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/6a734575/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 10bac05..70aee56 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.json
import java.io.{File, StringWriter}
import java.nio.charset.StandardCharsets
-import java.nio.file.Files
+import java.nio.file.{Files, Paths, StandardOpenOption}
import java.sql.{Date, Timestamp}
import java.util.Locale
@@ -2127,4 +2127,39 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
assert(df.schema === expectedSchema)
}
}
+
+  test("SPARK-23849: schema inferring touches less data if samplingRatio < 1.0") {
+    // Line indices that are written with an integer value; every other line gets a
+    // fractional value (i + 0.1).
+    // NOTE(review): presumably these are the lines the sampler selects at ratio 0.1 - confirm.
+    val predefinedSample = Set[Int](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
+      57, 62, 68, 72)
+    withTempPath { path =>
+      // CREATE_NEW makes the open fail if something already exists at the path.
+      val writer = Files.newBufferedWriter(Paths.get(path.getAbsolutePath),
+        StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW)
+      // Close the writer even when a write throws, so the file handle is never leaked.
+      try {
+        for (i <- 0 until 100) {
+          if (predefinedSample.contains(i)) {
+            writer.write(s"""{"f1":${i.toString}}""" + "\n")
+          } else {
+            writer.write(s"""{"f1":${(i.toDouble + 0.1).toString}}""" + "\n")
+          }
+        }
+      } finally {
+        writer.close()
+      }
+
+      val ds = spark.read.option("samplingRatio", 0.1).json(path.getCanonicalPath)
+      // f1 must come out as LongType, the type of the integer-valued lines only.
+      assert(ds.schema === new StructType().add("f1", LongType))
+    }
+  }
+
+  test("SPARK-23849: usage of samplingRatio while parsing a dataset of strings") {
+    // Build 100 JSON records in a single partition: indices listed in predefinedSample
+    // carry an integer value, all the others carry a fractional value (i + 0.1).
+    val dstr = spark.sparkContext.parallelize(0 until 100, 1).map { i =>
+      val predefinedSample = Set[Int](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
+        57, 62, 68, 72)
+      if (predefinedSample.contains(i)) {
+        s"""{"f1":${i.toString}}""" + "\n"
+      } else {
+        s"""{"f1":${(i.toDouble + 0.1).toString}}""" + "\n"
+      }
+    }.toDS()
+    val ds = spark.read.option("samplingRatio", 0.1).json(dstr)
+
+    // f1 must come out as LongType, the type of the integer-valued records only.
+    assert(ds.schema === new StructType().add("f1", LongType))
+  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org