You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2021/04/13 12:12:36 UTC
[spark] branch branch-3.0 updated: [SPARK-35045][SQL] Add an internal option to control input buffer in univocity
This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 057dc8d [SPARK-35045][SQL] Add an internal option to control input buffer in univocity
057dc8d is described below
commit 057dc8d6a05a27b08da1f4043d8f5219437f230f
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Tue Apr 13 15:08:01 2021 +0300
[SPARK-35045][SQL] Add an internal option to control input buffer in univocity
This PR makes the input buffer configurable (as an internal option). This is mainly to work around uniVocity/univocity-parsers#449.
Why the change is needed: to work around uniVocity/univocity-parsers#449.
User-facing change: no, it is only an internal option.
Manually tested by modifying the unit test added in https://github.com/apache/spark/pull/31858 as below:
```diff
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index fd25a79619d..b58f0bd3661 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -2460,6 +2460,7 @@ abstract class CSVSuite
Seq(line).toDF.write.text(path.getAbsolutePath)
assert(spark.read.format("csv")
.option("delimiter", "|")
+ .option("inputBufferSize", "128")
.option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1)
}
}
```
Closes #32145 from HyukjinKwon/SPARK-35045.
Lead-authored-by: Hyukjin Kwon <gu...@apache.org>
Co-authored-by: HyukjinKwon <gu...@apache.org>
Signed-off-by: Max Gekk <ma...@gmail.com>
(cherry picked from commit 1f562159bf61dd5e536db7841b16e74a635e7a97)
Signed-off-by: Max Gekk <ma...@gmail.com>
---
.../src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 +++
1 file changed, 3 insertions(+)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index ee7fc1f..d50849c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -211,6 +211,8 @@ class CSVOptions(
}
val lineSeparatorInWrite: Option[String] = lineSeparator
+ val inputBufferSize: Option[Int] = parameters.get("inputBufferSize").map(_.toInt)
+
def asWriterSettings: CsvWriterSettings = {
val writerSettings = new CsvWriterSettings()
val format = writerSettings.getFormat
@@ -251,6 +253,7 @@ class CSVOptions(
settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead)
settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead)
settings.setReadInputOnSeparateThread(false)
+ inputBufferSize.foreach(settings.setInputBufferSize)
settings.setMaxColumns(maxColumns)
settings.setNullValue(nullValue)
settings.setEmptyValue(emptyValueInRead)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org