You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2024/02/06 07:11:36 UTC
(spark) branch master updated: [SPARK-46954][SQL] XML: Wrap InputStreamReader with BufferedReader
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 10439902b6dd [SPARK-46954][SQL] XML: Wrap InputStreamReader with BufferedReader
10439902b6dd is described below
commit 10439902b6ddc2d5826ed16f855a8429d9a15466
Author: Sandip Agarwala <13...@users.noreply.github.com>
AuthorDate: Tue Feb 6 16:11:23 2024 +0900
[SPARK-46954][SQL] XML: Wrap InputStreamReader with BufferedReader
### What changes were proposed in this pull request?
Wrap InputStreamReader with BufferedReader
### Why are the changes needed?
More than doubles the performance: BufferedReader batches reads from the underlying InputStreamReader, avoiding the overhead of many small, unbuffered character reads while tokenizing the XML input.
### Does this PR introduce _any_ user-facing change?
Yes — users will observe a performance improvement; there are no behavioral or API changes.
### How was this patch tested?
Existing unit tests and manual perf testing
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45041 from sandip-db/xml_buffered_reader.
Authored-by: Sandip Agarwala <13...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../spark/sql/catalyst/xml/StaxXmlParser.scala | 20 ++++----------------
1 file changed, 4 insertions(+), 16 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 74413bb8cbb2..66ec636d1a65 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -16,7 +16,7 @@
*/
package org.apache.spark.sql.catalyst.xml
-import java.io.{CharConversionException, FileNotFoundException, InputStream, InputStreamReader, IOException, StringReader}
+import java.io.{BufferedReader, CharConversionException, FileNotFoundException, InputStream, InputStreamReader, IOException, StringReader}
import java.nio.charset.{Charset, MalformedInputException}
import java.text.NumberFormat
import java.util.Locale
@@ -37,20 +37,7 @@ import org.apache.spark.SparkUpgradeException
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.ExprUtils
-import org.apache.spark.sql.catalyst.util.{
- ArrayBasedMapData,
- BadRecordException,
- DateFormatter,
- DropMalformedMode,
- FailureSafeParser,
- GenericArrayData,
- MapData,
- ParseMode,
- PartialResultArrayException,
- PartialResultException,
- PermissiveMode,
- TimestampFormatter
-}
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, BadRecordException, DateFormatter, DropMalformedMode, FailureSafeParser, GenericArrayData, MapData, ParseMode, PartialResultArrayException, PartialResultException, PermissiveMode, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.catalyst.xml.StaxXmlParser.convertStream
import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -623,7 +610,8 @@ class StaxXmlParser(
class XmlTokenizer(
inputStream: InputStream,
options: XmlOptions) extends Logging {
- private var reader = new InputStreamReader(inputStream, Charset.forName(options.charset))
+ private var reader = new BufferedReader(
+ new InputStreamReader(inputStream, Charset.forName(options.charset)))
private var currentStartTag: String = _
private var buffer = new StringBuilder()
private val startTag = s"<${options.rowTag}>"
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org