You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/11/15 00:11:57 UTC
spark git commit: [SPARK-4386] Improve performance when writing
Parquet files.
Repository: spark
Updated Branches:
refs/heads/master 0c7b66bd4 -> f76b96837
[SPARK-4386] Improve performance when writing Parquet files.
If you profile the writing of a Parquet file, the single most time-consuming call inside org.apache.spark.sql.parquet.MutableRowWriteSupport.write is actually the scala.collection.AbstractSequence.size call. This is because the size call ends up COUNTING the elements, one by one, in scala.collection.LinearSeqOptimized.length ("optimized?").
This repeated counting doesn't need to be done. Previously, "size" was called repeatedly wherever it was needed; with this change it is called once at the top of the method and the result is stored in a 'val'.
Author: Jim Carroll <ji...@dontcallme.com>
Closes #3254 from jimfcarroll/parquet-perf and squashes the following commits:
30cc0b5 [Jim Carroll] Improve performance when writing Parquet files.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f76b9683
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f76b9683
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f76b9683
Branch: refs/heads/master
Commit: f76b9683706232c3d4e8e6e61627b8188dcb79dc
Parents: 0c7b66b
Author: Jim Carroll <ji...@dontcallme.com>
Authored: Fri Nov 14 15:11:53 2014 -0800
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Fri Nov 14 15:11:53 2014 -0800
----------------------------------------------------------------------
.../spark/sql/parquet/ParquetTableSupport.scala | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/f76b9683/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 7bc2496..ef3687e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -152,14 +152,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
}
override def write(record: Row): Unit = {
- if (attributes.size > record.size) {
+ val attributesSize = attributes.size
+ if (attributesSize > record.size) {
throw new IndexOutOfBoundsException(
- s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+ s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
}
var index = 0
writer.startMessage()
- while(index < attributes.size) {
+ while(index < attributesSize) {
// null values indicate optional fields but we do not check currently
if (record(index) != null) {
writer.startField(attributes(index).name, index)
@@ -312,14 +313,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
// Optimized for non-nested rows
private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
override def write(record: Row): Unit = {
- if (attributes.size > record.size) {
+ val attributesSize = attributes.size
+ if (attributesSize > record.size) {
throw new IndexOutOfBoundsException(
- s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
+ s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
}
var index = 0
writer.startMessage()
- while(index < attributes.size) {
+ while(index < attributesSize) {
// null values indicate optional fields but we do not check currently
if (record(index) != null && record(index) != Nil) {
writer.startField(attributes(index).name, index)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org