You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2021/01/29 08:38:33 UTC
[GitHub] [spark] wangyum opened a new pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
wangyum opened a new pull request #31393:
URL: https://github.com/apache/spark/pull/31393
### What changes were proposed in this pull request?
This PR makes the Parquet vectorized reader support [column index](https://issues.apache.org/jira/browse/PARQUET-1201).
### Why are the changes needed?
Improve filter performance. For example, with the filter `id = 1` we only need to read `page-0` in `block 1`:
```
block 1:
null count min max
page-0 0 0 99
page-1 0 100 199
page-2 0 200 299
page-3 0 300 399
page-4 0 400 449
block 2:
null count min max
page-0 0 450 549
page-1 0 550 649
page-2 0 650 749
page-3 0 750 849
page-4 0 850 899
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit test.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] wangyum commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
wangyum commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-769767724
Benchmark and benchmark result:
```scala
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.benchmark
import java.io.File
import scala.util.Random
import org.apache.parquet.hadoop.ParquetInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{monotonically_increasing_id, timestamp_seconds}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType}
/**
* Benchmark to measure read performance with Parquet column index.
* To run this benchmark:
* {{{
* 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
* 2. build/sbt "sql/test:runMain <this class>"
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
* Results will be written to "benchmarks/ParquetColumnIndexBenchmark-results.txt".
* }}}
*/
object ParquetColumnIndexBenchmark extends SqlBasedBenchmark {
/**
 * Creates (or reuses) the session this benchmark runs in.
 *
 * The master is forced to `local[1]`; memory and compression settings are only
 * applied when no value has been configured for them already.
 */
override def getSparkSession: SparkSession = {
  // Defaults that must not clobber a value that is already set.
  val fallbackSettings = Seq(
    "spark.driver.memory" -> "3g",
    "spark.executor.memory" -> "3g",
    "orc.compression" -> "snappy",
    "spark.sql.parquet.compression.codec" -> "snappy")

  val baseConf = new SparkConf()
    .setAppName(this.getClass.getSimpleName)
    // `spark.master` always exists, so it is overwritten instead of defaulted.
    .set("spark.master", "local[1]")

  val sparkConf = fallbackSettings.foldLeft(baseConf) { case (acc, (key, fallback)) =>
    acc.setIfMissing(key, fallback)
  }
  SparkSession.builder().config(sparkConf).getOrCreate()
}
// Row count passed to the table builders and to every benchmark case (15 * 2^20 = 15,728,640).
private val numRows: Int = 15 * 1024 * 1024
// Number of payload columns (c1..cN) generated next to the `value` column.
private val width: Int = 5
// Half of `numRows`; used as the lookup value in the point and range filters.
private val mid: Int = numRows / 2
/**
 * Evaluates `f` and afterwards drops every temp view listed in `tableNames`.
 * The views are dropped even when `f` throws.
 */
def withTempTable(tableNames: String*)(f: => Unit): Unit = {
  try {
    f
  } finally {
    tableNames.foreach { viewName =>
      spark.catalog.dropTempView(viewName)
    }
  }
}
/**
 * Builds a table of `numRows` rows with `width` string columns (`c1`..`cN`, each a
 * string cast of a random long) plus a `value` column derived from
 * `monotonically_increasing_id()`, sorts it by `value` and hands it to `saveAsTable`.
 *
 * @param dir directory passed through to `saveAsTable`
 * @param numRows number of rows to generate
 * @param width number of `c$i` payload columns
 * @param useStringForValue when true the `value` column is cast to string
 */
private def prepareTable(
    dir: File, numRows: Int, width: Int, useStringForValue: Boolean): Unit = {
  import spark.implicits._

  val payloadExprs = for (idx <- 1 to width) yield s"CAST(value AS STRING) c$idx"
  val rowId = monotonically_increasing_id()
  val sortKey = if (useStringForValue) rowId.cast("string") else rowId

  // The mapped dataset exposes the random longs as a single column named `value`.
  val randomLongs = spark.range(numRows).map(_ => Random.nextLong)
  val table = randomLongs
    .selectExpr(payloadExprs: _*)
    .withColumn("value", sortKey)
    .sort("value")
  saveAsTable(table, dir)
}
/**
 * Builds a table whose `value` column is `id % numDistinctValues` cast to string,
 * accompanied by `width` columns of `rand()` cast to string, sorts it by `value`
 * and hands it to `saveAsTable` with `useDictionary = true`.
 *
 * @param dir directory passed through to `saveAsTable`
 * @param numRows number of rows to generate
 * @param numDistinctValues modulus applied to `id` to produce the `value` column
 * @param width number of `c$i` payload columns
 */
private def prepareStringDictTable(
    dir: File, numRows: Int, numDistinctValues: Int, width: Int): Unit = {
  // Index 0 yields the key column; every other index yields one payload column.
  val projection = (0 to width).map { idx =>
    if (idx == 0) {
      s"CAST(id % $numDistinctValues AS STRING) AS value"
    } else {
      s"CAST(rand() AS STRING) c$idx"
    }
  }
  val table = spark.range(numRows).selectExpr(projection: _*).sort("value")
  saveAsTable(table, dir, useDictionary = true)
}
/**
 * Writes `df` as Parquet under `<dir>/parquet` (overwriting existing output), then
 * reads it back and registers the result as the temp view `parquetTable`.
 *
 * @param df data to persist
 * @param dir parent directory of the Parquet output
 * @param useDictionary accepted for callers, but not read anywhere in this method
 */
private def saveAsTable(df: DataFrame, dir: File, useDictionary: Boolean = false): Unit = {
  val location = s"${dir.getCanonicalPath}/parquet"
  val writer = df.write.mode("overwrite")
  writer.parquet(location)

  val reloaded = spark.read.parquet(location)
  reloaded.createOrReplaceTempView("parquetTable")
}
/**
 * Runs one benchmark with two cases over the same query against `parquetTable`:
 * first with `ParquetInputFormat.COLUMN_INDEX_FILTERING_ENABLED` set to false,
 * then with it set to true.
 *
 * @param values value count handed to the `Benchmark` constructor
 * @param title benchmark title
 * @param whereExpr predicate placed after `WHERE`
 * @param selectExpr projection placed after `SELECT`; defaults to `*`
 */
def filterPushDownBenchmark(
    values: Int,
    title: String,
    whereExpr: String,
    selectExpr: String = "*"): Unit = {
  val query = s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr"
  val bench = new Benchmark(title, values, minNumIters = 5, output = output)

  for (useColumnIndex <- Seq(false, true)) {
    val suffix = if (useColumnIndex) "(columnIndex)" else ""
    // The trailing space is kept for the baseline case so the case names stay unchanged.
    val caseName = "Parquet Vectorized " + suffix
    val toggle = ParquetInputFormat.COLUMN_INDEX_FILTERING_ENABLED -> useColumnIndex.toString
    bench.addCase(caseName) { _ =>
      withSQLConf(toggle) {
        spark.sql(query).noop()
      }
    }
  }

  bench.run()
}
/**
 * Runs the filter benchmarks for the case where `prepareTable` left `value` uncast:
 * the filters titled "Select 0 int row" and "Select 1 int row", the 10% / 50% / 90%
 * range filters, and three filters titled "Select all int rows".
 *
 * @param numRows value count reported to each benchmark; also scales the range bounds
 * @param width number of `c$i` columns aggregated in the range / select-all cases
 * @param mid value used in the point and narrow-range filters
 */
private def runIntBenchmark(numRows: Int, width: Int, mid: Int): Unit = {
  // Display only: turns "a < value AND value < b" into "a < value < b" in titles.
  def shorten(label: String): String = label.replace("value AND value", "value")

  val emptyResultFilters = Seq("value IS NULL", s"$mid < value AND value < $mid")
  for (predicate <- emptyResultFilters) {
    filterPushDownBenchmark(numRows, shorten(s"Select 0 int row ($predicate)"), predicate)
  }

  val singleRowFilters = Seq(
    s"value = $mid",
    s"value <=> $mid",
    s"$mid <= value AND value <= $mid",
    s"${mid - 1} < value AND value < ${mid + 1}")
  for (predicate <- singleRowFilters) {
    filterPushDownBenchmark(numRows, shorten(s"Select 1 int row ($predicate)"), predicate)
  }

  // MAX over every payload column followed by MAX(value).
  val maxOfEveryColumn = (1 to width).map(i => s"MAX(c$i)").mkString(",") + ", MAX(value)"

  for (percent <- Seq(10, 50, 90)) {
    val upperBound = numRows * percent / 100
    filterPushDownBenchmark(
      numRows,
      s"Select $percent% int rows (value < $upperBound)",
      s"value < $upperBound",
      maxOfEveryColumn)
  }

  val everyRowFilters = Seq("value IS NOT NULL", "value > -1", "value != -1")
  for (predicate <- everyRowFilters) {
    filterPushDownBenchmark(
      numRows,
      s"Select all int rows ($predicate)",
      predicate,
      maxOfEveryColumn)
  }
}
/**
 * Runs the filter benchmarks against a string-typed `value` column: the filters
 * titled "Select 0 ... row" and "Select 1 ... row", then one "Select all ... rows" case.
 *
 * @param numRows value count reported to each benchmark
 * @param width number of `c$i` columns aggregated in the select-all case
 * @param searchValue value that is single-quoted and compared against `value`
 * @param colType label inserted into the benchmark titles
 */
private def runStringBenchmark(
    numRows: Int, width: Int, searchValue: Int, colType: String): Unit = {
  // Display only: turns "a < value AND value < b" into "a < value < b" in titles.
  def shorten(label: String): String = label.replace("value AND value", "value")

  val quoted = s"'$searchValue'"

  val emptyResultFilters = Seq("value IS NULL", s"$quoted < value AND value < $quoted")
  for (predicate <- emptyResultFilters) {
    filterPushDownBenchmark(numRows, shorten(s"Select 0 $colType row ($predicate)"), predicate)
  }

  val singleMatchFilters = Seq(
    s"value = $quoted",
    s"value <=> $quoted",
    s"$quoted <= value AND value <= $quoted")
  for (predicate <- singleMatchFilters) {
    filterPushDownBenchmark(numRows, shorten(s"Select 1 $colType row ($predicate)"), predicate)
  }

  // MAX over every payload column followed by MAX(value).
  val maxOfEveryColumn = (1 to width).map(i => s"MAX(c$i)").mkString(",") + ", MAX(value)"

  val everyRowFilter = "value IS NOT NULL"
  filterPushDownBenchmark(
    numRows,
    s"Select all $colType rows ($everyRowFilter)",
    everyRowFilter,
    maxOfEveryColumn)
}
/**
 * Runs every benchmark group in order: many-distinct-values (string, then int),
 * few-distinct-values strings, StringStartsWith, decimal, InSet, tinyint,
 * timestamp, and the many-filters case.
 *
 * Each group builds its table inside a temp path, registers it as `parquetTable`,
 * and drops that view once the group is finished.
 *
 * @param mainArgs command-line arguments; not read by this method
 */
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
  // MAX over c1..cN followed by MAX(value); used by every "Select N% rows" case below.
  def maxOfAllColumns(columnCount: Int): String =
    (1 to columnCount).map(i => s"MAX(c$i)").mkString(",") + ", MAX(value)"

  // Payload projection shared by the decimal, tinyint and timestamp groups.
  def idAsStringColumns(columnCount: Int): Seq[String] =
    (1 to columnCount).map(i => s"CAST(id AS string) c$i")

  runBenchmark("Pushdown for many distinct value case") {
    withTempPath { dir =>
      withTempTable("parquetTable") {
        for (useStringForValue <- Seq(true, false)) {
          prepareTable(dir, numRows, width, useStringForValue)
          if (useStringForValue) {
            runStringBenchmark(numRows, width, mid, "string")
          } else {
            runIntBenchmark(numRows, width, mid)
          }
        }
      }
    }
  }

  runBenchmark("Pushdown for few distinct value case (use dictionary encoding)") {
    withTempPath { dir =>
      val distinctCount = 200
      withTempTable("parquetTable") {
        prepareStringDictTable(dir, numRows, distinctCount, width)
        runStringBenchmark(numRows, width, distinctCount / 2, "distinct string")
      }
    }
  }

  runBenchmark("Pushdown benchmark for StringStartsWith") {
    withTempPath { dir =>
      withTempTable("parquetTable") {
        prepareTable(dir, numRows, width, true)
        // `mid` rendered as text with its last digit removed.
        val midPrefix = mid.toString.dropRight(1)
        val prefixFilters = Seq(
          "value like '10%'",
          "value like '1000%'",
          s"value like '$midPrefix%'")
        for (predicate <- prefixFilters) {
          filterPushDownBenchmark(numRows, s"StringStartsWith filter: ($predicate)", predicate)
        }
      }
    }
  }

  runBenchmark(s"Pushdown benchmark for ${DecimalType.simpleString}") {
    withTempPath { dir =>
      val intDigitsDecimal = s"decimal(${Decimal.MAX_INT_DIGITS}, 2)"
      val decimalTypes = Seq(
        intDigitsDecimal,
        s"decimal(${Decimal.MAX_LONG_DIGITS}, 2)",
        s"decimal(${DecimalType.MAX_PRECISION}, 2)")

      for (dt <- decimalTypes) {
        val rowId = monotonically_increasing_id()
        // Only the MAX_INT_DIGITS-precision type gets its ids wrapped with the modulus.
        val rawValue = if (dt.equalsIgnoreCase(intDigitsDecimal)) rowId % 9999999 else rowId
        val table = spark.range(numRows)
          .selectExpr(idAsStringColumns(width): _*)
          .withColumn("value", rawValue.cast(dt))

        withTempTable("parquetTable") {
          saveAsTable(table, dir)

          val pointFilter = s"value = $mid"
          val pointTitle =
            s"Select 1 $dt row ($pointFilter)".replace("value AND value", "value")
          filterPushDownBenchmark(numRows, pointTitle, pointFilter)

          for (percent <- Seq(10, 50, 90)) {
            val rangeFilter = s"value < ${numRows * percent / 100}"
            filterPushDownBenchmark(
              numRows,
              s"Select $percent% $dt rows ($rangeFilter)",
              rangeFilter,
              maxOfAllColumns(width))
          }
        }
      }
    }
  }

  runBenchmark("Pushdown benchmark for InSet -> InFilters") {
    withTempPath { dir =>
      withTempTable("parquetTable") {
        prepareTable(dir, numRows, width, false)
        for {
          count <- Seq(5, 10, 50, 100)
          distribution <- Seq(10, 50, 90)
        } {
          // `count` random values drawn from the first `distribution` percent of the id range.
          val exclusiveBound = numRows * distribution / 100
          val candidates = Seq.fill(count)(Random.nextInt(exclusiveBound))
          val predicate = s"value in(${candidates.mkString(",")})"
          val caseTitle = s"InSet -> InFilters (values count: $count, distribution: $distribution)"
          filterPushDownBenchmark(numRows, caseTitle, predicate)
        }
      }
    }
  }

  runBenchmark(s"Pushdown benchmark for ${ByteType.simpleString}") {
    withTempPath { dir =>
      val byteTypeName = ByteType.simpleString
      val byteValue = (monotonically_increasing_id() % Byte.MaxValue).cast(ByteType)
      val table = spark.range(numRows)
        .selectExpr(idAsStringColumns(width): _*)
        .withColumn("value", byteValue)
        .orderBy("value")

      withTempTable("parquetTable") {
        saveAsTable(table, dir)

        val pointFilter = s"value = CAST(${Byte.MaxValue / 2} AS $byteTypeName)"
        val pointTitle = s"Select 1 $byteTypeName row ($pointFilter)"
          .replace("value AND value", "value")
        filterPushDownBenchmark(numRows, pointTitle, pointFilter)

        for (percent <- Seq(10, 50, 90)) {
          val rangeFilter = s"value < CAST(${Byte.MaxValue * percent / 100} AS $byteTypeName)"
          filterPushDownBenchmark(
            numRows,
            s"Select $percent% $byteTypeName rows ($rangeFilter)",
            rangeFilter,
            maxOfAllColumns(width))
        }
      }
    }
  }

  runBenchmark(s"Pushdown benchmark for Timestamp") {
    withTempPath { dir =>
      withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> "true") {
        val storageTypes = ParquetOutputTimestampType.values.toSeq.map(_.toString)
        for (fileType <- storageTypes) {
          withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> fileType) {
            val table = spark.range(numRows)
              .selectExpr(idAsStringColumns(width): _*)
              .withColumn("value", timestamp_seconds(monotonically_increasing_id()))

            withTempTable("parquetTable") {
              saveAsTable(table, dir)

              val pointFilter = s"value = timestamp_seconds($mid)"
              val pointTitle = s"Select 1 timestamp stored as $fileType row ($pointFilter)"
                .replace("value AND value", "value")
              filterPushDownBenchmark(numRows, pointTitle, pointFilter)

              for (percent <- Seq(10, 50, 90)) {
                val rangeFilter = s"value < timestamp_seconds(${numRows * percent / 100})"
                filterPushDownBenchmark(
                  numRows,
                  s"Select $percent% timestamp stored as $fileType rows ($rangeFilter)",
                  rangeFilter,
                  maxOfAllColumns(width))
              }
            }
          }
        }
      }
    }
  }

  runBenchmark(s"Pushdown benchmark with many filters") {
    // This group uses its own sizes: a single row with 500 columns.
    val singleRow = 1
    val wideColumnCount = 500
    withTempPath { dir =>
      val projection = (1 to wideColumnCount).map(i => s"id c$i")
      val table = spark.range(1).selectExpr(projection: _*)
      withTempTable("parquetTable") {
        saveAsTable(table, dir)
        for (numFilter <- Seq(1, 250, 500)) {
          val conjunction = (1 to numFilter).map(i => s"c$i = 0").mkString(" and ")
          // Note: InferFiltersFromConstraints will add further filters on top of these.
          filterPushDownBenchmark(singleRow, s"Select 1 row with $numFilter filters", conjunction)
        }
      }
    }
  }
}
}
```
```
================================================================================================
Pushdown for many distinct value case
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 92 105 9 171.7 5.8 1.0X
Parquet Vectorized (columnIndex) 70 80 8 225.3 4.4 1.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 829 849 12 19.0 52.7 1.0X
Parquet Vectorized (columnIndex) 85 92 6 184.8 5.4 9.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 830 845 11 19.0 52.8 1.0X
Parquet Vectorized (columnIndex) 85 94 7 185.9 5.4 9.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 841 895 47 18.7 53.5 1.0X
Parquet Vectorized (columnIndex) 77 81 5 205.4 4.9 11.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 826 843 13 19.0 52.5 1.0X
Parquet Vectorized (columnIndex) 79 84 5 197.9 5.1 10.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 16542 16827 247 1.0 1051.7 1.0X
Parquet Vectorized (columnIndex) 16491 16571 57 1.0 1048.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 60 66 5 263.7 3.8 1.0X
Parquet Vectorized (columnIndex) 59 66 6 267.7 3.7 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 793 797 5 19.8 50.4 1.0X
Parquet Vectorized (columnIndex) 79 85 5 199.6 5.0 10.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 792 807 11 19.9 50.3 1.0X
Parquet Vectorized (columnIndex) 72 76 4 218.7 4.6 11.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 792 832 37 19.8 50.4 1.0X
Parquet Vectorized (columnIndex) 77 85 9 205.0 4.9 10.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 782 806 19 20.1 49.7 1.0X
Parquet Vectorized (columnIndex) 70 75 4 224.4 4.5 11.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 772 797 19 20.4 49.1 1.0X
Parquet Vectorized (columnIndex) 73 79 6 216.1 4.6 10.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2313 2347 30 6.8 147.1 1.0X
Parquet Vectorized (columnIndex) 1657 1681 20 9.5 105.4 1.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8055 8162 69 2.0 512.1 1.0X
Parquet Vectorized (columnIndex) 7800 7861 52 2.0 495.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 13875 14027 174 1.1 882.1 1.0X
Parquet Vectorized (columnIndex) 13954 14061 151 1.1 887.2 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15637 15728 70 1.0 994.2 1.0X
Parquet Vectorized (columnIndex) 15481 15634 101 1.0 984.2 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15156 15369 144 1.0 963.6 1.0X
Parquet Vectorized (columnIndex) 15255 15409 144 1.0 969.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 15220 15440 143 1.0 967.7 1.0X
Parquet Vectorized (columnIndex) 15327 15399 60 1.0 974.5 1.0X
================================================================================================
Pushdown for few distinct value case (use dictionary encoding)
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 51 55 6 309.6 3.2 1.0X
Parquet Vectorized (columnIndex) 49 55 6 319.7 3.1 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 907 921 11 17.3 57.7 1.0X
Parquet Vectorized (columnIndex) 56 60 3 279.6 3.6 16.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 916 927 8 17.2 58.3 1.0X
Parquet Vectorized (columnIndex) 121 126 6 130.0 7.7 7.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 900 908 7 17.5 57.2 1.0X
Parquet Vectorized (columnIndex) 118 124 5 133.2 7.5 7.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 924 935 10 17.0 58.8 1.0X
Parquet Vectorized (columnIndex) 125 130 5 126.2 7.9 7.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 16896 17060 109 0.9 1074.2 1.0X
Parquet Vectorized (columnIndex) 17062 17211 114 0.9 1084.8 1.0X
================================================================================================
Pushdown benchmark for StringStartsWith
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1028 1060 29 15.3 65.3 1.0X
Parquet Vectorized (columnIndex) 849 863 12 18.5 54.0 1.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 828 835 6 19.0 52.6 1.0X
Parquet Vectorized (columnIndex) 71 77 4 220.1 4.5 11.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 832 845 10 18.9 52.9 1.0X
Parquet Vectorized (columnIndex) 70 76 3 223.4 4.5 11.8X
================================================================================================
Pushdown benchmark for decimal
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1128 1159 36 13.9 71.7 1.0X
Parquet Vectorized (columnIndex) 46 49 3 345.5 2.9 24.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 4827 4927 112 3.3 306.9 1.0X
Parquet Vectorized (columnIndex) 2238 2387 123 7.0 142.3 2.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 9972 10135 113 1.6 634.0 1.0X
Parquet Vectorized (columnIndex) 9395 9503 102 1.7 597.3 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 11283 11461 192 1.4 717.3 1.0X
Parquet Vectorized (columnIndex) 11070 11236 144 1.4 703.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1170 1181 8 13.4 74.4 1.0X
Parquet Vectorized (columnIndex) 41 43 3 380.2 2.6 28.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1962 2041 103 8.0 124.7 1.0X
Parquet Vectorized (columnIndex) 1166 1192 23 13.5 74.1 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 6211 6276 52 2.5 394.9 1.0X
Parquet Vectorized (columnIndex) 5572 5667 60 2.8 354.3 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10231 10430 302 1.5 650.5 1.0X
Parquet Vectorized (columnIndex) 9985 10291 282 1.6 634.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1746 1796 72 9.0 111.0 1.0X
Parquet Vectorized (columnIndex) 44 47 3 360.3 2.8 40.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2758 2861 66 5.7 175.4 1.0X
Parquet Vectorized (columnIndex) 1551 1581 20 10.1 98.6 1.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8324 8443 91 1.9 529.2 1.0X
Parquet Vectorized (columnIndex) 7661 7719 75 2.1 487.0 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 13847 14118 224 1.1 880.4 1.0X
Parquet Vectorized (columnIndex) 13609 13806 150 1.2 865.3 1.0X
================================================================================================
Pushdown benchmark for InSet -> InFilters
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 758 1035 581 20.8 48.2 1.0X
Parquet Vectorized (columnIndex) 110 118 7 142.6 7.0 6.9X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2871 2928 38 5.5 182.6 1.0X
Parquet Vectorized (columnIndex) 112 117 5 140.9 7.1 25.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2197 2218 16 7.2 139.7 1.0X
Parquet Vectorized (columnIndex) 111 118 6 141.9 7.0 19.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 759 779 12 20.7 48.3 1.0X
Parquet Vectorized (columnIndex) 150 155 3 105.2 9.5 5.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 3633 3674 28 4.3 231.0 1.0X
Parquet Vectorized (columnIndex) 169 181 14 93.0 10.8 21.5X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 3623 3666 29 4.3 230.3 1.0X
Parquet Vectorized (columnIndex) 166 176 6 94.6 10.6 21.8X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8220 8366 199 1.9 522.6 1.0X
Parquet Vectorized (columnIndex) 8135 8197 66 1.9 517.2 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8164 8224 48 1.9 519.1 1.0X
Parquet Vectorized (columnIndex) 8106 8212 60 1.9 515.3 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8179 8257 50 1.9 520.0 1.0X
Parquet Vectorized (columnIndex) 8176 8269 76 1.9 519.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8199 8339 133 1.9 521.3 1.0X
Parquet Vectorized (columnIndex) 8128 8247 100 1.9 516.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8075 8180 75 1.9 513.4 1.0X
Parquet Vectorized (columnIndex) 8133 8185 57 1.9 517.1 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 8121 8163 33 1.9 516.3 1.0X
Parquet Vectorized (columnIndex) 8093 8159 63 1.9 514.5 1.0X
================================================================================================
Pushdown benchmark for tinyint
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1133 1201 46 13.9 72.1 1.0X
Parquet Vectorized (columnIndex) 85 90 5 184.7 5.4 13.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1841 1860 19 8.5 117.1 1.0X
Parquet Vectorized (columnIndex) 1104 1115 10 14.2 70.2 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 5747 5818 86 2.7 365.4 1.0X
Parquet Vectorized (columnIndex) 5411 5547 108 2.9 344.1 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10143 10249 108 1.6 644.9 1.0X
Parquet Vectorized (columnIndex) 9730 9832 78 1.6 618.6 1.0X
================================================================================================
Pushdown benchmark for Timestamp
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 4023 4060 35 3.9 255.8 1.0X
Parquet Vectorized (columnIndex) 3908 4044 96 4.0 248.5 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 4758 4967 179 3.3 302.5 1.0X
Parquet Vectorized (columnIndex) 4750 4828 45 3.3 302.0 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 7682 7792 108 2.0 488.4 1.0X
Parquet Vectorized (columnIndex) 7661 7753 111 2.1 487.1 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 10538 10678 134 1.5 670.0 1.0X
Parquet Vectorized (columnIndex) 10489 10602 77 1.5 666.9 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1053 1064 7 14.9 66.9 1.0X
Parquet Vectorized (columnIndex) 41 46 5 382.0 2.6 25.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1808 1883 83 8.7 114.9 1.0X
Parquet Vectorized (columnIndex) 1112 1143 19 14.1 70.7 1.6X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 5921 6019 68 2.7 376.4 1.0X
Parquet Vectorized (columnIndex) 5411 5538 100 2.9 344.0 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 9713 9785 61 1.6 617.6 1.0X
Parquet Vectorized (columnIndex) 9670 9813 151 1.6 614.8 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1144 1152 5 13.8 72.7 1.0X
Parquet Vectorized (columnIndex) 38 43 5 413.0 2.4 30.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 1890 1960 96 8.3 120.2 1.0X
Parquet Vectorized (columnIndex) 1125 1138 13 14.0 71.5 1.7X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 5895 6148 221 2.7 374.8 1.0X
Parquet Vectorized (columnIndex) 5483 5528 71 2.9 348.6 1.1X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 9928 10021 61 1.6 631.2 1.0X
Parquet Vectorized (columnIndex) 9747 9855 85 1.6 619.7 1.0X
================================================================================================
Pushdown benchmark with many filters
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 183 191 4 0.0 182665658.0 1.0X
Parquet Vectorized (columnIndex) 180 187 6 0.0 179942365.0 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 2206 2240 35 0.0 2206072129.0 1.0X
Parquet Vectorized (columnIndex) 2206 2301 76 0.0 2206015717.0 1.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Parquet Vectorized 12239 12298 53 0.0 12239406583.0 1.0X
Parquet Vectorized (columnIndex) 12276 12502 200 0.0 12275545185.0 1.0X
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] shangxinli commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
shangxinli commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-770317824
Nice work!
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-769694751
Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/39233/
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] dongjoon-hyun closed pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
dongjoon-hyun closed pull request #31393:
URL: https://github.com/apache/spark/pull/31393
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] AmplabJenkins commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
AmplabJenkins commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-769724794
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] viirya commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
viirya commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-769936591
cc @sunchao @dongjoon-hyun
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] wangyum commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
wangyum commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-771464400
Benchmark with production data:
```sql
CREATE TABLE test11.benchmark_column_index_2 using parquet
CLUSTERED BY (FDBK_RCVR_USER_ID) SORTED BY (FDBK_GIVER_USER_ID) into 2000 buckets
AS
(SELECT * FROM test11.origin_data);
SELECT col... FROM test11.benchmark_column_index_2 WHERE FDBK_GIVER_USER_ID = 992647107 AND ...
```
Column index enabled | Column index disabled
-- | --
![image](https://user-images.githubusercontent.com/5399861/106572923-fe7a3b80-6573-11eb-89d5-47a809a1642d.png)|![image](https://user-images.githubusercontent.com/5399861/106572597-97f51d80-6573-11eb-8f21-4477b05698e3.png)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] SparkQA commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
SparkQA commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-769667687
**[Test build #134645 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/134645/testReport)** for PR 31393 at commit [`57323be`](https://github.com/apache/spark/commit/57323bece29c2c0c37f6668c99cff6db43e8448c).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org
[GitHub] [spark] wangyum commented on pull request #31393: [SPARK-34289][SQL] Parquet vectorized reader support column index
Posted by GitBox <gi...@apache.org>.
wangyum commented on pull request #31393:
URL: https://github.com/apache/spark/pull/31393#issuecomment-771464400
Benchmark with production data:
```sql
CREATE TABLE test11.benchmark_column_index_2 using parquet
CLUSTERED BY (FDBK_RCVR_USER_ID) SORTED BY (FDBK_GIVER_USER_ID) into 2000 buckets
AS
(SELECT * FROM test11.origin_data);
SELECT col... FROM test11.benchmark_column_index_2 WHERE FDBK_GIVER_USER_ID = 992647107 AND ...
```
Column index enabled | Column index disabled
-- | --
![image](https://user-images.githubusercontent.com/5399861/106572923-fe7a3b80-6573-11eb-89d5-47a809a1642d.png)|![image](https://user-images.githubusercontent.com/5399861/106572597-97f51d80-6573-11eb-8f21-4477b05698e3.png)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org