You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/09/17 11:34:00 UTC
spark git commit: [SPARK-25427][SQL][TEST] Add BloomFilter creation
test cases
Repository: spark
Updated Branches:
refs/heads/master 619c94901 -> 0dd61ec47
[SPARK-25427][SQL][TEST] Add BloomFilter creation test cases
## What changes were proposed in this pull request?
Spark supports BloomFilter creation for ORC files. This PR aims to add test coverage to prevent accidental regressions like [SPARK-12417](https://issues.apache.org/jira/browse/SPARK-12417).
## How was this patch tested?
Passes Jenkins with the newly added test cases.
Closes #22418 from dongjoon-hyun/SPARK-25427.
Authored-by: Dongjoon Hyun <do...@apache.org>
Signed-off-by: Wenchen Fan <we...@databricks.com>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0dd61ec4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0dd61ec4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0dd61ec4
Branch: refs/heads/master
Commit: 0dd61ec47df7078fd4f77d8c58ecf26c630c700e
Parents: 619c949
Author: Dongjoon Hyun <do...@apache.org>
Authored: Mon Sep 17 19:33:51 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Mon Sep 17 19:33:51 2018 +0800
----------------------------------------------------------------------
.../datasources/orc/OrcSourceSuite.scala | 69 ++++++++++++++++++++
.../spark/sql/hive/orc/HiveOrcSourceSuite.scala | 9 +++
2 files changed, 78 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index 02bfb71..b6bb1d7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -21,7 +21,12 @@ import java.io.File
import java.sql.Timestamp
import java.util.Locale
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
import org.apache.orc.OrcConf.COMPRESS
+import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.Stream.Kind
+import org.apache.orc.impl.RecordReaderImpl
import org.scalatest.BeforeAndAfterAll
import org.apache.spark.sql.Row
@@ -50,6 +55,66 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
.createOrReplaceTempView("orc_temp_table")
}
+ protected def testBloomFilterCreation(bloomFilterKind: Kind) { // Writes one row to a bloom-filter-enabled ORC table and asserts the file's bloom filters are all of bloomFilterKind.
+ val tableName = "bloomFilter"
+ // NOTE(review): orcImp is defined outside this hunk; only the values "native" and "hive" are handled below.
+ withTempDir { dir =>
+ withTable(tableName) {
+ val sqlStatement = orcImp match {
+ case "native" => // data source syntax: bloom filter settings are passed through OPTIONS
+ s"""
+ |CREATE TABLE $tableName (a INT, b STRING)
+ |USING ORC
+ |OPTIONS (
+ | path '${dir.toURI}',
+ | orc.bloom.filter.columns '*',
+ | orc.bloom.filter.fpp 0.1
+ |)
+ """.stripMargin
+ case "hive" => // Hive syntax: bloom filter settings are passed through TBLPROPERTIES
+ s"""
+ |CREATE TABLE $tableName (a INT, b STRING)
+ |STORED AS ORC
+ |LOCATION '${dir.toURI}'
+ |TBLPROPERTIES (
+ | orc.bloom.filter.columns='*',
+ | orc.bloom.filter.fpp=0.1
+ |)
+ """.stripMargin
+ case impl => // any other implementation name fails the test immediately
+ throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+ }
+ // Create the table and insert a single row; the assertion below expects exactly one data file in dir.
+ sql(sqlStatement)
+ sql(s"INSERT INTO $tableName VALUES (1, 'str')")
+ // Keep only regular files whose names do not start with "." or "_" (presumably checksum/marker files - TODO confirm).
+ val partFiles = dir.listFiles()
+ .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+ assert(partFiles.length === 1)
+ // Open the single data file directly through the ORC reader API to inspect its index streams.
+ val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+ val readerOptions = OrcFile.readerOptions(new Configuration())
+ val reader = OrcFile.createReader(orcFilePath, readerOptions)
+ var recordReader: RecordReaderImpl = null
+ try {
+ recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+ // BloomFilter array is created for all types; `struct`, int (`a`), string (`b`)
+ val sargColumns = Array(true, true, true)
+ val orcIndex = recordReader.readRowIndex(0, null, sargColumns) // NOTE(review): 0 is presumably the stripe index and null the included-columns default - confirm against ORC docs
+
+ // Check the types and counts of bloom filters
+ assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind))
+ assert(orcIndex.getBloomFilterIndex.forall(_.getBloomFilterCount > 0))
+ } finally {
+ if (recordReader != null) { // still null when reader.rows or the cast threw before the assignment
+ recordReader.close()
+ }
+ }
+ }
+ }
+ }
+
test("create temporary orc table") {
checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
@@ -215,4 +280,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext {
|)
""".stripMargin)
}
+
+ test("Check BloomFilter creation") { // Runs the shared bloom filter check, expecting the BLOOM_FILTER_UTF8 stream kind in this suite.
+ testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
index d84f9a3..c1ae2f6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Row}
import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.datasources.orc.OrcSuite
+import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.HiveSerDe
import org.apache.spark.sql.types._
@@ -173,4 +174,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
assert(msg.contains("ORC data source does not support calendarinterval data type."))
}
}
+
+ test("Check BloomFilter creation") { // Runs the shared bloom filter check, expecting the BLOOM_FILTER stream kind in this suite.
+ Seq(true, false).foreach { convertMetastore => // run once with CONVERT_METASTORE_ORC set to true and once set to false
+ withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") {
+ testBloomFilterCreation(org.apache.orc.OrcProto.Stream.Kind.BLOOM_FILTER) // Before ORC-101
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org