Posted to commits@spark.apache.org by we...@apache.org on 2018/09/17 11:34:00 UTC

spark git commit: [SPARK-25427][SQL][TEST] Add BloomFilter creation test cases

Repository: spark
Updated Branches:
  refs/heads/master 619c94901 -> 0dd61ec47


[SPARK-25427][SQL][TEST] Add BloomFilter creation test cases

## What changes were proposed in this pull request?

Spark supports BloomFilter creation for ORC files. This PR aims to add test coverage to prevent accidental regressions like [SPARK-12417](https://issues.apache.org/jira/browse/SPARK-12417).
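
As a quick illustration of the feature under test, the same `orc.bloom.filter.*` keys can also be supplied through the DataFrameWriter option API. A minimal sketch (not part of this patch; the data and output path are illustrative):

```scala
// Sketch only: write an ORC file with BloomFilters enabled on all columns.
// `orc.bloom.filter.columns` selects the columns ('*' means all) and
// `orc.bloom.filter.fpp` sets the acceptable false-positive probability.
spark.range(100)
  .selectExpr("id AS a", "CAST(id AS STRING) AS b")
  .write
  .option("orc.bloom.filter.columns", "*")
  .option("orc.bloom.filter.fpp", "0.1")
  .orc("/tmp/orc_bloom_example")
```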

## How was this patch tested?

Passed Jenkins with the newly added test cases.

Closes #22418 from dongjoon-hyun/SPARK-25427.

Authored-by: Dongjoon Hyun <do...@apache.org>
Signed-off-by: Wenchen Fan <we...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0dd61ec4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0dd61ec4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0dd61ec4

Branch: refs/heads/master
Commit: 0dd61ec47df7078fd4f77d8c58ecf26c630c700e
Parents: 619c949
Author: Dongjoon Hyun <do...@apache.org>
Authored: Mon Sep 17 19:33:51 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Mon Sep 17 19:33:51 2018 +0800

----------------------------------------------------------------------
 .../datasources/orc/OrcSourceSuite.scala        | 69 ++++++++++++++++++++
 .../spark/sql/hive/orc/HiveOrcSourceSuite.scala |  9 +++
 2 files changed, 78 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index 02bfb71..b6bb1d7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -21,7 +21,12 @@ import java.io.File
 import java.sql.Timestamp
 import java.util.Locale
 
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
 import org.apache.orc.OrcConf.COMPRESS
+import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.Stream.Kind
+import org.apache.orc.impl.RecordReaderImpl
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.Row
@@ -50,6 +55,66 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
       .createOrReplaceTempView("orc_temp_table")
   }
 
+  protected def testBloomFilterCreation(bloomFilterKind: Kind) {
+    val tableName = "bloomFilter"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.bloom.filter.columns '*',
+               |  orc.bloom.filter.fpp 0.1
+               |)
+            """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.bloom.filter.columns='*',
+               |  orc.bloom.filter.fpp=0.1
+               |)
+            """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES (1, 'str')")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // A BloomFilter is created for every column: the root `struct`, int (`a`), string (`b`)
+          val sargColumns = Array(true, true, true)
+          val orcIndex = recordReader.readRowIndex(0, null, sargColumns)
+
+          // Check the types and counts of bloom filters
+          assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind))
+          assert(orcIndex.getBloomFilterIndex.forall(_.getBloomFilterCount > 0))
+        } finally {
+          if (recordReader != null) {
+            recordReader.close()
+          }
+        }
+      }
+    }
+  }
+
   test("create temporary orc table") {
     checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
 
@@ -215,4 +280,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext {
          |)
        """.stripMargin)
   }
+
+  test("Check BloomFilter creation") {
+    testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
+  }
 }
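
The native suite asserts `BLOOM_FILTER_UTF8` because the bundled ORC writer follows ORC-101, which moved string BloomFilters to UTF-8 encoded hashes in a new stream kind. A standalone sketch of the same verification outside the test harness, assuming `path` points at a single ORC part file (the path below is hypothetical):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile
import org.apache.orc.impl.RecordReaderImpl

val path = "/tmp/orc_bloom_example/part-00000.orc" // hypothetical part file

// Open the file and drop to ORC's low-level reader to reach the row index.
val reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(new Configuration()))
val rows = reader.rows.asInstanceOf[RecordReaderImpl]
try {
  // One flag per column: the root struct, `a`, and `b`.
  val index = rows.readRowIndex(0, null, Array(true, true, true))
  index.getBloomFilterKinds.foreach(println) // BLOOM_FILTER_UTF8 with the native writer
} finally {
  rows.close()
}
```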

http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
index d84f9a3..c1ae2f6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Row}
 import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.execution.datasources.orc.OrcSuite
+import org.apache.spark.sql.hive.HiveUtils
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.types._
@@ -173,4 +174,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
       assert(msg.contains("ORC data source does not support calendarinterval data type."))
     }
   }
+
+  test("Check BloomFilter creation") {
+    Seq(true, false).foreach { convertMetastore =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") {
+        testBloomFilterCreation(org.apache.orc.OrcProto.Stream.Kind.BLOOM_FILTER) // Before ORC-101
+      }
+    }
+  }
 }
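
The Hive suite runs the same check under both values of `HiveUtils.CONVERT_METASTORE_ORC` and expects the legacy `BLOOM_FILTER` kind, since Hive 1.2.1's ORC writer predates ORC-101. A sketch of flipping the same switch in a Hive-enabled session; the public key behind `HiveUtils.CONVERT_METASTORE_ORC` is `spark.sql.hive.convertMetastoreOrc`:

```scala
// Sketch only: `spark` is a SparkSession built with .enableHiveSupport().
// true  -> ORC tables created with HiveQL are processed by the built-in ORC source
// false -> Spark uses the Hive SerDe path instead
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")
```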

