You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kyuubi.apache.org by ch...@apache.org on 2023/02/16 07:06:38 UTC

[kyuubi] branch branch-1.6 updated: [KYUUBI #4336] Avoid listing all schemas for Spark session catalog on schema pruning

This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch branch-1.6
in repository https://gitbox.apache.org/repos/asf/kyuubi.git


The following commit(s) were added to refs/heads/branch-1.6 by this push:
     new 143dac95b [KYUUBI #4336] Avoid listing all schemas for Spark session catalog on schema pruning
143dac95b is described below

commit 143dac95bf02eeef99cb16994d7894d118143f7c
Author: Cheng Pan <ch...@apache.org>
AuthorDate: Thu Feb 16 15:05:53 2023 +0800

    [KYUUBI #4336] Avoid listing all schemas for Spark session catalog on schema pruning
    
    ### _Why are the changes needed?_
    
    Some DBMS tools like DBeaver and HUE call the Thrift metadata API to list catalogs, databases, and tables. The current implementation of `CatalogShim_v3_0#getSchemas` calls `listAllNamespaces` first and then prunes the schemas on the Spark driver, which may cause a "permission denied" exception when HMS has permission control enabled, e.g. through the Ranger plugin.
    
    This PR proposes to call the HMS API (through the v1 session catalog) directly for `spark_catalog`, to avoid the above issue.
    
    ```
    2023-02-15 20:02:13.048 ERROR org.apache.kyuubi.server.KyuubiTBinaryFrontendService: Error getting schemas:
    org.apache.kyuubi.KyuubiSQLException: Error operating GetSchemas: org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:Permission denied: user [user1] does not have [SELECT] privilege on [userdb1])
            at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:134)
            at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:249)
            at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.databaseExists(ExternalCatalogWithListener.scala:69)
            at org.apache.spark.sql.catalyst.catalog.SessionCatalog.databaseExists(SessionCatalog.scala:294)
            at org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog.listNamespaces(V2SessionCatalog.scala:212)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.$anonfun$listAllNamespaces$1(CatalogShim_v3_0.scala:74)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.$anonfun$listAllNamespaces$1$adapted(CatalogShim_v3_0.scala:73)
            at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
            at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
            at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
            at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
            at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
            at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
            at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.listAllNamespaces(CatalogShim_v3_0.scala:73)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.listAllNamespaces(CatalogShim_v3_0.scala:90)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.getSchemasWithPattern(CatalogShim_v3_0.scala:118)
            at org.apache.kyuubi.engine.spark.shim.CatalogShim_v3_0.getSchemas(CatalogShim_v3_0.scala:133)
            at org.apache.kyuubi.engine.spark.operation.GetSchemas.runInternal(GetSchemas.scala:43)
            at org.apache.kyuubi.operation.AbstractOperation.run(AbstractOperation.scala:164)
            at org.apache.kyuubi.session.AbstractSession.runOperation(AbstractSession.scala:99)
            at org.apache.kyuubi.engine.spark.session.SparkSessionImpl.runOperation(SparkSessionImpl.scala:78)
            at org.apache.kyuubi.session.AbstractSession.getSchemas(AbstractSession.scala:150)
            at org.apache.kyuubi.service.AbstractBackendService.getSchemas(AbstractBackendService.scala:83)
            at org.apache.kyuubi.service.TFrontendService.GetSchemas(TFrontendService.scala:294)
            at org.apache.kyuubi.shade.org.apache.hive.service.rpc.thrift.TCLIService$Processor$GetSchemas.getResult(TCLIService.java:1617)
            at org.apache.kyuubi.shade.org.apache.hive.service.rpc.thrift.TCLIService$Processor$GetSchemas.getResult(TCLIService.java:1602)
            at org.apache.kyuubi.shade.org.apache.thrift.ProcessFunction.process(ProcessFunction.java:39)
            at org.apache.kyuubi.shade.org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:39)
            at org.apache.kyuubi.service.authentication.TSetIpAddressProcessor.process(TSetIpAddressProcessor.scala:36)
            at org.apache.kyuubi.shade.org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:286)
            at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
            at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
            at java.lang.Thread.run(Thread.java:750)
    ```
    
    ### _How was this patch tested?_
    - [ ] Add some test cases that check the changes thoroughly including negative and positive cases if possible
    
    - [ ] Add screenshots for manual tests if appropriate
    
    - [ ] [Run test](https://kyuubi.readthedocs.io/en/master/develop_tools/testing.html#running-tests) locally before making a pull request
    
    Closes #4336 from pan3793/list-schemas.
    
    Closes #4336
    
    9ece864c [Cheng Pan] fix
    f71587e9 [Cheng Pan] Avoid listing all schemas for Spark session catalog on schema prunning
    
    Authored-by: Cheng Pan <ch...@apache.org>
    Signed-off-by: Cheng Pan <ch...@apache.org>
    (cherry picked from commit 89fe835b93ea26d1c2ce5d9991a284449d16caa2)
    Signed-off-by: Cheng Pan <ch...@apache.org>
---
 .../org/apache/kyuubi/engine/spark/shim/CatalogShim_v2_4.scala   | 2 +-
 .../org/apache/kyuubi/engine/spark/shim/CatalogShim_v3_0.scala   | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v2_4.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v2_4.scala
index 3478abc66..0f6195acf 100644
--- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v2_4.scala
+++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v2_4.scala
@@ -41,7 +41,7 @@ class CatalogShim_v2_4 extends SparkCatalogShim {
       catalogName: String,
       schemaPattern: String): Seq[Row] = {
     (spark.sessionState.catalog.listDatabases(schemaPattern) ++
-      getGlobalTempViewManager(spark, schemaPattern)).map(Row(_, ""))
+      getGlobalTempViewManager(spark, schemaPattern)).map(Row(_, SparkCatalogShim.SESSION_CATALOG))
   }
 
   def setCurrentDatabase(spark: SparkSession, databaseName: String): Unit = {
diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v3_0.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v3_0.scala
index 50e641b59..a663ba636 100644
--- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v3_0.scala
+++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/shim/CatalogShim_v3_0.scala
@@ -129,13 +129,12 @@ class CatalogShim_v3_0 extends CatalogShim_v2_4 {
       spark: SparkSession,
       catalogName: String,
       schemaPattern: String): Seq[Row] = {
-    val catalog = getCatalog(spark, catalogName)
-    var schemas = getSchemasWithPattern(catalog, schemaPattern)
     if (catalogName == SparkCatalogShim.SESSION_CATALOG) {
-      val viewMgr = getGlobalTempViewManager(spark, schemaPattern)
-      schemas = schemas ++ viewMgr
+      super.getSchemas(spark, catalogName, schemaPattern)
+    } else {
+      val catalog = getCatalog(spark, catalogName)
+      getSchemasWithPattern(catalog, schemaPattern).map(Row(_, catalog.name))
     }
-    schemas.map(Row(_, catalog.name))
   }
 
   override def setCurrentDatabase(spark: SparkSession, databaseName: String): Unit = {