You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/06/30 06:58:58 UTC

[spark] branch master updated: [SPARK-39597][PYTHON] Make GetTable, TableExists and DatabaseExists in the python side support 3-layer-namespace

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new c1106fbe22c [SPARK-39597][PYTHON] Make GetTable, TableExists and DatabaseExists in the python side support 3-layer-namespace
c1106fbe22c is described below

commit c1106fbe22c6ef36b785befb25686cf64741c447
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Thu Jun 30 14:58:37 2022 +0800

    [SPARK-39597][PYTHON] Make GetTable, TableExists and DatabaseExists in the python side support 3-layer-namespace
    
    ### What changes were proposed in this pull request?
    
    Corresponding changes on the Python side for https://issues.apache.org/jira/browse/SPARK-39263
    
    1. Make TableExists and DatabaseExists support 3-layer-namespace
    2. Add GetTable on the Python side
    
    ### Why are the changes needed?
    to support 3-layer-namespace
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. In `TableExists`, when `dbName` is empty, it will first try to treat `tableName` as a multi-layer-namespace identifier.
    
    ### How was this patch tested?
    added UT
    
    Closes #36985 from zhengruifeng/py_3L_db_tbl_exist.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 python/pyspark/sql/catalog.py            | 78 +++++++++++++++++++++++++++++++-
 python/pyspark/sql/tests/test_catalog.py | 18 ++++++++
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index cd16be7ba19..17516bbfa66 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -118,6 +118,9 @@ class Catalog:
         bool
             Indicating whether the database exists
 
+        .. versionchanged:: 3.4
+           Allowed ``dbName`` to be qualified with catalog name.
+
         Examples
         --------
         >>> spark.catalog.databaseExists("test_new_database")
@@ -125,6 +128,8 @@ class Catalog:
         >>> df = spark.sql("CREATE DATABASE test_new_database")
         >>> spark.catalog.databaseExists("test_new_database")
         True
+        >>> spark.catalog.databaseExists("spark_catalog.test_new_database")
+        True
         >>> df = spark.sql("DROP DATABASE test_new_database")
         """
         return self._jcatalog.databaseExists(dbName)
@@ -164,6 +169,47 @@ class Catalog:
             )
         return tables
 
+    def getTable(self, tableName: str) -> Table:
+        """Get the table or view with the specified name. This table can be a temporary view or a
+        table/view. This throws an AnalysisException when no Table can be found.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        tableName : str
+                    name of the table to get.
+
+        Examples
+        --------
+        >>> df = spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.getTable("tab1")
+        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
+        >>> spark.catalog.getTable("default.tab1")
+        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
+        >>> spark.catalog.getTable("spark_catalog.default.tab1")
+        Table(name='tab1', catalog='spark_catalog', namespace=['default'], ...
+        >>> df = spark.sql("DROP TABLE tab1")
+        >>> spark.catalog.getTable("tab1")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
+        """
+        jtable = self._jcatalog.getTable(tableName)
+        jnamespace = jtable.namespace()
+        if jnamespace is not None:
+            namespace = [jnamespace[i] for i in range(0, len(jnamespace))]
+        else:
+            namespace = None
+        return Table(
+            name=jtable.name(),
+            catalog=jtable.catalog(),
+            namespace=namespace,
+            description=jtable.description(),
+            tableType=jtable.tableType(),
+            isTemporary=jtable.isTemporary(),
+        )
+
     @since(2.0)
     def listFunctions(self, dbName: Optional[str] = None) -> List[Function]:
         """Returns a list of functions registered in the specified database.
@@ -255,15 +301,23 @@ class Catalog:
         ----------
         tableName : str
                     name of the table to check existence
+                    If no database is specified, first try to treat ``tableName`` as a
+                    multi-layer-namespace identifier, then try to treat ``tableName`` as a
+                    normal table name in the current database if necessary.
         dbName : str, optional
                  name of the database to check table existence in.
-                 If no database is specified, the current database is used
+
+           .. deprecated:: 3.4.0
+
 
         Returns
         -------
         bool
             Indicating whether the table/view exists
 
+        .. versionchanged:: 3.4
+           Allowed ``tableName`` to be qualified with catalog name when ``dbName`` is None.
+
         Examples
         --------
 
@@ -274,6 +328,12 @@ class Catalog:
         >>> df = spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet")
         >>> spark.catalog.tableExists("tab1")
         True
+        >>> spark.catalog.tableExists("default.tab1")
+        True
+        >>> spark.catalog.tableExists("spark_catalog.default.tab1")
+        True
+        >>> spark.catalog.tableExists("tab1", "default")
+        True
         >>> df = spark.sql("DROP TABLE tab1")
         >>> spark.catalog.tableExists("unexisting_table")
         False
@@ -285,6 +345,12 @@ class Catalog:
         >>> df = spark.sql("CREATE VIEW view1 AS SELECT 1")
         >>> spark.catalog.tableExists("view1")
         True
+        >>> spark.catalog.tableExists("default.view1")
+        True
+        >>> spark.catalog.tableExists("spark_catalog.default.view1")
+        True
+        >>> spark.catalog.tableExists("view1", "default")
+        True
         >>> df = spark.sql("DROP VIEW view1")
         >>> spark.catalog.tableExists("view1")
         False
@@ -298,7 +364,15 @@ class Catalog:
         >>> spark.catalog.tableExists("view1")
         False
         """
-        return self._jcatalog.tableExists(dbName, tableName)
+        if dbName is None:
+            return self._jcatalog.tableExists(tableName)
+        else:
+            warnings.warn(
+                "`dbName` has been deprecated since Spark 3.4 and might be removed in "
+                "a future version. Use tableExists(`dbName.tableName`) instead.",
+                FutureWarning,
+            )
+            return self._jcatalog.tableExists(dbName, tableName)
 
     def createExternalTable(
         self,
diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py
index b2bf83d7a0d..57c071eea35 100644
--- a/python/pyspark/sql/tests/test_catalog.py
+++ b/python/pyspark/sql/tests/test_catalog.py
@@ -50,6 +50,8 @@ class CatalogTests(ReusedSQLTestCase):
             self.assertFalse(spark.catalog.databaseExists("some_db"))
             spark.sql("CREATE DATABASE some_db")
             self.assertTrue(spark.catalog.databaseExists("some_db"))
+            self.assertTrue(spark.catalog.databaseExists("spark_catalog.some_db"))
+            self.assertFalse(spark.catalog.databaseExists("spark_catalog.some_db2"))
 
     def test_list_tables(self):
         from pyspark.sql.catalog import Table
@@ -316,9 +318,25 @@ class CatalogTests(ReusedSQLTestCase):
                 self.assertFalse(spark.catalog.tableExists("tab2", "some_db"))
                 spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet")
                 self.assertTrue(spark.catalog.tableExists("tab1"))
+                self.assertTrue(spark.catalog.tableExists("default.tab1"))
+                self.assertTrue(spark.catalog.tableExists("spark_catalog.default.tab1"))
+                self.assertTrue(spark.catalog.tableExists("tab1", "default"))
                 spark.sql("CREATE TABLE some_db.tab2 (name STRING, age INT) USING parquet")
+                self.assertFalse(spark.catalog.tableExists("tab2"))
+                self.assertTrue(spark.catalog.tableExists("some_db.tab2"))
+                self.assertTrue(spark.catalog.tableExists("spark_catalog.some_db.tab2"))
                 self.assertTrue(spark.catalog.tableExists("tab2", "some_db"))
 
+    def test_get_table(self):
+        spark = self.spark
+        with self.database("some_db"):
+            spark.sql("CREATE DATABASE some_db")
+            with self.table("tab1"):
+                spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet")
+                self.assertEqual(spark.catalog.getTable("tab1").database, "default")
+                self.assertEqual(spark.catalog.getTable("default.tab1").catalog, "spark_catalog")
+                self.assertEqual(spark.catalog.getTable("spark_catalog.default.tab1").name, "tab1")
+
 
 if __name__ == "__main__":
     import unittest


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org