Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/08/12 18:50:47 UTC

[GitHub] [spark] amaliujia commented on a diff in pull request #37490: [SPARK-40051][PYTHON][SQL][DOCS] Make pyspark.sql.catalog examples self-contained

amaliujia commented on code in PR #37490:
URL: https://github.com/apache/spark/pull/37490#discussion_r944730697


##########
python/pyspark/sql/catalog.py:
##########
@@ -108,13 +108,22 @@ def setCurrentCatalog(self, catalogName: str) -> None:
         ----------
         catalogName : str
             name of the catalog to set
+
+        Examples
+        --------
+        >>> spark.catalog.setCurrentCatalog("spark_catalog")
         """
         return self._jcatalog.setCurrentCatalog(catalogName)
 
     def listCatalogs(self) -> List[CatalogMetadata]:
         """Returns a list of catalogs in this session.
 
         .. versionadded:: 3.4.0
+
+        Returns
+        -------
+        list
+            A list of :class:`CatalogMetadata`.

Review Comment:
   Seeing that you added
   ```
           >>> spark.catalog.listDatabases()
           [Database(name='default', catalog='spark_catalog', description='default database', ...
   ```
   
   why not do the same for this API?
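
   For instance, a hedged sketch of such a doctest (the exact `CatalogMetadata` repr is an assumption, elided with `...` in the same style as above):
   ```
           >>> spark.catalog.listCatalogs()
           [CatalogMetadata(name='spark_catalog', description=...
   ```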



##########
python/pyspark/sql/catalog.py:
##########
@@ -251,19 +329,31 @@ def getTable(self, tableName: str) -> Table:
         Parameters
         ----------
         tableName : str
-                    name of the table to check existence.
+            name of the table to get.

Review Comment:
   add
   ```
               .. versionchanged:: 3.4.0
                  Allow `tableName` to be qualified with catalog name.
   ```
   ?
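
   For reference, a hedged sketch of the qualified-name call such a note would document (the table name and the elided repr are assumptions):
   ```
               >>> spark.catalog.getTable("spark_catalog.default.tbl1")
               Table(name='tbl1', catalog='spark_catalog', namespace=['default'], ...
   ```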



##########
python/pyspark/sql/catalog.py:
##########
@@ -674,59 +876,267 @@ def registerFunction(
         warnings.warn("Deprecated in 2.3.0. Use spark.udf.register instead.", FutureWarning)
         return self._sparkSession.udf.register(name, f, returnType)
 
-    @since(2.0)
     def isCached(self, tableName: str) -> bool:
-        """Returns true if the table is currently cached in-memory.
+        """
+        Returns true if the table is currently cached in-memory.
+
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to check.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        True
+
+        Throws an analysis exception when the table does not exist.
+
+        >>> spark.catalog.isCached("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.isCached("spark_catalog.default.tbl1")
+        True
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         return self._jcatalog.isCached(tableName)
 
-    @since(2.0)
     def cacheTable(self, tableName: str) -> None:
         """Caches the specified table in-memory.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to cache.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+
+        Throws an analysis exception when the table does not exist.
+
+        >>> spark.catalog.cacheTable("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.cacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.cacheTable(tableName)
 
-    @since(2.0)
     def uncacheTable(self, tableName: str) -> None:
         """Removes the specified table from the in-memory cache.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to uncache.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.cacheTable("tbl1")
+        >>> spark.catalog.uncacheTable("tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+
+        Throws an analysis exception when the table does not exist.
+
+        >>> spark.catalog.uncacheTable("not_existing_table")
+        Traceback (most recent call last):
+            ...
+        pyspark.sql.utils.AnalysisException: ...
+
+        Using the fully qualified name for the table.
+
+        >>> spark.catalog.uncacheTable("spark_catalog.default.tbl1")
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
         """
         self._jcatalog.uncacheTable(tableName)
 
-    @since(2.0)
     def clearCache(self) -> None:
-        """Removes all cached tables from the in-memory cache."""
+        """Removes all cached tables from the in-memory cache.
+
+        .. versionadded:: 2.0.0
+
+        Examples
+        --------
+        >>> _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        >>> _ = spark.sql("CREATE TABLE tbl1 (name STRING, age INT) USING parquet")
+        >>> spark.catalog.clearCache()
+        >>> spark.catalog.isCached("tbl1")
+        False
+        >>> _ = spark.sql("DROP TABLE tbl1")
+        """
         self._jcatalog.clearCache()
 
-    @since(2.0)
     def refreshTable(self, tableName: str) -> None:
         """Invalidates and refreshes all the cached data and metadata of the given table.
 
-        .. versionchanged:: 3.4
-           Allowed ``tableName`` to be qualified with catalog name.
+        .. versionadded:: 2.0.0
+
+        Parameters
+        ----------
+        tableName : str
+            name of the table to refresh.
+
+            .. versionchanged:: 3.4.0
+                Allow ``tableName`` to be qualified with catalog name.
+
+        Examples
+        --------
+        The example below caches a table, and then removes the underlying data.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     _ = spark.sql("DROP TABLE IF EXISTS tbl1")
+        ...     _ = spark.sql("CREATE TABLE tbl1 (col STRING) USING TEXT LOCATION '{}'".format(d))
+        ...     _ = spark.sql("INSERT INTO tbl1 SELECT 'abc'")
+        ...     spark.catalog.cacheTable("tbl1")
+        ...     spark.table("tbl1").show()
+        +---+
+        |col|
+        +---+
+        |abc|
+        +---+
+
+        Because the table is cached, the count below is computed from the cached data.
+
+        >>> spark.table("tbl1").count()
+        1
+
+        After refreshing the table, the count becomes 0 because the underlying data no longer exists.
+
+        >>> spark.catalog.refreshTable("tbl1")

Review Comment:
   The data files were removed because the table location is a `tempfile.TemporaryDirectory()`, and once its lifecycle ends, the directory is removed?
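
   For what it's worth, a minimal standalone demonstration of that lifecycle (plain Python, no Spark involved):
   ```
   >>> import os, tempfile
   >>> with tempfile.TemporaryDirectory() as d:
   ...     os.path.isdir(d)
   True
   >>> os.path.isdir(d)  # the directory, and any data files in it, are gone once the block exits
   False
   ```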



##########
python/pyspark/sql/catalog.py:
##########
@@ -613,47 +786,76 @@ def createTable(
             df = self._jcatalog.createTable(tableName, source, scala_datatype, description, options)
         return DataFrame(df, self._sparkSession)
 
-    def dropTempView(self, viewName: str) -> None:
+    def dropTempView(self, viewName: str) -> bool:

Review Comment:
   Is this an API change or a user-facing behavior change?
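
   If the JVM call already returned a boolean, this may be an annotation fix rather than a behavior change. A hedged sketch of how the boolean surfaces to users (the view name is hypothetical):
   ```
   >>> spark.range(1).createTempView("my_view")
   >>> spark.catalog.dropTempView("my_view")  # True: the view existed and was dropped
   True
   >>> spark.catalog.dropTempView("my_view")  # False: it no longer exists
   False
   ```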



##########
python/pyspark/sql/catalog.py:
##########
@@ -327,25 +434,33 @@ def functionExists(self, functionName: str, dbName: Optional[str] = None) -> boo
         ----------
         functionName : str
             name of the function to check existence
+
+            .. versionchanged:: 3.4.0
+               Allow ``functionName`` to be qualified with catalog name
+
         dbName : str, optional
             name of the database to check function existence in.
-            If no database is specified, the current database is used
 
            .. deprecated:: 3.4.0

Review Comment:
   Same for other places if this makes sense.



##########
python/pyspark/sql/catalog.py:
##########
@@ -327,25 +434,33 @@ def functionExists(self, functionName: str, dbName: Optional[str] = None) -> boo
         ----------
         functionName : str
             name of the function to check existence
+
+            .. versionchanged:: 3.4.0
+               Allow ``functionName`` to be qualified with catalog name
+
         dbName : str, optional
             name of the database to check function existence in.
-            If no database is specified, the current database is used
 
            .. deprecated:: 3.4.0

Review Comment:
   Should we not mention that this is `deprecated`?
   
   On the Scala side, per the discussion with the community, we don't use the `@deprecated` annotation but just recommend that users use another API instead: https://github.com/databricks/runtime/blob/ef75e00fb2bd8d30aafae1eb281e6e9d0432d590/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala#L221
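
   If we follow that convention here, one hedged alternative (suggested wording only, not final text) would be to replace the directive with a plain recommendation:
   ```
           dbName : str, optional
               name of the database to check function existence in.
               Prefer passing a fully qualified ``functionName``
               (e.g. ``spark_catalog.default.my_func``) instead.
   ```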



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

