You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by GitBox <gi...@apache.org> on 2019/09/02 19:09:24 UTC

[GitHub] [airflow] benjamingrenier commented on a change in pull request #5566: [AIRFLOW-4935] Add method in the bigquery hook to list tables in a dataset

benjamingrenier commented on a change in pull request #5566: [AIRFLOW-4935] Add method in the bigquery hook to list tables in a dataset
URL: https://github.com/apache/airflow/pull/5566#discussion_r320035045
 
 

 ##########
 File path: airflow/contrib/hooks/bigquery_hook.py
 ##########
 @@ -1793,6 +1793,68 @@ def get_datasets_list(self, project_id=None):
 
         return datasets_list
 
+    @GoogleCloudBaseHook.catch_http_exception
+    def get_dataset_tables_list(self, dataset_id, project_id=None, table_prefix=None, max_results=None):
+        """
+        Method returns tables list of a BigQuery dataset. If table prefix is specified,
+        only tables beginning by it are returned.
+
+        .. seealso::
+            For more information, see:
+            https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list
+
+        :param dataset_id: The BigQuery Dataset ID
+        :type dataset_id: str
+        :param project_id: The GCP Project ID
+        :type project_id: str
+        :param table_prefix: Tables must begin by this prefix to be returned (case sensitive)
+        :type table_prefix: str
+        :param max_results: The maximum number of results to return in a single response page.
+            Leverage the page tokens to iterate through the entire collection.
+        :type max_results: int
+        :return: dataset_tables_list
+
+            Example of returned dataset_tables_list: ::
+
+                    [
+                       {
+                          "projectId": "your-project",
+                          "datasetId": "dataset",
+                          "tableId": "table1"
+                        },
+                        {
+                          "projectId": "your-project",
+                          "datasetId": "dataset",
+                          "tableId": "table2"
+                        }
+                    ]
+        """
+
+        dataset_project_id = project_id if project_id else self.project_id
+
+        optional_params = {}
+        if max_results:
+            optional_params['maxResults'] = max_results
+
+        request = self.service.tables().list(projectId=dataset_project_id,
+                                             datasetId=dataset_id,
+                                             **optional_params)
+        dataset_tables_list = []
+        while request is not None:
+            response = request.execute(num_retries=self.num_retries)
+
+            for table in response.get('tables', []):
+                table_ref = table.get('tableReference')
+                table_id = table_ref.get('tableId')
+                if table_id and (not table_prefix or table_id.startswith(table_prefix)):
+                    dataset_tables_list.append(table_ref)
+
+            request = self.service.tables().list_next(previous_request=request,
+                                                      previous_response=response)
+        self.log.info(dataset_tables_list)
 
 Review comment:
   I will fix it soon. I am also preparing a commit with more unit tests.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services