You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2021/07/27 20:22:58 UTC
[GitHub] [beam] kmjung commented on a change in pull request #15185: [BEAM-10917] Add support for BigQuery Read API in Python BEAM

kmjung commented on a change in pull request #15185:
URL: https://github.com/apache/beam/pull/15185#discussion_r677622943



##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API."""
+  def __init__(
+      self,
+      project=None,
+      dataset=None,
+      table=None,
+      selected_fields=None,
+      row_restriction=None,
+      pipeline_options=None):
+
+    if table is None:
+      raise ValueError('A BigQuery table must be specified')
+    else:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+
+    self.project = self.table_reference.projectId
+    self.dataset = self.table_reference.datasetId
+    self.table = self.table_reference.tableId
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    # The maximum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size.
+    self.MAX_SPLIT_COUNT = 10000
+    # The minimum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size. Note that the server may
+    # still choose to return fewer than ten streams based on the layout of the
+    # table.
+    self.MIN_SPLIT_COUNT = 10
+
+  def _get_project(self):
+    """Returns the project that will be billed."""
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, table, dataset, project):
+    if project is None:
+      project = self._get_project()
+
+    bq = bigquery_tools.BigQueryWrapper()
+    table = bq.get_table(project, dataset, table)
+    return table.numBytes
+
+  def display_data(self):
+    return {
+        'project': str(self.project),
+        'dataset': str(self.dataset),
+        'table': str(self.table),
+        'export_file_format': 'AVRO',

Review comment:
       Nit: I don't think that 'export_file_format' is accurate or provides value here.

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API."""
+  def __init__(
+      self,
+      project=None,
+      dataset=None,
+      table=None,
+      selected_fields=None,
+      row_restriction=None,
+      pipeline_options=None):
+
+    if table is None:

Review comment:
       Validate project/dataset if they are required params.

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API."""
+  def __init__(
+      self,
+      project=None,

Review comment:
       Nit: do you need default values for project/dataset/table?

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API."""
+  def __init__(
+      self,
+      project=None,
+      dataset=None,
+      table=None,
+      selected_fields=None,
+      row_restriction=None,
+      pipeline_options=None):
+
+    if table is None:
+      raise ValueError('A BigQuery table must be specified')
+    else:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+
+    self.project = self.table_reference.projectId
+    self.dataset = self.table_reference.datasetId
+    self.table = self.table_reference.tableId
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    # The maximum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size.
+    self.MAX_SPLIT_COUNT = 10000
+    # The minimum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size. Note that the server may
+    # still choose to return fewer than ten streams based on the layout of the
+    # table.
+    self.MIN_SPLIT_COUNT = 10
+
+  def _get_project(self):
+    """Returns the project that will be billed."""
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, table, dataset, project):
+    if project is None:
+      project = self._get_project()
+
+    bq = bigquery_tools.BigQueryWrapper()
+    table = bq.get_table(project, dataset, table)

Review comment:
       Nit: since we currently do this wrong on the Java side -- what happens if the table doesn't exist?

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):

Review comment:
       Naming nit: this isn't a stream source, right? It's a base class for either a table source or (eventually) a query source, but neither of these directly corresponds to a stream.

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -883,6 +890,184 @@ def _export_files(self, bq):
     return table.schema, metadata_list
 
 
+class _CustomBigQueryStorageStreamSourceBase(BoundedSource):
+  """A base class for BoundedSource implementations which read from BigQuery
+  using the BigQuery Storage API."""
+  def __init__(
+      self,
+      project=None,
+      dataset=None,
+      table=None,
+      selected_fields=None,
+      row_restriction=None,
+      pipeline_options=None):
+
+    if table is None:
+      raise ValueError('A BigQuery table must be specified')
+    else:
+      self.table_reference = bigquery_tools.parse_table_reference(
+          table, dataset, project)
+
+    self.project = self.table_reference.projectId
+    self.dataset = self.table_reference.datasetId
+    self.table = self.table_reference.tableId
+    self.selected_fields = selected_fields
+    self.row_restriction = row_restriction
+    self.pipeline_options = pipeline_options
+    self.split_result = None
+    # The maximum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size.
+    self.MAX_SPLIT_COUNT = 10000
+    # The minimum number of streams which will be requested when creating a read
+    # session, regardless of the desired bundle size. Note that the server may
+    # still choose to return fewer than ten streams based on the layout of the
+    # table.
+    self.MIN_SPLIT_COUNT = 10
+
+  def _get_project(self):
+    """Returns the project that will be billed."""
+    project = self.pipeline_options.view_as(GoogleCloudOptions).project
+    if isinstance(project, vp.ValueProvider):
+      project = project.get()
+    if not project:
+      project = self.project
+    return project
+
+  def _get_table_size(self, table, dataset, project):
+    if project is None:
+      project = self._get_project()
+
+    bq = bigquery_tools.BigQueryWrapper()
+    table = bq.get_table(project, dataset, table)
+    return table.numBytes
+
+  def display_data(self):
+    return {
+        'project': str(self.project),
+        'dataset': str(self.dataset),
+        'table': str(self.table),
+        'export_file_format': 'AVRO',
+    }
+
+  def estimate_size(self):
+    # The size of stream source cannot be estimate due to server-side liquid
+    # sharding
+    return None
+
+  def split(self, desired_bundle_size, start_position=None, stop_position=None):
+    requested_session = bq_storage.types.ReadSession()
+    requested_session.table = 'projects/{}/datasets/{}/tables/{}'.format(
+        self.project,
+        self.dataset,
+        self.table)
+    requested_session.data_format = bq_storage.types.DataFormat.AVRO
+
+    if (self.selected_fields is not None or self.row_restriction is not None):
+      table_read_options = requested_session.TableReadOptions()
+      if self.selected_fields is not None:
+        table_read_options.selected_fields = self.selected_fields
+      if self.row_restriction is not None:
+        table_read_options.row_restriction = self.row_restriction
+      requested_session.read_options = table_read_options
+
+    storage_client = bq_storage.BigQueryReadClient()
+    stream_count = 0
+    if (desired_bundle_size > 0):
+      stream_count = min(
+          int(
+              self._get_table_size(self.table, self.dataset, self.project) /
+              desired_bundle_size),
+          self.MAX_SPLIT_COUNT)
+
+    stream_count = max(stream_count, self.MIN_SPLIT_COUNT)
+    parent = 'projects/{}'.format(self.project)
+    read_session = storage_client.create_read_session(
+        parent=parent,
+        read_session=requested_session,
+        max_stream_count=stream_count)
+    self.split_result = [
+        _CustomBigQueryStorageStreamSource(stream.name)
+        for stream in read_session.streams
+    ]
+
+    for source in self.split_result:
+      yield SourceBundle(1.0, source, None, None)
+
+  def get_range_tracker(self, start_position, stop_position):
+    class CustomBigQuerySourceRangeTracker(RangeTracker):
+      """A RangeTracker that always returns positions as None."""
+      def start_position(self):
+        return None
+
+      def stop_position(self):
+        return None
+
+    return CustomBigQuerySourceRangeTracker()
+
+  def read(self, range_tracker):
+    raise NotImplementedError(
+        'BigQuery storage source must be split before being read')
+
+
+class _CustomBigQueryStorageStreamSource(BoundedSource):
+  """A source representing a single stream in a read session."""
+  def __init__(self, read_stream_name):
+    self.read_stream_name = read_stream_name
+    self.deserialized_rows = []
+
+  def display_data(self):
+    return {
+        'read_stream': str(self.read_stream_name),
+    }
+
+  def estimate_size(self):
+    # The size of stream source cannot be estimate due to server-side liquid
+    # sharding
+    return None
+
+  def split(self, desired_bundle_size, start_position=None, stop_position=None):
+    # A stream source can't be split without reading from it due to
+    # server-side liquid sharding.
+    raise NotImplementedError('BigQuery storage stream source cannot be split.')
+
+  def get_range_tracker(self, start_position, stop_position):
+    if start_position is None:
+      # Defaulting to the start of the stream.
+      start_position = 0
+
+    # Since the streams are unsplittable we choose OFFSET_INFINITY as the
+    # default end offset so that all data of the source gets read.
+    stop_position = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
+    range_tracker = range_trackers.OffsetRangeTracker(
+        start_position, stop_position)
+    # Ensuring that all try_split() calls will be ignored by the Rangetracker.
+    range_tracker = range_trackers.UnsplittableRangeTracker(range_tracker)
+
+    return range_tracker
+
+  def deserialize_rows(self, read_rows_response, reader):

Review comment:
       What happened to the idea of deserializing a single row at a time?

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -1197,8 +1382,9 @@ def start_bundle(self):
 
     (
         bigquery_tools.BigQueryWrapper.HISTOGRAM_METRIC_LOGGER.
-        minimum_logging_frequency_msec

Review comment:
       I'm not sure that I understand what's happening here.

##########
File path: sdks/python/apache_beam/io/gcp/bigquery.py
##########
@@ -1920,45 +2109,55 @@ def __init__(self, gcs_location=None, *args, **kwargs):
         gcs_location = StaticValueProvider(str, gcs_location)
 
     self.gcs_location = gcs_location
-
+    self.method = method or ReadFromBigQuery.Method.EXPORT
     self._args = args
     self._kwargs = kwargs
 
   def expand(self, pcoll):
     # TODO(BEAM-11115): Make ReadFromBQ rely on ReadAllFromBQ implementation.
-    temp_location = pcoll.pipeline.options.view_as(
-        GoogleCloudOptions).temp_location
-    job_name = pcoll.pipeline.options.view_as(GoogleCloudOptions).job_name
-    gcs_location_vp = self.gcs_location
-    unique_id = str(uuid.uuid4())[0:10]
-
-    def file_path_to_remove(unused_elm):
-      gcs_location = bigquery_export_destination_uri(
-          gcs_location_vp, temp_location, unique_id, True)
-      return gcs_location + '/'
-
-    files_to_remove_pcoll = beam.pvalue.AsList(
-        pcoll.pipeline
-        | 'FilesToRemoveImpulse' >> beam.Create([None])
-        | 'MapFilesToRemove' >> beam.Map(file_path_to_remove))
-
-    try:
-      step_name = self.label
-    except AttributeError:
-      step_name = 'ReadFromBigQuery_%d' % ReadFromBigQuery.COUNTER
-      ReadFromBigQuery.COUNTER += 1
-    return (
-        pcoll
-        | beam.io.Read(
-            _CustomBigQuerySource(
-                gcs_location=self.gcs_location,
-                pipeline_options=pcoll.pipeline.options,
-                job_name=job_name,
-                step_name=step_name,
-                unique_id=unique_id,
-                *self._args,
-                **self._kwargs))
-        | _PassThroughThenCleanup(files_to_remove_pcoll))
+    if self.method is ReadFromBigQuery.Method.EXPORT:
+      temp_location = pcoll.pipeline.options.view_as(
+          GoogleCloudOptions).temp_location
+      job_name = pcoll.pipeline.options.view_as(GoogleCloudOptions).job_name
+      gcs_location_vp = self.gcs_location
+      unique_id = str(uuid.uuid4())[0:10]
+
+      def file_path_to_remove(unused_elm):
+        gcs_location = bigquery_export_destination_uri(
+            gcs_location_vp, temp_location, unique_id, True)
+        return gcs_location + '/'
+
+      files_to_remove_pcoll = beam.pvalue.AsList(
+          pcoll.pipeline
+          | 'FilesToRemoveImpulse' >> beam.Create([None])
+          | 'MapFilesToRemove' >> beam.Map(file_path_to_remove))
+
+      try:
+        step_name = self.label
+      except AttributeError:
+        step_name = 'ReadFromBigQuery_%d' % ReadFromBigQuery.COUNTER
+        ReadFromBigQuery.COUNTER += 1
+      return (
+          pcoll
+          | beam.io.Read(
+              _CustomBigQuerySource(
+                  gcs_location=self.gcs_location,
+                  pipeline_options=pcoll.pipeline.options,
+                  job_name=job_name,
+                  step_name=step_name,
+                  unique_id=unique_id,
+                  *self._args,
+                  **self._kwargs))
+          | _PassThroughThenCleanup(files_to_remove_pcoll))
+
+    elif self.method is ReadFromBigQuery.Method.DIRECT_READ:
+      return (
+          pcoll
+          | beam.io.Read(
+              _CustomBigQueryStorageStreamSourceBase(
+                  pipeline_options=pcoll.pipeline.options,
+                  *self._args,
+                  **self._kwargs)))

Review comment:
       ... else raise an error?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@beam.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org