You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2020/04/24 21:45:12 UTC

[GitHub] [beam] TheNeuralBit commented on a change in pull request #11419: [BEAM-9561] Add a framework for running pandas doctests with beam dataframes.

TheNeuralBit commented on a change in pull request #11419:
URL: https://github.com/apache/beam/pull/11419#discussion_r411779740



##########
File path: sdks/python/apache_beam/dataframe/doctests.py
##########
@@ -0,0 +1,280 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A module that allows running existing pandas doctests with Beam dataframes.
+
+This module hooks into the doctesting framework by customizing providing
+a custom runner and, in particular, an OutputChecker, as well as providing
+a fake object for mocking out the pandas module.
+
+The (novel) sequence of events when running a doctest is as follows.
+
+  1. The test invokes `pd.DataFrame(...)` (or similar) and an actual dataframe
+     is computed and stashed but a Beam deferred dataframe is returned
+     in its place.
+  2. Computations are done on these "dataframes," resulting in new objects,
+     but as these are actually deferred, only expression trees are built.
+     In the background, a mapping of id -> deferred dataframe is stored for
+     each newly created dataframe.
+  3. When any dataframe is printed out, the repr has been overwritten to
+     print `Dataframe[id]`. The aforementened mapping is used to map this back
+     to the actual dataframe object, which is then computed via Beam, and its
+     the (stringified) result plugged into the actual output for comparison.
+  4. The comparison is then done on the sorted lines of the expected and actual
+     values.
+"""
+
+from __future__ import absolute_import
+
+import doctest
+import collections
+import contextlib
+import re
+
+import numpy as np
+import pandas as pd
+
+import apache_beam as beam
+from apache_beam.dataframe import expressions
+from apache_beam.dataframe.frame_base import DeferredFrame
+from apache_beam.dataframe import frames  # pylint: disable=unused-import
+from apache_beam.dataframe import transforms
+
+
+class TestEnvironment(object):
+  """A class managing the patching (of methods, inputs, and outputs) needed
+  to run and validate tests.
+  """
+  def __init__(self):
+    self._inputs = {}
+    self._all_frames = {}
+
+  def fake_pandas_module(self):
+    class FakePandas(object):
+      """A stand-in for the pandas top-level module.
+      """
+      # For now, only populated with the frame types (below).
+      # TODO(BEAM-9561): We may want to put more here.
+      pass
+
+    fake_pd = FakePandas()
+    for pandas_type, deferred_type in DeferredFrame._pandas_type_map.items():
+      setattr(
+          fake_pd,
+          pandas_type.__name__,
+          self._deferred_frame(pandas_type, deferred_type))
+
+    return fake_pd
+
+  def _deferred_frame(self, pandas_type, deferred_type):
+    """Creates a "constructor" that record the actual value as an input and
+    returns a placeholder frame in its place."""
+    def wrapper(*args, **kwargs):
+      df = pandas_type(*args, **kwargs)
+      placeholder = expressions.PlaceholderExpression(df[0:0])
+      self._inputs[placeholder] = df
+      return deferred_type(placeholder)
+
+    return wrapper
+
+  @contextlib.contextmanager
+  def _monkey_patch_type(self, deferred_type):
+    """Monkey-patch __init__ to record a pointer to all created frames, and
+    __repr__ to be able to recognize them in the doctest output.
+    """
+    try:
+      old_init, old_repr = deferred_type.__init__, deferred_type.__repr__
+
+      def new_init(df, *args, **kwargs):
+        old_init(df, *args, **kwargs)
+        self._all_frames[id(df)] = df
+
+      deferred_type.__init__ = new_init
+      deferred_type.__repr__ = lambda self: 'DeferredFrame[%s]' % id(self)
+      self._recorded_results = collections.defaultdict(list)
+      yield
+    finally:
+      deferred_type.__init__, deferred_type.__repr__ = old_init, old_repr
+
+  @contextlib.contextmanager
+  def context(self):

Review comment:
       Maybe call this `monkey_patched_context` so it's clearer what it's doing, and/or add a docstring like "Creates a context within which DeferredFrame types are monkey patched to record ids".

##########
File path: sdks/python/apache_beam/dataframe/doctests.py
##########
@@ -0,0 +1,280 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A module that allows running existing pandas doctests with Beam dataframes.
+
+This module hooks into the doctesting framework by customizing providing
+a custom runner and, in particular, an OutputChecker, as well as providing
+a fake object for mocking out the pandas module.
+
+The (novel) sequence of events when running a doctest is as follows.
+
+  1. The test invokes `pd.DataFrame(...)` (or similar) and an actual dataframe
+     is computed and stashed but a Beam deferred dataframe is returned
+     in its place.
+  2. Computations are done on these "dataframes," resulting in new objects,
+     but as these are actually deferred, only expression trees are built.
+     In the background, a mapping of id -> deferred dataframe is stored for
+     each newly created dataframe.
+  3. When any dataframe is printed out, the repr has been overwritten to
+     print `Dataframe[id]`. The aforementened mapping is used to map this back
+     to the actual dataframe object, which is then computed via Beam, and its
+     the (stringified) result plugged into the actual output for comparison.
+  4. The comparison is then done on the sorted lines of the expected and actual
+     values.
+"""
+
+from __future__ import absolute_import
+
+import doctest
+import collections
+import contextlib
+import re
+
+import numpy as np
+import pandas as pd
+
+import apache_beam as beam
+from apache_beam.dataframe import expressions
+from apache_beam.dataframe.frame_base import DeferredFrame
+from apache_beam.dataframe import frames  # pylint: disable=unused-import
+from apache_beam.dataframe import transforms
+
+
+class TestEnvironment(object):
+  """A class managing the patching (of methods, inputs, and outputs) needed
+  to run and validate tests.

Review comment:
       It would be helpful to note here that the purpose of these patches is to collect data about created frames (in `_inputs` and `_all_frames`).

##########
File path: sdks/python/apache_beam/dataframe/doctests.py
##########
@@ -0,0 +1,280 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A module that allows running existing pandas doctests with Beam dataframes.
+
+This module hooks into the doctesting framework by customizing providing
+a custom runner and, in particular, an OutputChecker, as well as providing
+a fake object for mocking out the pandas module.

Review comment:
       ... framework by ~~customizing~~ providing a custom runner and, in particular, an OutputChecker ...




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org