You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by "damccorm (via GitHub)" <gi...@apache.org> on 2023/05/17 17:41:24 UTC
[GitHub] [beam] damccorm commented on a diff in pull request #26690: Colab Notebook Integration Tests

damccorm commented on code in PR #26690:
URL: https://github.com/apache/beam/pull/26690#discussion_r1196834461


##########
.test-infra/jenkins/job_PreCommit_Python_Colab.groovy:
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import PrecommitJobBuilder
+
+PrecommitJobBuilder builder = new PrecommitJobBuilder(
+    scope: this,
+    nameBase: 'Python_Notebooks',
+    gradleTask: ':pythonPreCommit',
+    gradleSwitches: [
+      '-Pposargs=apache_beam/testing/'

Review Comment:
   Should this be `apache_beam/testing/notebooks`?



##########
sdks/python/apache_beam/testing/notebooks/runner.py:
##########
@@ -0,0 +1,161 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Executes notebook tests in current enviornment.
+
+Example usage:
+  Run all specs in ./base/:   python runner.py base
+  Run single test spec:       python runner.py --single-test base/testing.spec
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import glob
+import itertools
+import os
+import sys
+from collections import OrderedDict
+
+import yaml
+
+import apache_beam as beam
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+# import testlib
+
+IS_BEAM_DEV = False
+if "dev" in beam.__version__:
+  IS_BEAM_DEV = True
+
+
+def _skip_test(spec):
+  if IS_BEAM_DEV and "sql" in str(spec).lower():
+    return True
+  return False
+
+
+class Executor:
+  """Loads a single test spec, executes and compares target notebook."""
+  def __init__(self, spec_path, timeout_secs):
+    with open(spec_path, "r") as spec_f:
+      self.spec = yaml.safe_load(spec_f)
+    # notebook in spec relative to spec's location
+    self.dir = os.path.dirname(spec_path)
+    self.notebook_path = os.path.join(self.dir, self.spec["notebook"])
+    self.tests = self.spec["tests"]
+    self.timeout_secs = timeout_secs
+
+  def execute_tests(self):
+    """Executes notebook and compares to test spec.
+
+    Returns:
+      # of tests, # of errors, error_dict
+      where error_dict maps (cell number, test class, expected output) to string
+    """
+    # load and execute notebook using nbconvert
+    with open(self.notebook_path, "r") as nb_f:
+      nb = nbformat.read(nb_f, as_version=4)
+    ExecutePreprocessor.timeout = self.timeout_secs
+    ep = ExecutePreprocessor(allow_errors=True)
+    # Executes full notebook, result is classic JSON representation
+    exec_nb, _ = ep.preprocess(nb, {"metadata": {"path": self.dir + "/"}})
+    test_count = 0
+    error_count = 0
+    errors = OrderedDict()
+    code_cells = {}
+    for cell in exec_nb["cells"]:
+      if cell["cell_type"] == "code":
+        code_cells[cell["execution_count"]] = cell
+    # generate test cases for each line, substituting in the output
+    for cell_num in sorted(self.tests.keys()):
+      if cell_num not in code_cells:
+        test_count += 1
+        error_count += 1
+        errors[(cell_num, "", "")] = "Given cell does not exist."
+      else:
+        cell = code_cells[cell_num]
+        for test in self.tests[cell_num]:
+          cls, setup = list(test.items())[0]
+          test_count += 1
+          try:
+            getattr(sys.modules["testlib"], cls)(setup).check(cell)
+          except Exception as e:
+            error_count += 1
+            errors[(cell_num, cls, setup)] = str(e)
+    return test_count, error_count, errors
+
+
+def main():
+  """Executes tests.
+
+  Return codes:
+    0 if all tests passed, 1 otherwise
+    2 if no tests were found
+  """
+  parser = argparse.ArgumentParser("Notebook test runner")
+  # TODO: for environments with multiple kernels, add kernel name
+  parser.add_argument("suites", nargs="*")
+  parser.add_argument("--single-test")
+  parser.add_argument(
+      "--timeout-secs",
+      type=int,
+      default=30,
+      help="The time to wait (in seconds) for output from the execution of a "
+      "cell.")
+  args = parser.parse_args()
+
+  if args.single_test:
+    specs = [args.single_test]

Review Comment:
   Can we validate that suites isn't set here?



##########
sdks/python/apache_beam/testing/notebooks/runner.py:
##########
@@ -0,0 +1,161 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Executes notebook tests in current enviornment.
+
+Example usage:
+  Run all specs in ./base/:   python runner.py base
+  Run single test spec:       python runner.py --single-test base/testing.spec
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import glob
+import itertools
+import os
+import sys
+from collections import OrderedDict
+
+import yaml
+
+import apache_beam as beam
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+# import testlib
+
+IS_BEAM_DEV = False
+if "dev" in beam.__version__:
+  IS_BEAM_DEV = True
+
+
+def _skip_test(spec):
+  if IS_BEAM_DEV and "sql" in str(spec).lower():
+    return True
+  return False
+
+
+class Executor:
+  """Loads a single test spec, executes and compares target notebook."""
+  def __init__(self, spec_path, timeout_secs):
+    with open(spec_path, "r") as spec_f:
+      self.spec = yaml.safe_load(spec_f)
+    # notebook in spec relative to spec's location
+    self.dir = os.path.dirname(spec_path)
+    self.notebook_path = os.path.join(self.dir, self.spec["notebook"])
+    self.tests = self.spec["tests"]
+    self.timeout_secs = timeout_secs
+
+  def execute_tests(self):
+    """Executes notebook and compares to test spec.
+
+    Returns:
+      # of tests, # of errors, error_dict
+      where error_dict maps (cell number, test class, expected output) to string
+    """
+    # load and execute notebook using nbconvert
+    with open(self.notebook_path, "r") as nb_f:
+      nb = nbformat.read(nb_f, as_version=4)
+    ExecutePreprocessor.timeout = self.timeout_secs
+    ep = ExecutePreprocessor(allow_errors=True)
+    # Executes full notebook, result is classic JSON representation
+    exec_nb, _ = ep.preprocess(nb, {"metadata": {"path": self.dir + "/"}})
+    test_count = 0
+    error_count = 0
+    errors = OrderedDict()
+    code_cells = {}
+    for cell in exec_nb["cells"]:
+      if cell["cell_type"] == "code":
+        code_cells[cell["execution_count"]] = cell
+    # generate test cases for each line, substituting in the output
+    for cell_num in sorted(self.tests.keys()):
+      if cell_num not in code_cells:
+        test_count += 1
+        error_count += 1
+        errors[(cell_num, "", "")] = "Given cell does not exist."
+      else:
+        cell = code_cells[cell_num]
+        for test in self.tests[cell_num]:
+          cls, setup = list(test.items())[0]
+          test_count += 1
+          try:
+            getattr(sys.modules["testlib"], cls)(setup).check(cell)

Review Comment:
   I don't love this pattern because it makes it harder to see/understand the relationship between these 2 modules (and you lose things like good typing and some nicer errors). e.g. if we decided to move testlib.py in the future, would we notice that this breaks things like this? How many modules do you think there will be? Could we just introduce a function like:
   
   ```
   def get_test_case(cls: str, setup: Any) -> NotebookTestCase:
     if cls == "RegexMatch":
       return RegexMatch(setup)
     ...
     raise Exception(f"no module named {cls} found in testlib.py")
   ```



##########
.test-infra/jenkins/job_PreCommit_Python_Colab.groovy:
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import PrecommitJobBuilder
+
+PrecommitJobBuilder builder = new PrecommitJobBuilder(
+    scope: this,
+    nameBase: 'Python_Notebooks',
+    gradleTask: ':pythonPreCommit',
+    gradleSwitches: [
+      '-Pposargs=apache_beam/testing/'

Review Comment:
   Will this actually invoke the tests though? This precommit is set up to use pytest and I don't see any pytest tests here. Seems like this should actually be invoking runner directly (e.g. `python runner.py --args`, probably wrapped in a gradle command though)



##########
sdks/python/apache_beam/testing/notebooks/testlib.py:
##########
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Library of supported test cases.
+
+All tests should conform to the NotebookTestCase abstract class.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+from abc import ABC
+from abc import abstractmethod
+
+
+class NotebookTestCase(ABC):
+  @abstractmethod
+  def __init__(self, setup):
+    """Construct a NotebookTestCase.
+
+    Args:
+      setup: arbitrary JSON-serializable object specified by test spec
+    """
+    pass
+
+  # should raise exception on failure
+  @abstractmethod
+  def check(self, cell):

Review Comment:
   Are we able to provide an input type hint here? Same question about __init__,  but I'm less hopeful there



##########
sdks/python/apache_beam/testing/notebooks/runner.py:
##########
@@ -0,0 +1,161 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Executes notebook tests in current enviornment.
+
+Example usage:
+  Run all specs in ./base/:   python runner.py base
+  Run single test spec:       python runner.py --single-test base/testing.spec
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import glob
+import itertools
+import os
+import sys
+from collections import OrderedDict
+
+import yaml
+
+import apache_beam as beam
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+# import testlib
+
+IS_BEAM_DEV = False
+if "dev" in beam.__version__:
+  IS_BEAM_DEV = True
+
+
+def _skip_test(spec):
+  if IS_BEAM_DEV and "sql" in str(spec).lower():

Review Comment:
   Why do we need this?



##########
sdks/python/setup.py:
##########
@@ -334,11 +334,13 @@ def get_portability_package_data():
             # https://github.com/jupyter/jupyter_client/issues/637
             'jupyter-client>=6.1.11,!=6.1.13,<8.1.1',
             'timeloop>=1.0.2,<2',
+            'nbformat>=5.0.5',
+            'nbconvert>=6.2.0',
           ] + dataframe_dependency,
           'interactive_test': [
             # notebok utils
-            'nbformat>=5.0.5,<6',
-            'nbconvert>=6.2.0,<8',
+            'nbformat>=5.0.5',

Review Comment:
   Any reason to force these dependencies?



##########
.test-infra/jenkins/job_PreCommit_Python_Colab.groovy:
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import PrecommitJobBuilder
+
+PrecommitJobBuilder builder = new PrecommitJobBuilder(
+    scope: this,
+    nameBase: 'Python_Notebooks',
+    gradleTask: ':pythonPreCommit',
+    gradleSwitches: [
+      '-Pposargs=apache_beam/testing/'

Review Comment:
   I'd expect this to be a no-op right now since there are no test specs, right (which means it would exit with code 2)? So maybe we should just remove this one until we have specs to run.



##########
sdks/python/apache_beam/testing/notebooks/runner.py:
##########
@@ -0,0 +1,161 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Executes notebook tests in current enviornment.
+
+Example usage:
+  Run all specs in ./base/:   python runner.py base
+  Run single test spec:       python runner.py --single-test base/testing.spec
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import glob
+import itertools
+import os
+import sys
+from collections import OrderedDict
+
+import yaml
+
+import apache_beam as beam
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+# import testlib
+
+IS_BEAM_DEV = False
+if "dev" in beam.__version__:
+  IS_BEAM_DEV = True
+
+
+def _skip_test(spec):
+  if IS_BEAM_DEV and "sql" in str(spec).lower():
+    return True
+  return False
+
+
+class Executor:
+  """Loads a single test spec, executes and compares target notebook."""
+  def __init__(self, spec_path, timeout_secs):
+    with open(spec_path, "r") as spec_f:
+      self.spec = yaml.safe_load(spec_f)
+    # notebook in spec relative to spec's location
+    self.dir = os.path.dirname(spec_path)
+    self.notebook_path = os.path.join(self.dir, self.spec["notebook"])
+    self.tests = self.spec["tests"]
+    self.timeout_secs = timeout_secs
+
+  def execute_tests(self):
+    """Executes notebook and compares to test spec.
+
+    Returns:
+      # of tests, # of errors, error_dict
+      where error_dict maps (cell number, test class, expected output) to string
+    """
+    # load and execute notebook using nbconvert
+    with open(self.notebook_path, "r") as nb_f:
+      nb = nbformat.read(nb_f, as_version=4)
+    ExecutePreprocessor.timeout = self.timeout_secs
+    ep = ExecutePreprocessor(allow_errors=True)
+    # Executes full notebook, result is classic JSON representation
+    exec_nb, _ = ep.preprocess(nb, {"metadata": {"path": self.dir + "/"}})
+    test_count = 0
+    error_count = 0
+    errors = OrderedDict()
+    code_cells = {}
+    for cell in exec_nb["cells"]:
+      if cell["cell_type"] == "code":
+        code_cells[cell["execution_count"]] = cell
+    # generate test cases for each line, substituting in the output
+    for cell_num in sorted(self.tests.keys()):
+      if cell_num not in code_cells:
+        test_count += 1
+        error_count += 1
+        errors[(cell_num, "", "")] = "Given cell does not exist."
+      else:
+        cell = code_cells[cell_num]
+        for test in self.tests[cell_num]:
+          cls, setup = list(test.items())[0]
+          test_count += 1
+          try:
+            getattr(sys.modules["testlib"], cls)(setup).check(cell)
+          except Exception as e:
+            error_count += 1
+            errors[(cell_num, cls, setup)] = str(e)
+    return test_count, error_count, errors
+
+
+def main():
+  """Executes tests.
+
+  Return codes:
+    0 if all tests passed, 1 otherwise
+    2 if no tests were found
+  """
+  parser = argparse.ArgumentParser("Notebook test runner")
+  # TODO: for environments with multiple kernels, add kernel name
+  parser.add_argument("suites", nargs="*")
+  parser.add_argument("--single-test")
+  parser.add_argument(
+      "--timeout-secs",
+      type=int,
+      default=30,
+      help="The time to wait (in seconds) for output from the execution of a "
+      "cell.")
+  args = parser.parse_args()
+
+  if args.single_test:
+    specs = [args.single_test]

Review Comment:
   and validate that it is set before using it in the else



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@beam.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org