You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ho...@apache.org on 2022/02/20 07:33:05 UTC
[arrow-datafusion] branch master updated: DataFusion + Conbench Integration (#1791)
This is an automated email from the ASF dual-hosted git repository.
houqp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 2681386 DataFusion + Conbench Integration (#1791)
2681386 is described below
commit 2681386cf5b7b061cd188a1925eb7698e07cde22
Author: diana <di...@gmail.com>
AuthorDate: Sun Feb 20 00:33:00 2022 -0700
DataFusion + Conbench Integration (#1791)
---
conbench/.flake8 | 2 +
conbench/.gitignore | 130 ++++++++++++++++++++
conbench/.isort.cfg | 2 +
conbench/README.md | 252 ++++++++++++++++++++++++++++++++++++++
conbench/_criterion.py | 98 +++++++++++++++
conbench/benchmarks.json | 8 ++
conbench/benchmarks.py | 41 +++++++
conbench/requirements-test.txt | 3 +
conbench/requirements.txt | 1 +
dev/release/rat_exclude_files.txt | 5 +
10 files changed, 542 insertions(+)
diff --git a/conbench/.flake8 b/conbench/.flake8
new file mode 100644
index 0000000..e44b810
--- /dev/null
+++ b/conbench/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501
diff --git a/conbench/.gitignore b/conbench/.gitignore
new file mode 100755
index 0000000..aa44ee2
--- /dev/null
+++ b/conbench/.gitignore
@@ -0,0 +1,130 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
diff --git a/conbench/.isort.cfg b/conbench/.isort.cfg
new file mode 100644
index 0000000..f238bf7
--- /dev/null
+++ b/conbench/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
diff --git a/conbench/README.md b/conbench/README.md
new file mode 100644
index 0000000..f655ac8
--- /dev/null
+++ b/conbench/README.md
@@ -0,0 +1,252 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# DataFusion + Conbench Integration
+
+
+## Quick start
+
+```
+$ cd ~/arrow-datafusion/conbench/
+$ conda create -y -n conbench python=3.9
+$ conda activate conbench
+(conbench) $ pip install -r requirements.txt
+(conbench) $ conbench datafusion
+```
+
+## Example output
+
+```
+{
+ "batch_id": "3c82f9d23fce49328b78ba9fd963b254",
+ "context": {
+ "benchmark_language": "Rust"
+ },
+ "github": {
+ "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d",
+ "repository": "https://github.com/dianaclarke/arrow-datafusion"
+ },
+ "info": {},
+ "machine_info": {
+ "architecture_name": "x86_64",
+ "cpu_core_count": "8",
+ "cpu_frequency_max_hz": "2400000000",
+ "cpu_l1d_cache_bytes": "65536",
+ "cpu_l1i_cache_bytes": "131072",
+ "cpu_l2_cache_bytes": "4194304",
+ "cpu_l3_cache_bytes": "0",
+ "cpu_model_name": "Apple M1",
+ "cpu_thread_count": "8",
+ "gpu_count": "0",
+ "gpu_product_names": [],
+ "kernel_name": "20.6.0",
+ "memory_bytes": "17179869184",
+ "name": "diana",
+ "os_name": "macOS",
+ "os_version": "10.16"
+ },
+ "run_id": "ec2a50b9380c470b96d7eb7d63ab5b77",
+ "stats": {
+ "data": [
+ "0.001532",
+ "0.001394",
+ "0.001333",
+ "0.001356",
+ "0.001379",
+ "0.001361",
+ "0.001307",
+ "0.001348",
+ "0.001436",
+ "0.001397",
+ "0.001339",
+ "0.001523",
+ "0.001593",
+ "0.001415",
+ "0.001344",
+ "0.001312",
+ "0.001402",
+ "0.001362",
+ "0.001329",
+ "0.001330",
+ "0.001447",
+ "0.001413",
+ "0.001536",
+ "0.001330",
+ "0.001333",
+ "0.001338",
+ "0.001333",
+ "0.001331",
+ "0.001426",
+ "0.001575",
+ "0.001362",
+ "0.001343",
+ "0.001334",
+ "0.001383",
+ "0.001476",
+ "0.001356",
+ "0.001362",
+ "0.001334",
+ "0.001390",
+ "0.001497",
+ "0.001330",
+ "0.001347",
+ "0.001331",
+ "0.001468",
+ "0.001377",
+ "0.001351",
+ "0.001328",
+ "0.001509",
+ "0.001338",
+ "0.001355",
+ "0.001332",
+ "0.001485",
+ "0.001370",
+ "0.001366",
+ "0.001507",
+ "0.001358",
+ "0.001331",
+ "0.001463",
+ "0.001362",
+ "0.001336",
+ "0.001428",
+ "0.001343",
+ "0.001359",
+ "0.001905",
+ "0.001726",
+ "0.001411",
+ "0.001433",
+ "0.001391",
+ "0.001453",
+ "0.001346",
+ "0.001339",
+ "0.001420",
+ "0.001330",
+ "0.001422",
+ "0.001683",
+ "0.001426",
+ "0.001349",
+ "0.001342",
+ "0.001430",
+ "0.001330",
+ "0.001436",
+ "0.001331",
+ "0.001415",
+ "0.001332",
+ "0.001408",
+ "0.001343",
+ "0.001392",
+ "0.001371",
+ "0.001655",
+ "0.001354",
+ "0.001438",
+ "0.001347",
+ "0.001341",
+ "0.001374",
+ "0.001453",
+ "0.001352",
+ "0.001358",
+ "0.001398",
+ "0.001362",
+ "0.001454"
+ ],
+ "iqr": "0.000088",
+ "iterations": 100,
+ "max": "0.001905",
+ "mean": "0.001401",
+ "median": "0.001362",
+ "min": "0.001307",
+ "q1": "0.001340",
+ "q3": "0.001428",
+ "stdev": "0.000095",
+ "time_unit": "s",
+ "times": [],
+ "unit": "s"
+ },
+ "tags": {
+ "name": "aggregate_query_group_by",
+ "suite": "aggregate_query_group_by"
+ },
+ "timestamp": "2022-02-09T01:32:55.769468+00:00"
+}
+```
+
+## Debug with test benchmark
+
+```
+(conbench) $ cd ~/arrow-datafusion/conbench/
+(conbench) $ conbench test --iterations=3
+
+Benchmark result:
+{
+ "batch_id": "41a144761bc24d82b94efa70d6e460b3",
+ "context": {
+ "benchmark_language": "Python"
+ },
+ "github": {
+ "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d",
+ "repository": "https://github.com/dianaclarke/arrow-datafusion"
+ },
+ "info": {
+ "benchmark_language_version": "Python 3.9.7"
+ },
+ "machine_info": {
+ "architecture_name": "x86_64",
+ "cpu_core_count": "8",
+ "cpu_frequency_max_hz": "2400000000",
+ "cpu_l1d_cache_bytes": "65536",
+ "cpu_l1i_cache_bytes": "131072",
+ "cpu_l2_cache_bytes": "4194304",
+ "cpu_l3_cache_bytes": "0",
+ "cpu_model_name": "Apple M1",
+ "cpu_thread_count": "8",
+ "gpu_count": "0",
+ "gpu_product_names": [],
+ "kernel_name": "20.6.0",
+ "memory_bytes": "17179869184",
+ "name": "diana",
+ "os_name": "macOS",
+ "os_version": "10.16"
+ },
+ "run_id": "71f46362db8844afacea82cba119cefc",
+ "stats": {
+ "data": [
+ "0.000001",
+ "0.000001",
+ "0.000000"
+ ],
+ "iqr": "0.000000",
+ "iterations": 3,
+ "max": "0.000001",
+ "mean": "0.000001",
+ "median": "0.000001",
+ "min": "0.000000",
+ "q1": "0.000000",
+ "q3": "0.000001",
+ "stdev": "0.000001",
+ "time_unit": "s",
+ "times": [],
+ "unit": "s"
+ },
+ "tags": {
+ "name": "test"
+ },
+ "timestamp": "2022-02-09T01:36:45.823615+00:00"
+}
+```
+
diff --git a/conbench/_criterion.py b/conbench/_criterion.py
new file mode 100644
index 0000000..168a1b9
--- /dev/null
+++ b/conbench/_criterion.py
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import collections
+import csv
+import os
+import pathlib
+import subprocess
+
+import conbench.runner
+from conbench.machine_info import github_info
+
+
+def _result_in_seconds(row):
+    """Convert one Criterion ``raw.csv`` row to seconds per iteration.
+
+    Criterion documents ``sample_measured_value`` as the measurement for the
+    whole sample, not the time per iteration, so the value is divided by
+    ``iteration_count`` first.
+    -- https://bheisler.github.io/criterion.rs/book/user_guide/csv_output.html
+    """
+    iterations = int(row["iteration_count"])
+    whole_sample = float(row["sample_measured_value"])
+    per_iteration = whole_sample / iterations
+    # Rescale to seconds (assumes the measured value is in nanoseconds).
+    return per_iteration / 10**9
+
+
+def _parse_benchmark_group(row):
+    """Split a row's ``group`` column into a ``(suite, name)`` pair.
+
+    Text before the first comma is the suite and everything after it is the
+    name; a group without a comma serves as both. Each value is returned
+    with surrounding whitespace stripped.
+    """
+    group = row["group"]
+    suite, comma, name = group.partition(",")
+    if not comma:
+        # No comma at all: the whole group doubles as suite and name.
+        name = suite
+    return suite.strip(), name.strip()
+
+
+def _read_results(src_dir):
+    """Collect Criterion measurements from ``<src_dir>/target/criterion``.
+
+    Every ``new/raw.csv`` file below that directory is read row by row.
+
+    Returns a nested ``defaultdict`` mapping suite -> benchmark name -> list
+    of per-iteration timings in seconds, one entry per CSV row.
+    """
+    results = collections.defaultdict(lambda: collections.defaultdict(list))
+    criterion_dir = pathlib.Path(src_dir) / "target" / "criterion"
+    # Sorted so the order of suites (and of samples that share a name) does
+    # not depend on the file system's directory listing order.
+    for csv_path in sorted(criterion_dir.glob("**/new/raw.csv")):
+        # The csv module asks for newline="" when opening files; the explicit
+        # encoding avoids relying on the platform default (assumes Criterion
+        # writes UTF-8 -- TODO confirm).
+        with open(csv_path, newline="", encoding="utf-8") as csv_file:
+            for row in csv.DictReader(csv_file):
+                suite, name = _parse_benchmark_group(row)
+                results[suite][name].append(_result_in_seconds(row))
+    return results
+
+
+def _execute_command(command):
+    """Run ``command`` and return its decoded ``(stdout, stderr)``.
+
+    The command is echoed before it runs. If it exits with a non-zero
+    status, the captured stderr is printed and the
+    ``subprocess.CalledProcessError`` is re-raised.
+    """
+    print(command)
+    try:
+        completed = subprocess.run(command, capture_output=True, check=True)
+    except subprocess.CalledProcessError as error:
+        print(error.stderr.decode("utf-8"))
+        raise
+    stdout, stderr = (
+        stream.decode("utf-8") for stream in (completed.stdout, completed.stderr)
+    )
+    return stdout, stderr
+
+
+class CriterionBenchmark(conbench.runner.Benchmark):
+    # Runs ``cargo bench`` in the parent of the current working directory and
+    # records every Criterion measurement found there with Conbench.
+    external = True
+
+    def run(self, **kwargs):
+        """Run the Cargo benchmarks and yield one recorded result per name.
+
+        The source tree is taken to be the parent of the current working
+        directory. A new Conbench batch is marked for each suite, and
+        ``kwargs`` is forwarded as the ``options`` of every record.
+        """
+        src_dir = os.path.join(os.getcwd(), "..")
+        self._cargo_bench(src_dir)
+        results = _read_results(src_dir)
+        for suite, benchmarks in results.items():
+            self.conbench.mark_new_batch()
+            for name, data in benchmarks.items():
+                yield self._record_result(suite, name, data, kwargs)
+
+    def _cargo_bench(self, src_dir):
+        """Run ``cargo bench`` from ``src_dir``, then restore the old cwd."""
+        original_dir = os.getcwd()
+        os.chdir(src_dir)
+        try:
+            _execute_command(["cargo", "bench"])
+        finally:
+            # Leaving the process in ``src_dir`` would make a later ``run()``
+            # in the same process resolve ".." one level further up.
+            os.chdir(original_dir)
+
+    def _record_result(self, suite, name, data, options):
+        """Record one benchmark's timings (in seconds) with Conbench.
+
+        The suite becomes a tag, the benchmark language is reported as Rust,
+        and the GitHub details come from ``github_info()``.
+        """
+        tags = {"suite": suite}
+        result = {"data": data, "unit": "s"}
+        context = {"benchmark_language": "Rust"}
+        github = github_info()
+        return self.conbench.record(
+            result,
+            name,
+            tags=tags,
+            context=context,
+            github=github,
+            options=options,
+        )
diff --git a/conbench/benchmarks.json b/conbench/benchmarks.json
new file mode 100644
index 0000000..bb70335
--- /dev/null
+++ b/conbench/benchmarks.json
@@ -0,0 +1,8 @@
+[
+ {
+ "command": "datafusion",
+ "flags": {
+ "language": "Rust"
+ }
+ }
+]
diff --git a/conbench/benchmarks.py b/conbench/benchmarks.py
new file mode 100644
index 0000000..9ad3e31
--- /dev/null
+++ b/conbench/benchmarks.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import conbench.runner
+
+import _criterion
+
+
+@conbench.runner.register_benchmark
+class TestBenchmark(conbench.runner.Benchmark):
+    # Registered under the name "test"; benchmarks a trivial callable.
+    name = "test"
+
+    def run(self, **kwargs):
+        """Benchmark the callable from ``_f`` and yield the result."""
+        workload = self._f()
+        benchmark_name = self.name
+        yield self.conbench.benchmark(workload, benchmark_name, options=kwargs)
+
+    def _f(self):
+        """Return the zero-argument callable to benchmark (``1 + 1``)."""
+        return lambda: 1 + 1
+
+
+# Registers the Criterion-backed benchmark under the "datafusion" name, which
+# is the command the README quick start runs (`conbench datafusion`).
+@conbench.runner.register_benchmark
+class CargoBenchmarks(_criterion.CriterionBenchmark):
+ name = "datafusion"
+ description = "Run Arrow Datafusion micro benchmarks."
diff --git a/conbench/requirements-test.txt b/conbench/requirements-test.txt
new file mode 100644
index 0000000..5e5647a
--- /dev/null
+++ b/conbench/requirements-test.txt
@@ -0,0 +1,3 @@
+black
+flake8
+isort
diff --git a/conbench/requirements.txt b/conbench/requirements.txt
new file mode 100644
index 0000000..a877c7b
--- /dev/null
+++ b/conbench/requirements.txt
@@ -0,0 +1 @@
+conbench
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 745c2ba..b236d3a 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -17,6 +17,11 @@ CHANGELOG.md
datafusion/CHANGELOG.md
ballista/CHANGELOG.md
python/CHANGELOG.md
+conbench/benchmarks.json
+conbench/requirements.txt
+conbench/requirements-test.txt
+conbench/.flake8
+conbench/.isort.cfg
dev/requirements*.txt
dev/archery/MANIFEST.in
dev/archery/requirements*.txt