You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2023/05/25 18:45:15 UTC
[iceberg] branch master updated: Python: Refactor integration tests (#7698)
This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 3a584a2835 Python: Refactor integration tests (#7698)
3a584a2835 is described below
commit 3a584a28352e5f13ca128599f4d331aa5eeaa374
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Thu May 25 13:45:09 2023 -0500
Python: Refactor integration tests (#7698)
* Python: Refactor integration tests
This splits out running and building the integration tests, which
enables quick development iterations. I've also added ipython,
which will give a more meaningful error when something goes wrong
with the provisioning of the tests.
* Simplify SQL
---
python/Makefile | 12 +++-
python/dev/Dockerfile | 4 +-
python/dev/entrypoint.sh | 2 +-
python/dev/provision.py | 140 ++++++++++++++++++++++---------------
python/mkdocs/docs/contributing.md | 16 +++++
5 files changed, 112 insertions(+), 62 deletions(-)
diff --git a/python/Makefile b/python/Makefile
index ea0a3e82a8..444a3785bc 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -34,10 +34,16 @@ test-s3:
test-integration:
docker-compose -f dev/docker-compose-integration.yml kill
- docker-compose -f dev/docker-compose-integration.yml build
+ docker-compose -f dev/docker-compose-integration.yml rm -f
docker-compose -f dev/docker-compose-integration.yml up -d
- sleep 30
- poetry run pytest tests/ -m integration ${PYTEST_ARGS}
+ sleep 10
+ docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+ poetry run pytest tests/ -v -m integration ${PYTEST_ARGS}
+
+test-integration-rebuild:
+ docker-compose -f dev/docker-compose-integration.yml kill
+ docker-compose -f dev/docker-compose-integration.yml rm -f
+ docker-compose -f dev/docker-compose-integration.yml build --no-cache
test-adlfs:
sh ./dev/run-azurite.sh
diff --git a/python/dev/Dockerfile b/python/dev/Dockerfile
index 65d5503b57..c6bbe543d3 100644
--- a/python/dev/Dockerfile
+++ b/python/dev/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
WORKDIR ${SPARK_HOME}
ENV SPARK_VERSION=3.3.2
-ENV ICEBERG_VERSION=1.2.0
+ENV ICEBERG_VERSION=1.2.1
ENV AWS_SDK_VERSION=2.20.18
RUN curl -s https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
@@ -62,6 +62,8 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
RUN chmod u+x /opt/spark/sbin/* && \
chmod u+x /opt/spark/bin/*
+RUN pip3 install -q ipython
+
COPY entrypoint.sh .
COPY provision.py .
diff --git a/python/dev/entrypoint.sh b/python/dev/entrypoint.sh
index d777f8f5a2..574e876c77 100755
--- a/python/dev/entrypoint.sh
+++ b/python/dev/entrypoint.sh
@@ -22,4 +22,4 @@ start-master.sh -p 7077
start-worker.sh spark://spark-iceberg:7077
start-history-server.sh
-python3 ./provision.py
+tail -f /dev/null
diff --git a/python/dev/provision.py b/python/dev/provision.py
index 81bd094c58..73ec34fdc1 100644
--- a/python/dev/provision.py
+++ b/python/dev/provision.py
@@ -14,15 +14,12 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_add, expr
spark = SparkSession.builder.getOrCreate()
-print("Create database")
-
spark.sql(
"""
CREATE DATABASE IF NOT EXISTS default;
@@ -31,19 +28,7 @@ spark.sql(
spark.sql(
"""
- use default;
-"""
-)
-
-spark.sql(
- """
- DROP TABLE IF EXISTS test_null_nan;
-"""
-)
-
-spark.sql(
- """
- CREATE TABLE test_null_nan
+ CREATE OR REPLACE TABLE default.test_null_nan
USING iceberg
AS SELECT
1 AS idx,
@@ -59,78 +44,122 @@ UNION ALL SELECT
spark.sql(
"""
- DROP TABLE IF EXISTS test_null_nan_rewritten;
+ CREATE OR REPLACE TABLE default.test_null_nan_rewritten
+ USING iceberg
+ AS SELECT * FROM default.test_null_nan
"""
)
spark.sql(
"""
- CREATE TABLE test_null_nan_rewritten
- USING iceberg
- AS SELECT * FROM test_null_nan
+CREATE OR REPLACE TABLE default.test_limit as
+ SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
"""
)
spark.sql(
"""
- DROP TABLE IF EXISTS test_limit;
+CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
+ dt date,
+ number integer,
+ letter string
+)
+USING iceberg
+TBLPROPERTIES (
+ 'write.delete.mode'='merge-on-read',
+ 'write.update.mode'='merge-on-read',
+ 'write.merge.mode'='merge-on-read',
+ 'format-version'='2'
+);
"""
)
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
spark.sql(
"""
- CREATE TABLE test_limit
- USING iceberg
- AS SELECT
- 1 AS idx
- UNION ALL SELECT
- 2 AS idx
- UNION ALL SELECT
- 3 AS idx
- UNION ALL SELECT
- 4 AS idx
- UNION ALL SELECT
- 5 AS idx
- UNION ALL SELECT
- 6 AS idx
- UNION ALL SELECT
- 7 AS idx
- UNION ALL SELECT
- 8 AS idx
- UNION ALL SELECT
- 9 AS idx
- UNION ALL SELECT
- 10 AS idx
+ ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
"""
+INSERT INTO default.test_positional_mor_deletes
+VALUES
+ (CAST('2023-03-01' AS date), 1, 'a'),
+ (CAST('2023-03-02' AS date), 2, 'b'),
+ (CAST('2023-03-03' AS date), 3, 'c'),
+ (CAST('2023-03-04' AS date), 4, 'd'),
+ (CAST('2023-03-05' AS date), 5, 'e'),
+ (CAST('2023-03-06' AS date), 6, 'f'),
+ (CAST('2023-03-07' AS date), 7, 'g'),
+ (CAST('2023-03-08' AS date), 8, 'h'),
+ (CAST('2023-03-09' AS date), 9, 'i'),
+ (CAST('2023-03-10' AS date), 10, 'j'),
+ (CAST('2023-03-11' AS date), 11, 'k'),
+ (CAST('2023-03-12' AS date), 12, 'l');
+"""
)
spark.sql(
"""
- DROP TABLE IF EXISTS test_deletes;
+DELETE FROM default.test_positional_mor_deletes WHERE number = 9
"""
)
spark.sql(
"""
- CREATE TABLE test_deletes
+ CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
+ dt date,
+ number integer,
+ letter string
+ )
USING iceberg
TBLPROPERTIES (
'write.delete.mode'='merge-on-read',
'write.update.mode'='merge-on-read',
- 'write.merge.mode'='merge-on-read'
- )
- AS SELECT
- 1 AS idx,
- True AS deleted
-UNION ALL SELECT
- 2 AS idx,
- False AS deleted;
+ 'write.merge.mode'='merge-on-read',
+ 'format-version'='2'
+ );
+"""
+)
+
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
+spark.sql(
+ """
+ ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
+ """
+INSERT INTO default.test_positional_mor_double_deletes
+VALUES
+ (CAST('2023-03-01' AS date), 1, 'a'),
+ (CAST('2023-03-02' AS date), 2, 'b'),
+ (CAST('2023-03-03' AS date), 3, 'c'),
+ (CAST('2023-03-04' AS date), 4, 'd'),
+ (CAST('2023-03-05' AS date), 5, 'e'),
+ (CAST('2023-03-06' AS date), 6, 'f'),
+ (CAST('2023-03-07' AS date), 7, 'g'),
+ (CAST('2023-03-08' AS date), 8, 'h'),
+ (CAST('2023-03-09' AS date), 9, 'i'),
+ (CAST('2023-03-10' AS date), 10, 'j'),
+ (CAST('2023-03-11' AS date), 11, 'k'),
+ (CAST('2023-03-12' AS date), 12, 'l');
+"""
+)
+
+spark.sql(
+ """
+ DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
"""
)
spark.sql(
"""
- DELETE FROM test_deletes WHERE deleted = True;
+ DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
"""
)
@@ -156,6 +185,3 @@ all_types_dataframe = (
all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
"intCol"
).createOrReplace()
-
-while True:
- time.sleep(1)
diff --git a/python/mkdocs/docs/contributing.md b/python/mkdocs/docs/contributing.md
index bf6f12872d..989cbbea44 100644
--- a/python/mkdocs/docs/contributing.md
+++ b/python/mkdocs/docs/contributing.md
@@ -107,6 +107,22 @@ make test PYTEST_ARGS="--pdb"
To see all available pytest arguments, run `make test PYTEST_ARGS="--help"`.
+### Integration tests
+
+PyIceberg has integration tests with Apache Spark. Spark will create a new database and provision some tables that PyIceberg can query against.
+
+```sh
+make test-integration
+```
+
+This will restart the containers, to get to a clean state, and then run the PyTest suite. In case something changed in the Dockerfile or the provision script, you can run:
+
+```sh
+make test-integration-rebuild
+```
+
+To rebuild the containers from scratch.
+
## Code standards
Below are the formalized conventions that we adhere to in the PyIceberg project. The goal of this is to have a common agreement on how to evolve the codebase, but also using it as guidelines for newcomers to the project.