You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2023/02/28 21:47:30 UTC
[iceberg] branch master updated: Python-legacy: Remove python_legacy (#6960)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 54b9ef934e Python-legacy: Remove python_legacy (#6960)
54b9ef934e is described below
commit 54b9ef934e943fd08f008e51d5d31722ab6fbb27
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Tue Feb 28 22:47:22 2023 +0100
Python-legacy: Remove python_legacy (#6960)
---
.github/labeler.yml | 1 -
.github/workflows/delta-conversion-ci.yml | 1 -
.github/workflows/flink-ci.yml | 1 -
.github/workflows/hive-ci.yml | 1 -
.github/workflows/java-ci.yml | 1 -
.github/workflows/python-legacy-ci.yml | 55 --
.github/workflows/spark-ci.yml | 1 -
.gitignore | 1 -
python_legacy/CHANGELOG.md | 24 -
python_legacy/README.md | 48 --
python_legacy/iceberg/__init__.py | 16 -
python_legacy/iceberg/api/__init__.py | 44 --
python_legacy/iceberg/api/append_files.py | 24 -
python_legacy/iceberg/api/combined_scan_task.py | 27 -
python_legacy/iceberg/api/data_file.py | 98 ---
python_legacy/iceberg/api/data_operations.py | 23 -
python_legacy/iceberg/api/delete_files.py | 34 -
python_legacy/iceberg/api/expire_snapshots.py | 30 -
python_legacy/iceberg/api/expressions/__init__.py | 107 ---
python_legacy/iceberg/api/expressions/binder.py | 91 ---
python_legacy/iceberg/api/expressions/evaluator.py | 92 ---
.../iceberg/api/expressions/expression.py | 252 -------
.../iceberg/api/expressions/expression_parser.py | 162 -----
.../iceberg/api/expressions/expressions.py | 299 ---------
.../expressions/inclusive_manifest_evaluator.py | 161 -----
.../api/expressions/inclusive_metrics_evaluator.py | 188 ------
.../api/expressions/java_variables/__init__.py | 21 -
python_legacy/iceberg/api/expressions/literals.py | 594 -----------------
python_legacy/iceberg/api/expressions/predicate.py | 295 ---------
.../iceberg/api/expressions/projections.py | 114 ----
python_legacy/iceberg/api/expressions/reference.py | 114 ----
.../iceberg/api/expressions/residual_evaluator.py | 117 ----
.../api/expressions/strict_metrics_evaluator.py | 221 -------
python_legacy/iceberg/api/expressions/term.py | 73 --
python_legacy/iceberg/api/expressions/transform.py | 75 ---
python_legacy/iceberg/api/file_format.py | 44 --
python_legacy/iceberg/api/file_scan_task.py | 47 --
python_legacy/iceberg/api/files.py | 126 ----
python_legacy/iceberg/api/filterable.py | 32 -
python_legacy/iceberg/api/filtered_snapshot.py | 40 --
python_legacy/iceberg/api/io/__init__.py | 30 -
python_legacy/iceberg/api/io/closeable_group.py | 31 -
python_legacy/iceberg/api/io/closeable_iterable.py | 27 -
.../iceberg/api/io/delegating_input_stream.py | 22 -
.../iceberg/api/io/delegating_output_stream.py | 22 -
python_legacy/iceberg/api/io/file_appender.py | 29 -
python_legacy/iceberg/api/io/input_file.py | 38 --
python_legacy/iceberg/api/io/output_file.py | 28 -
.../iceberg/api/io/position_output_stream.py | 32 -
.../iceberg/api/io/seekable_input_stream.py | 46 --
python_legacy/iceberg/api/manifest_file.py | 95 ---
python_legacy/iceberg/api/metrics.py | 30 -
python_legacy/iceberg/api/overwrite_files.py | 30 -
python_legacy/iceberg/api/partition_field.py | 43 --
python_legacy/iceberg/api/partition_spec.py | 352 ----------
python_legacy/iceberg/api/pending_update.py | 25 -
python_legacy/iceberg/api/replace_partitions.py | 30 -
python_legacy/iceberg/api/rewrite_files.py | 30 -
python_legacy/iceberg/api/rollback.py | 33 -
python_legacy/iceberg/api/scan_task.py | 28 -
python_legacy/iceberg/api/schema.py | 164 -----
python_legacy/iceberg/api/snapshot.py | 53 --
python_legacy/iceberg/api/snapshot_iterable.py | 21 -
python_legacy/iceberg/api/struct_like.py | 28 -
python_legacy/iceberg/api/table.py | 76 ---
python_legacy/iceberg/api/table_scan.py | 62 --
python_legacy/iceberg/api/tables.py | 34 -
python_legacy/iceberg/api/transaction.py | 55 --
python_legacy/iceberg/api/transforms/__init__.py | 48 --
python_legacy/iceberg/api/transforms/bucket.py | 196 ------
python_legacy/iceberg/api/transforms/dates.py | 87 ---
python_legacy/iceberg/api/transforms/identity.py | 91 ---
.../iceberg/api/transforms/projection_util.py | 66 --
python_legacy/iceberg/api/transforms/timestamps.py | 83 ---
python_legacy/iceberg/api/transforms/transform.py | 43 --
.../iceberg/api/transforms/transform_util.py | 84 ---
python_legacy/iceberg/api/transforms/transforms.py | 120 ----
python_legacy/iceberg/api/transforms/truncate.py | 231 -------
.../iceberg/api/transforms/unknown_transform.py | 61 --
.../iceberg/api/transforms/void_transform.py | 52 --
python_legacy/iceberg/api/types/__init__.py | 134 ----
python_legacy/iceberg/api/types/conversions.py | 101 ---
python_legacy/iceberg/api/types/type.py | 118 ----
python_legacy/iceberg/api/types/type_util.py | 575 ----------------
python_legacy/iceberg/api/types/types.py | 732 ---------------------
python_legacy/iceberg/api/update_properties.py | 30 -
python_legacy/iceberg/api/update_schema.py | 33 -
python_legacy/iceberg/core/__init__.py | 61 --
python_legacy/iceberg/core/avro/__init__.py | 21 -
.../iceberg/core/avro/avro_schema_util.py | 35 -
python_legacy/iceberg/core/avro/avro_to_iceberg.py | 301 ---------
python_legacy/iceberg/core/avro/iceberg_to_avro.py | 90 ---
.../iceberg/core/base_combined_scan_task.py | 37 --
python_legacy/iceberg/core/base_file_scan_task.py | 126 ----
.../core/base_metastore_table_operations.py | 137 ----
.../iceberg/core/base_metastore_tables.py | 85 ---
python_legacy/iceberg/core/base_snapshot.py | 132 ----
python_legacy/iceberg/core/base_table.py | 110 ----
python_legacy/iceberg/core/base_table_scan.py | 213 ------
python_legacy/iceberg/core/base_transaction.py | 177 -----
python_legacy/iceberg/core/config_properties.py | 26 -
python_legacy/iceberg/core/data_files.py | 228 -------
python_legacy/iceberg/core/data_table_scan.py | 98 ---
python_legacy/iceberg/core/filesystem/__init__.py | 27 -
.../iceberg/core/filesystem/file_status.py | 34 -
.../iceberg/core/filesystem/file_system.py | 120 ----
.../core/filesystem/filesystem_table_operations.py | 140 ----
.../iceberg/core/filesystem/filesystem_tables.py | 65 --
.../iceberg/core/filesystem/local_filesystem.py | 81 ---
.../iceberg/core/filesystem/s3_filesystem.py | 259 --------
python_legacy/iceberg/core/filesystem/util.py | 39 --
python_legacy/iceberg/core/filtered_manifest.py | 118 ----
python_legacy/iceberg/core/generic_data_file.py | 137 ----
.../iceberg/core/generic_manifest_file.py | 194 ------
.../core/generic_partition_field_summary.py | 61 --
python_legacy/iceberg/core/manifest_entry.py | 145 ----
python_legacy/iceberg/core/manifest_list_writer.py | 55 --
python_legacy/iceberg/core/manifest_reader.py | 162 -----
python_legacy/iceberg/core/partition_data.py | 94 ---
.../iceberg/core/partition_spec_parser.py | 98 ---
python_legacy/iceberg/core/partition_summary.py | 68 --
python_legacy/iceberg/core/scan_summary.py | 409 ------------
python_legacy/iceberg/core/schema_parser.py | 182 -----
python_legacy/iceberg/core/schema_update.py | 40 --
python_legacy/iceberg/core/snapshot_parser.py | 73 --
python_legacy/iceberg/core/table_metadata.py | 218 ------
.../iceberg/core/table_metadata_parser.py | 124 ----
python_legacy/iceberg/core/table_operations.py | 42 --
python_legacy/iceberg/core/table_properties.py | 78 ---
python_legacy/iceberg/core/util/__init__.py | 36 -
python_legacy/iceberg/core/util/atomic_integer.py | 36 -
python_legacy/iceberg/core/util/bin_packing.py | 79 ---
python_legacy/iceberg/core/util/profile.py | 36 -
python_legacy/iceberg/exceptions/__init__.py | 30 -
python_legacy/iceberg/exceptions/exceptions.py | 47 --
python_legacy/iceberg/hive/__init__.py | 25 -
.../iceberg/hive/hive_table_operations.py | 208 ------
python_legacy/iceberg/hive/hive_tables.py | 105 ---
python_legacy/iceberg/hive/hive_types.py | 37 --
python_legacy/iceberg/parquet/__init__.py | 20 -
python_legacy/iceberg/parquet/dataset_utils.py | 158 -----
python_legacy/iceberg/parquet/parquet_reader.py | 224 -------
.../iceberg/parquet/parquet_schema_utils.py | 41 --
.../iceberg/parquet/parquet_to_iceberg.py | 152 -----
python_legacy/setup.py | 56 --
python_legacy/tests/__init__.py | 16 -
python_legacy/tests/api/__init__.py | 16 -
python_legacy/tests/api/expressions/__init__.py | 16 -
python_legacy/tests/api/expressions/conftest.py | 425 ------------
.../tests/api/expressions/test_evaluator.py | 161 -----
.../api/expressions/test_expression_binding.py | 143 ----
.../api/expressions/test_expression_helpers.py | 47 --
.../expressions/test_expression_serializations.py | 21 -
.../test_inclusive_manifest_evaluator.py | 182 -----
.../test_inclusive_metrics_evaluator.py | 130 ----
.../api/expressions/test_literal_serialization.py | 20 -
.../expressions/test_misc_literal_conversions.py | 263 --------
.../test_numeric_literal_conversions.py | 116 ----
.../api/expressions/test_predicate_binding.py | 199 ------
.../tests/api/expressions/test_str_to_expr.py | 173 -----
.../expressions/test_strict_metrics_evaluator.py | 159 -----
.../expressions/test_string_literal_conversions.py | 110 ----
python_legacy/tests/api/test_conversions.py | 241 -------
python_legacy/tests/api/test_file_format.py | 43 --
python_legacy/tests/api/test_helpers.py | 156 -----
python_legacy/tests/api/test_partition_spec.py | 82 ---
python_legacy/tests/api/transforms/__init__.py | 16 -
python_legacy/tests/api/transforms/test_bucket.py | 49 --
.../tests/api/transforms/test_bucketing.py | 64 --
python_legacy/tests/api/transforms/test_dates.py | 44 --
.../tests/api/transforms/test_identity.py | 93 ---
.../tests/api/transforms/test_timestamps.py | 46 --
.../tests/api/transforms/test_truncate.py | 59 --
python_legacy/tests/api/types/__init__.py | 16 -
.../tests/api/types/test_binary_comparator.py | 41 --
.../tests/api/types/test_char_seq_comparator.py | 43 --
.../tests/api/types/test_comparable_comparator.py | 51 --
.../tests/api/types/test_readabilty_checks.py | 57 --
python_legacy/tests/api/types/test_type_util.py | 49 --
python_legacy/tests/core/__init__.py | 16 -
python_legacy/tests/core/avro/__init__.py | 16 -
python_legacy/tests/core/avro/conftest.py | 64 --
python_legacy/tests/core/avro/test_avro.py | 22 -
.../tests/core/avro/test_read_projection.py | 27 -
python_legacy/tests/core/conftest.py | 314 ---------
python_legacy/tests/core/test_base_table_scan.py | 38 --
python_legacy/tests/core/test_filesystem_tables.py | 32 -
python_legacy/tests/core/test_partition_spec.py | 111 ----
.../tests/core/test_partition_spec_parser.py | 110 ----
python_legacy/tests/core/test_snapshot_json.py | 58 --
.../tests/core/test_table_metadata_json.py | 111 ----
.../tests/core/test_table_metadata_parser.py | 43 --
python_legacy/tests/core/utils/__init__.py | 14 -
python_legacy/tests/core/utils/test_bin_packing.py | 36 -
python_legacy/tests/hive/__init__.py | 18 -
python_legacy/tests/hive/conftest.py | 37 --
python_legacy/tests/hive/test_hive_tables.py | 252 -------
python_legacy/tests/parquet/__init__.py | 16 -
python_legacy/tests/parquet/conftest.py | 226 -------
python_legacy/tests/parquet/test_dataset_utils.py | 52 --
python_legacy/tests/parquet/test_parquet_reader.py | 285 --------
.../tests/parquet/test_parquet_to_iceberg.py | 76 ---
python_legacy/tox.ini | 115 ----
203 files changed, 19740 deletions(-)
diff --git a/.github/labeler.yml b/.github/labeler.yml
index c623fbc6dd..10e68b33f2 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -46,7 +46,6 @@ CORE:
- core/**/*
PYTHON:
- python/**/*
- - python_legacy/**/*
PARQUET:
- parquet/**/*
ARROW:
diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml
index 5302f048f7..02474b39a0 100644
--- a/.github/workflows/delta-conversion-ci.yml
+++ b/.github/workflows/delta-conversion-ci.yml
@@ -41,7 +41,6 @@ on:
- 'flink/**'
- 'pig/**'
- 'python/**'
- - 'python_legacy/**'
- 'docs/**'
- 'open-api/**'
- 'format/**'
diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml
index 24a11fb10a..b67fa609cf 100644
--- a/.github/workflows/flink-ci.yml
+++ b/.github/workflows/flink-ci.yml
@@ -41,7 +41,6 @@ on:
- 'spark/**'
- 'pig/**'
- 'python/**'
- - 'python_legacy/**'
- 'docs/**'
- 'open-api/**'
- 'format/**'
diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml
index ba6cc1535e..761d77655b 100644
--- a/.github/workflows/hive-ci.yml
+++ b/.github/workflows/hive-ci.yml
@@ -39,7 +39,6 @@ on:
- 'flink/**'
- 'pig/**'
- 'python/**'
- - 'python_legacy/**'
- 'docs/**'
- 'open-api/**'
- 'format/**'
diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml
index 5b8e0356f4..0fc7619547 100644
--- a/.github/workflows/java-ci.yml
+++ b/.github/workflows/java-ci.yml
@@ -36,7 +36,6 @@ on:
- '.asf.yml'
- 'dev/**'
- 'python/**'
- - 'python_legacy/**'
- 'docs/**'
- 'open-api/**'
- 'format/**'
diff --git a/.github/workflows/python-legacy-ci.yml b/.github/workflows/python-legacy-ci.yml
deleted file mode 100644
index 91e1c79e88..0000000000
--- a/.github/workflows/python-legacy-ci.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-name: "Python Legacy CI"
-on:
- push:
- branches:
- - 'master'
- - '0.**'
- tags:
- - 'apache-iceberg-**'
- pull_request:
- paths:
- - '.github/workflows/python-legacy-ci.yml'
- - 'python_legacy/**'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-jobs:
- tox:
- runs-on: ubuntu-20.04
- strategy:
- matrix:
- python: [3.7, 3.8, 3.9]
-
- steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python }}
- - working-directory: ./python_legacy
- run: |
- pip install -e .[dev]
- pip install -U "tox-gh-actions==2.12.0"
- - working-directory: ./python_legacy
- run: tox
-
diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml
index eb8abb6103..526d45e33b 100644
--- a/.github/workflows/spark-ci.yml
+++ b/.github/workflows/spark-ci.yml
@@ -41,7 +41,6 @@ on:
- 'flink/**'
- 'pig/**'
- 'python/**'
- - 'python_legacy/**'
- 'docs/**'
- 'open-api/**'
- 'format/**'
diff --git a/.gitignore b/.gitignore
index b9c93a1bc4..a259f71237 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,7 +63,6 @@ spark-warehouse/
derby.log
# Python stuff
-python_legacy/.mypy_cache/
python/.mypy_cache/
python/htmlcov
python/coverage.xml
diff --git a/python_legacy/CHANGELOG.md b/python_legacy/CHANGELOG.md
deleted file mode 100644
index e331c3a09c..0000000000
--- a/python_legacy/CHANGELOG.md
+++ /dev/null
@@ -1,24 +0,0 @@
-<!--
- - Licensed to the Apache Software Foundation (ASF) under one or more
- - contributor license agreements. See the NOTICE file distributed with
- - this work for additional information regarding copyright ownership.
- - The ASF licenses this file to You under the Apache License, Version 2.0
- - (the "License"); you may not use this file except in compliance with
- - the License. You may obtain a copy of the License at
- -
- - http://www.apache.org/licenses/LICENSE-2.0
- -
- - Unless required by applicable law or agreed to in writing, software
- - distributed under the License is distributed on an "AS IS" BASIS,
- - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- - See the License for the specific language governing permissions and
- - limitations under the License.
- -->
-
-# iceberg Changelog
-
-## 2021-09-04
-* Rename this one to python_legacy. - [Jun He]
-
-## iceberg 0.0.1 (2019-02-08)
-* Library creation. - [Ted Gooch]
diff --git a/python_legacy/README.md b/python_legacy/README.md
deleted file mode 100644
index 3424ec3fa8..0000000000
--- a/python_legacy/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-<!--
- - Licensed to the Apache Software Foundation (ASF) under one or more
- - contributor license agreements. See the NOTICE file distributed with
- - this work for additional information regarding copyright ownership.
- - The ASF licenses this file to You under the Apache License, Version 2.0
- - (the "License"); you may not use this file except in compliance with
- - the License. You may obtain a copy of the License at
- -
- - http://www.apache.org/licenses/LICENSE-2.0
- -
- - Unless required by applicable law or agreed to in writing, software
- - distributed under the License is distributed on an "AS IS" BASIS,
- - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- - See the License for the specific language governing permissions and
- - limitations under the License.
- -->
-
-# Iceberg Python
-
-Iceberg is a python library for programatic access to iceberg table metadata as well as data access. The intention is to provide a functional subset of the java library.
-
-## Getting Started
-
-Iceberg python is currently in development, for development and testing purposes the best way to install the library is to perform the following steps:
-
-```
-git clone https://github.com/apache/iceberg.git
-cd iceberg/python_legacy
-pip install -e .
-```
-
-## Testing
-
-Testing is done using tox. The config can be found in `tox.ini` within the python directory of the iceberg project.
-
-```
-# simply run tox from within the python dir
-tox
-```
-
-## Get in Touch
-
-- Email:
- * [dev@iceberg.apache.org](mailto:dev@iceberg.apache.org)
-
-- Issues
- * [File a github incident](https://github.com/apache/iceberg/issues)
-
diff --git a/python_legacy/iceberg/__init__.py b/python_legacy/iceberg/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/python_legacy/iceberg/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python_legacy/iceberg/api/__init__.py b/python_legacy/iceberg/api/__init__.py
deleted file mode 100644
index b6d22bbffe..0000000000
--- a/python_legacy/iceberg/api/__init__.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-__all__ = ["CombinedScanTask", "DataFile", "DataOperations", "FileFormat", "FileScanTask",
- "Files", "Filterable", "FilteredSnapshot", "ManifestFile", "PartitionFieldSummary",
- "Metrics", "PartitionSpec", "PartitionSpecBuilder",
- "Schema", "Snapshot", "SnapshotIterable", "StructLike",
- "Table", "Tables", "TableScan", "Transaction", "UpdateSchema"]
-
-from .combined_scan_task import CombinedScanTask
-from .data_file import DataFile
-from .data_operations import DataOperations
-from .file_format import FileFormat
-from .file_scan_task import FileScanTask
-from .files import Files
-from .filterable import Filterable
-from .filtered_snapshot import FilteredSnapshot
-from .manifest_file import ManifestFile, PartitionFieldSummary
-from .metrics import Metrics
-from .partition_spec import (PartitionSpec,
- PartitionSpecBuilder)
-from .schema import Schema
-from .snapshot import Snapshot
-from .snapshot_iterable import SnapshotIterable
-from .struct_like import StructLike
-from .table import Table
-from .table_scan import TableScan
-from .tables import Tables
-from .transaction import Transaction
-from .update_schema import UpdateSchema
diff --git a/python_legacy/iceberg/api/append_files.py b/python_legacy/iceberg/api/append_files.py
deleted file mode 100644
index 435f2f825b..0000000000
--- a/python_legacy/iceberg/api/append_files.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class AppendFiles(PendingUpdate):
-
- def append_file(self, file):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/combined_scan_task.py b/python_legacy/iceberg/api/combined_scan_task.py
deleted file mode 100644
index e385ba524e..0000000000
--- a/python_legacy/iceberg/api/combined_scan_task.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .scan_task import ScanTask
-
-
-class CombinedScanTask(ScanTask):
-
- def files(self):
- raise NotImplementedError()
-
- def as_combined_scan_task(self):
- return self
diff --git a/python_legacy/iceberg/api/data_file.py b/python_legacy/iceberg/api/data_file.py
deleted file mode 100644
index 6e85d6e84f..0000000000
--- a/python_legacy/iceberg/api/data_file.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from iceberg.api.types import (BinaryType,
- IntegerType,
- ListType,
- LongType,
- MapType,
- NestedField,
- StringType,
- StructType)
-
-
-class DataFile(object):
-
- @staticmethod
- def get_type(partition_type):
- return StructType.of([NestedField.required(100, "file_path", StringType.get()),
- NestedField.required(101, "file_format", StringType.get()),
- NestedField.required(102, "partition", partition_type),
- NestedField.required(103, "record_count", LongType.get()),
- NestedField.required(104, "file_size_in_bytes", LongType.get()),
- NestedField.required(105, "block_size_in_bytes", LongType.get()),
- NestedField.optional(106, "file_ordinal", IntegerType.get()),
- NestedField.optional(107, "sort_columns", ListType.of_required(112, IntegerType.get())),
- NestedField.optional(108, "column_sizes", MapType.of_required(117, 118,
- IntegerType.get(),
- LongType.get())),
- NestedField.optional(109, "value_counts", MapType.of_required(119, 120,
- IntegerType.get(),
- LongType.get())),
- NestedField.optional(110, "null_value_counts", MapType.of_required(121, 122,
- IntegerType.get(),
- LongType.get())),
- NestedField.optional(125, "lower_bounds", MapType.of_required(126, 127,
- IntegerType.get(),
- BinaryType.get())),
- NestedField.optional(128, "upper_bounds", MapType.of_required(129, 130,
- IntegerType.get(),
- BinaryType.get()))]
- # NEXT ID TO ASSIGN: 131
- )
-
- def path(self):
- raise NotImplementedError()
-
- def format(self):
- raise NotImplementedError()
-
- def partition(self):
- raise NotImplementedError()
-
- def record_count(self):
- raise NotImplementedError()
-
- def file_size_in_bytes(self):
- raise NotImplementedError()
-
- def block_size_in_bytes(self):
- raise NotImplementedError()
-
- def file_ordinal(self):
- raise NotImplementedError()
-
- def sort_columns(self):
- raise NotImplementedError()
-
- def column_sizes(self):
- raise NotImplementedError()
-
- def value_counts(self):
- raise NotImplementedError()
-
- def null_value_counts(self):
- raise NotImplementedError()
-
- def lower_bounds(self):
- raise NotImplementedError()
-
- def upper_bounds(self):
- raise NotImplementedError()
-
- def copy(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/data_operations.py b/python_legacy/iceberg/api/data_operations.py
deleted file mode 100644
index 7740a2bce1..0000000000
--- a/python_legacy/iceberg/api/data_operations.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class DataOperations(object):
- APPEND = "append"
- REPLACE = "replace"
- OVERWRITE = "overwrite"
- DELETE = "delete"
diff --git a/python_legacy/iceberg/api/delete_files.py b/python_legacy/iceberg/api/delete_files.py
deleted file mode 100644
index 9b6c01846b..0000000000
--- a/python_legacy/iceberg/api/delete_files.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class DeleteFiles(PendingUpdate):
-
- def delete_files(self, path=None, datafile=None):
- if datafile is not None:
- path = datafile.path
-
- self._delete_from_path(path)
- return self
-
- def _delete_from_path(self):
- raise NotImplementedError()
-
- def delete_from_row_filter(self, expr):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/expire_snapshots.py b/python_legacy/iceberg/api/expire_snapshots.py
deleted file mode 100644
index 0406844097..0000000000
--- a/python_legacy/iceberg/api/expire_snapshots.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class ExpireSnapshots(PendingUpdate):
-
- def expire_snapshot_id(self, snapshot_id):
- raise NotImplementedError()
-
- def expire_older_than(self, timestamp_millis):
- raise NotImplementedError()
-
- def delete_with(self, delete_funct):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/expressions/__init__.py b/python_legacy/iceberg/api/expressions/__init__.py
deleted file mode 100644
index 18450a2bd5..0000000000
--- a/python_legacy/iceberg/api/expressions/__init__.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import absolute_import
-
-__all__ = ["ABOVE_MAX",
- "And",
- "BELOW_MIN",
- "BinaryLiteral",
- "Binder",
- "BooleanLiteral",
- "BoundPredicate",
- "BoundReference",
- "Evaluator",
- "Expression",
- "ExpressionVisitors",
- "Expressions",
- "DateLiteral",
- "DecimalLiteral",
- "DoubleLiteral",
- "FALSE",
- "FalseExp",
- "FixedLiteral",
- "FloatLiteral",
- "inclusive",
- "InclusiveManifestEvaluator",
- "InclusiveMetricsEvaluator",
- "InclusiveProjection",
- "IntegerLiteral",
- "JAVA_MAX_FLOAT",
- "JAVA_MAX_INT",
- "JAVA_MIN_FLOAT",
- "JAVA_MIN_INT",
- "Literal",
- "Literals",
- "NamedReference",
- "Not",
- "Operation",
- "Or",
- "Predicate",
- "ResidualEvaluator",
- "strict",
- "StrictMetricsEvaluator",
- "StrictProjection",
- "StringLiteral",
- "TRUE",
- "TrueExp",
- "UUIDLiteral",
- "UnboundPredicate"]
-
-from .evaluator import (Binder,
- Evaluator)
-from .expression import (And,
- Expression,
- FALSE,
- FalseExp,
- Not,
- Operation,
- Or,
- TRUE,
- TrueExp)
-from .expressions import Expressions, ExpressionVisitors
-from .inclusive_manifest_evaluator import InclusiveManifestEvaluator
-from .inclusive_metrics_evaluator import InclusiveMetricsEvaluator
-from .java_variables import (JAVA_MAX_FLOAT,
- JAVA_MAX_INT,
- JAVA_MIN_FLOAT,
- JAVA_MIN_INT)
-from .literals import (ABOVE_MAX,
- BELOW_MIN,
- BinaryLiteral,
- BooleanLiteral,
- DateLiteral,
- DecimalLiteral,
- DoubleLiteral,
- FixedLiteral,
- FloatLiteral,
- IntegerLiteral,
- Literal,
- Literals,
- StringLiteral,
- UUIDLiteral)
-from .predicate import (BoundPredicate,
- Predicate,
- UnboundPredicate)
-from .projections import (inclusive,
- InclusiveProjection,
- strict,
- StrictProjection)
-from .reference import (BoundReference,
- NamedReference)
-from .residual_evaluator import ResidualEvaluator
-from .strict_metrics_evaluator import StrictMetricsEvaluator
diff --git a/python_legacy/iceberg/api/expressions/binder.py b/python_legacy/iceberg/api/expressions/binder.py
deleted file mode 100644
index 3be2d46e03..0000000000
--- a/python_legacy/iceberg/api/expressions/binder.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .expressions import Expressions, ExpressionVisitors
-from .predicate import BoundPredicate
-
-
-class Binder(object):
-
- @staticmethod
- def bind(struct, expr, case_sensitive=True):
- return ExpressionVisitors.visit(expr, Binder.BindVisitor(struct, case_sensitive))
-
- @staticmethod
- def bound_references(struct, exprs, case_sensitive=True):
- if exprs is None:
- return set()
- visitor = Binder.ReferenceVisitor()
- for expr in exprs:
- ExpressionVisitors.visit(Binder.bind(struct, expr, case_sensitive), visitor)
-
- return visitor.references
-
- def __init__(self):
- pass
-
- class BindVisitor(ExpressionVisitors.ExpressionVisitor):
-
- def __init__(self, struct, case_sensitive=True):
- self.struct = struct
- self.case_sensitive = case_sensitive
-
- def always_true(self):
- return Expressions.always_true()
-
- def always_false(self):
- return Expressions.always_false()
-
- def not_(self, result):
- return Expressions.not_(result)
-
- def and_(self, left_result, right_result):
- return Expressions.and_(left_result, right_result)
-
- def or_(self, left_result, right_result):
- return Expressions.or_(left_result, right_result)
-
- def predicate(self, pred):
- if isinstance(pred, BoundPredicate):
- raise RuntimeError("Found already bound predicate: {}".format(pred))
-
- return pred.bind(self.struct, self.case_sensitive)
-
- class ReferenceVisitor(ExpressionVisitors.ExpressionVisitor):
-
- def __init__(self):
- self.references = set()
-
- def always_true(self):
- return self.references
-
- def always_false(self):
- return self.references
-
- def not_(self, result):
- return self.references
-
- def and_(self, left_result, right_result):
- return self.references
-
- def or_(self, left_result, right_result):
- return self.references
-
- def predicate(self, pred):
- if isinstance(pred, BoundPredicate):
- self.references.add(pred.ref.field_id)
- return self.references
diff --git a/python_legacy/iceberg/api/expressions/evaluator.py b/python_legacy/iceberg/api/expressions/evaluator.py
deleted file mode 100644
index 2371624e15..0000000000
--- a/python_legacy/iceberg/api/expressions/evaluator.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import threading
-
-from .binder import Binder
-from .expressions import ExpressionVisitors
-
-
-class Evaluator(object):
-
- def __init__(self, struct, unbound, case_sensitive=True):
- self.expr = Binder.bind(struct, unbound, case_sensitive)
- self.thread_local_data = threading.local()
-
- def _visitor(self):
- if not hasattr(self.thread_local_data, "visitors"):
- self.thread_local_data.visitors = Evaluator.EvalVisitor()
-
- return self.thread_local_data.visitors
-
- def eval(self, data):
- return self._visitor().eval(data, self.expr)
-
- class EvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
-
- def __init__(self):
- super(Evaluator.EvalVisitor, self).__init__()
- self.struct = None
-
- def eval(self, row, expr):
- self.struct = row
- return ExpressionVisitors.visit(expr, self)
-
- def always_true(self):
- return True
-
- def always_false(self):
- return False
-
- def not_(self, result):
- return not result
-
- def and_(self, left_result, right_result):
- return left_result and right_result
-
- def or_(self, left_result, right_result):
- return left_result or right_result
-
- def is_null(self, ref):
- return ref.get(self.struct) is None
-
- def not_null(self, ref):
- return not (ref.get(self.struct) is None)
-
- def lt(self, ref, lit):
- return ref.get(self.struct) < lit.value
-
- def lt_eq(self, ref, lit):
- return ref.get(self.struct) <= lit.value
-
- def gt(self, ref, lit):
- return ref.get(self.struct) > lit.value
-
- def gt_eq(self, ref, lit):
- return ref.get(self.struct) >= lit.value
-
- def eq(self, ref, lit):
- return ref.get(self.struct) == lit.value
-
- def not_eq(self, ref, lit):
- return ref.get(self.struct) != lit.value
-
- def in_(self, ref, lit):
- raise NotImplementedError()
-
- def not_in(self, ref, lit):
- return not self.in_(ref, lit.value)
diff --git a/python_legacy/iceberg/api/expressions/expression.py b/python_legacy/iceberg/api/expressions/expression.py
deleted file mode 100644
index 3dc0f06a31..0000000000
--- a/python_legacy/iceberg/api/expressions/expression.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from enum import Enum
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from .predicate import Predicate
-
-
-class Expression(object):
-
- left: 'Predicate'
- right: 'Predicate'
- child: 'Predicate'
-
- def __init__(self):
- pass
-
- def op(self):
- raise RuntimeError("No implementation for base class")
-
- def negate(self):
- raise RuntimeError("%s cannot be negated" % self)
-
-
-class Operation(Enum):
- TRUE = "TRUE"
- FALSE = "FALSE"
- IS_NULL = "IS_NULL"
- NOT_NULL = "NOT_NULL"
- IS_NAN = "IS_NAN"
- NOT_NAN = "NOT_NAN"
- LT = "LT"
- LT_EQ = "LT_EQ"
- GT = "GT"
- GT_EQ = "GT_EQ"
- EQ = "EQ"
- NOT_EQ = "NOT_EQ"
- IN = "IN"
- NOT_IN = "NOT_IN"
- NOT = "NOT"
- AND = "AND"
- OR = "OR"
-
- def negate(self): # noqa
- if self == Operation.IS_NULL:
- return Operation.NOT_NULL
- elif self == Operation.NOT_NULL:
- return Operation.IS_NULL
- elif self == Operation.IS_NAN:
- return Operation.NOT_NAN
- elif self == Operation.NOT_NAN:
- return Operation.IS_NAN
- elif self == Operation.LT:
- return Operation.GT_EQ
- elif self == Operation.LT_EQ:
- return Operation.GT
- elif self == Operation.GT:
- return Operation.LT_EQ
- elif self == Operation.GT_EQ:
- return Operation.LT
- elif self == Operation.EQ:
- return Operation.NOT_EQ
- elif self == Operation.NOT_EQ:
- return Operation.EQ
- elif self == Operation.IN:
- return Operation.NOT_IN
- elif self == Operation.NOT_IN:
- return Operation.IN
- else:
- raise RuntimeError("No negation for operation: %s" % self)
-
- def flipLR(self):
- if self == Operation.LT:
- return Operation.GT
- elif self == Operation.LT_EQ:
- return Operation.GT_EQ
- elif self == Operation.GT:
- return Operation.LT
- elif self == Operation.GT_EQ:
- return Operation.LT_EQ
- elif self == Operation.EQ:
- return Operation.EQ
- elif self == Operation.NOT_EQ:
- return Operation.NOT_EQ
- elif self == Operation.AND:
- return Operation.AND
- elif self == Operation.OR:
- return Operation.OR
- else:
- raise RuntimeError("No left-right flip for operation: %s" % self)
-
- def op(self):
- pass
-
-
-class And(Expression):
-
- def __init__(self, left, right):
- self.left = left
- self.right = right
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, And):
- return False
-
- return self.left == other.left and self.right == other.right
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def op(self):
- return Operation.AND
-
- def negate(self):
- from .expressions import Expressions
- return Expressions.or_(self.left.negate(), self.right.negate()) # noqa
-
- def __repr__(self):
- return "And({},{})".format(self.left, self.right)
-
- def __str__(self):
- return '({} and {})'.format(self.left, self.right)
-
-
-class FalseExp(Expression):
-
- def op(self):
- return Operation.FALSE
-
- def negate(self):
- return TRUE
-
- def __repr__(self):
- return "false"
-
- def __str__(self):
- return self.__repr__()
-
- def __eq__(self, other):
- if isinstance(other, FalseExp):
- return True
-
- return False
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
-
-class Or(Expression):
-
- def __init__(self, left, right):
- self.left = left
- self.right = right
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, Or):
- return False
-
- return self.left == other.left and self.right == other.right
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def op(self):
- return Operation.OR
-
- def negate(self):
- from .expressions import Expressions
- return Expressions.and_(self.left.negate(), self.right.negate()) # noqa
-
- def __repr__(self):
- return "Or({},{})".format(self.left, self.right)
-
- def __str__(self):
- return '({} or {})'.format(self.left, self.right)
-
-
-class Not(Expression):
-
- def __init__(self, child):
- self.child = child
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, Not):
- return False
-
- return self.child == other.child
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def op(self):
- return Operation.NOT
-
- def negate(self):
- return self.child
-
- def __repr__(self):
- return "Not({})".format(self.child)
-
- def __str__(self):
- return 'not({})'.format(self.child)
-
-
-class TrueExp(Expression):
-
- def op(self):
- return Operation.TRUE
-
- def negate(self):
- return False
-
- def __repr__(self):
- return "true"
-
- def __str__(self):
- return self.__repr__()
-
- def __eq__(self, other):
- if isinstance(other, TrueExp):
- return True
-
- return False
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
-
-TRUE = TrueExp()
-FALSE = FalseExp()
diff --git a/python_legacy/iceberg/api/expressions/expression_parser.py b/python_legacy/iceberg/api/expressions/expression_parser.py
deleted file mode 100644
index e231747d87..0000000000
--- a/python_legacy/iceberg/api/expressions/expression_parser.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Derived from the SimpleSQL Parser example in pyparsing, retrofitted to just handle the
-# where clause predicates
-# https://github.com/pyparsing/pyparsing/blob/master/examples/simpleSQL.py
-
-import logging
-
-from pyparsing import (
- alphanums,
- alphas,
- CaselessKeyword,
- delimitedList,
- Group,
- infixNotation,
- oneOf,
- opAssoc,
- pyparsing_common as ppc,
- quotedString,
- removeQuotes,
- Word
-)
-
-
-_logger = logging.getLogger(__name__)
-
-AND, OR, IN, IS, NOT, NULL, BETWEEN = map(
- CaselessKeyword, "and or in is not null between".split()
-)
-NOT_NULL = NOT + NULL
-
-ident = Word(alphas, alphanums + "_$").setName("identifier")
-columnName = delimitedList(ident, ".", combine=True).setName("column name")
-
-binop = oneOf("= == != < > >= <= eq ne lt le gt ge <>", caseless=False)
-realNum = ppc.real()
-intNum = ppc.signed_integer()
-
-columnRval = (realNum
- | intNum
- | quotedString.setParseAction(removeQuotes)
- | columnName) # need to add support for alg expressions
-whereCondition = Group(
- (columnName + binop + columnRval)
- | (columnName + IN + Group("(" + delimitedList(columnRval) + ")"))
- | (columnName + IS + (NULL | NOT_NULL))
- | (columnName + BETWEEN + columnRval + AND + columnRval)
-
-)
-
-whereExpression = infixNotation(
- Group(whereCondition
- | NOT + whereCondition
- | NOT + Group('(' + whereCondition + ')')
- | NOT + columnName),
- [(NOT, 1, opAssoc.LEFT), (AND, 2, opAssoc.LEFT), (OR, 2, opAssoc.LEFT), (IS, 2, opAssoc.LEFT)],
-)
-
-op_map = {"=": "eq",
- "==": "eq",
- "eq": "eq",
- ">": "gt",
- "gt": "gt",
- ">=": "gte",
- "gte": "gte",
- "<": "lt",
- "lt": "lt",
- "<=": "lte",
- "lte": "lte",
- "!": "not",
- "not": "not",
- "!=": "neq",
- "<>": "neq",
- "neq": "neq",
- "||": "or",
- "or": "or",
- "&&": "and",
- "and": "and",
- "in": "in",
- "between": "between",
- "is": "is"}
-
-
-def get_expr_tree(tokens):
- if isinstance(tokens, (str, int)):
- return tokens
- if len(tokens) > 1:
- if (tokens[0] == "not"):
- return {"not": get_expr_tree(tokens[1])}
- if (tokens[0] == "(" and tokens[-1] == ")"):
- return get_expr_tree(tokens[1:-1])
- else:
- return get_expr_tree(tokens[0])
-
- op = op_map[tokens[1]]
-
- if op == "in":
- return {'in': [get_expr_tree(tokens[0]), [token for token in tokens[2][1:-1]]]}
- elif op == "between":
- return {'and': [{"gte": [get_expr_tree(tokens[0]), tokens[2]]},
- {"lte": [get_expr_tree(tokens[0]), tokens[4]]}]}
- elif op == "is":
-
- if tokens[2] == 'null':
- return {"missing": tokens[0]}
- else:
- return {"exists": tokens[0]}
- if len(tokens) > 3:
- binary_tuples = get_expr_tree(tokens[2:])
- else:
- binary_tuples = get_expr_tree(tokens[2])
-
- return {op: [get_expr_tree(tokens[0]),
- binary_tuples]}
-
-
-def get_expr(node, expr_map):
- if isinstance(node, dict):
- for i in node.keys():
- op = i
- if op == "literal":
- return node["literal"]
- mapped_op = expr_map.get(op, expr_map)
- if len(mapped_op) == 1:
- mapped_op = mapped_op[0]
- if mapped_op is None:
- raise RuntimeError("no mapping for op: %s" % op)
- if op in ("not", "exists", "missing"):
- return mapped_op(get_expr(node[op], expr_map))
-
- return mapped_op(*get_expr(node[op], expr_map))
- elif isinstance(node, (list, tuple)):
- return (get_expr(item, expr_map) for item in node)
- elif isinstance(node, (str, int, float)):
- return node
- else:
- raise RuntimeError("unknown node type" % node)
-
-
-def parse_expr_string(predicate_string, expr_map):
- from pyparsing import ParseException
-
- try:
- expr = whereExpression.parseString(predicate_string, parseAll=True)
- expr = get_expr_tree(expr)
- return get_expr(expr, expr_map)
- except ParseException as pe:
- _logger.error("Error parsing string expression into iceberg expression: %s" % str(pe))
- raise
diff --git a/python_legacy/iceberg/api/expressions/expressions.py b/python_legacy/iceberg/api/expressions/expressions.py
deleted file mode 100644
index 4bcdc2cd12..0000000000
--- a/python_legacy/iceberg/api/expressions/expressions.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import logging
-
-from .expression import (And,
- FALSE,
- Not,
- Operation,
- Or,
- TRUE)
-from .expression_parser import parse_expr_string
-from .predicate import (Predicate,
- UnboundPredicate)
-from .reference import NamedReference
-
-_logger = logging.getLogger(__name__)
-
-
-class Expressions(object):
-
- @staticmethod
- def and_(left, right):
- if left == Expressions.always_false() or right == Expressions.always_false():
- return Expressions.always_false()
- elif left == Expressions.always_true():
- return right
- elif right == Expressions.always_true():
- return left
-
- return And(left, right)
-
- @staticmethod
- def or_(left, right):
- if left == Expressions.always_true() or right == Expressions.always_true():
- return Expressions.always_true()
- elif left == Expressions.always_false():
- return right
- elif right == Expressions.always_false():
- return left
-
- return Or(left, right)
-
- @staticmethod
- def not_(child):
- if child == Expressions.always_true():
- return Expressions.always_false()
- elif child == Expressions.always_false():
- return Expressions.always_true()
- elif isinstance(child, Not):
- return child.child
-
- return Not(child)
-
- @staticmethod
- def is_null(name):
- return UnboundPredicate(Operation.IS_NULL, Expressions.ref(name))
-
- @staticmethod
- def not_null(name):
- return UnboundPredicate(Operation.NOT_NULL, Expressions.ref(name))
-
- @staticmethod
- def is_nan(name):
- return UnboundPredicate(Operation.IS_NAN, Expressions.ref(name))
-
- @staticmethod
- def not_nan(name):
- return UnboundPredicate(Operation.NOT_NAN, Expressions.ref(name))
-
- @staticmethod
- def less_than(name, value):
- return UnboundPredicate(Operation.LT, Expressions.ref(name), value)
-
- @staticmethod
- def less_than_or_equal(name, value):
- return UnboundPredicate(Operation.LT_EQ, Expressions.ref(name), value)
-
- @staticmethod
- def greater_than(name, value):
- return UnboundPredicate(Operation.GT, Expressions.ref(name), value)
-
- @staticmethod
- def greater_than_or_equal(name, value):
- return UnboundPredicate(Operation.GT_EQ, Expressions.ref(name), value)
-
- @staticmethod
- def equal(name, value):
- return UnboundPredicate(Operation.EQ, Expressions.ref(name), value)
-
- @staticmethod
- def not_equal(name, value):
- return UnboundPredicate(Operation.NOT_EQ, Expressions.ref(name), value)
-
- @staticmethod
- def predicate(op, name, value=None, lit=None):
- if value is not None and op not in (Operation.IS_NULL, Operation.NOT_NULL):
- return UnboundPredicate(op, Expressions.ref(name), value)
- elif lit is not None and op not in (Operation.IS_NULL, Operation.NOT_NULL):
- return UnboundPredicate(op, Expressions.ref(name), value)
- elif op in (Operation.IS_NULL, Operation.NOT_NULL):
- if value is not None or lit is not None:
- raise RuntimeError("Cannot create {} predicate inclusive a value".format(op))
- return UnboundPredicate(op, Expressions.ref(name))
- else:
- raise RuntimeError("Cannot create {} predicate without a value".format(op))
-
- @staticmethod
- def always_true():
- return TRUE
-
- @staticmethod
- def always_false():
- return FALSE
-
- @staticmethod
- def rewrite_not(expr):
- return ExpressionVisitors.visit(expr, RewriteNot.get()) # noqa
-
- @staticmethod
- def ref(name):
- return NamedReference(name)
-
- @staticmethod
- def convert_string_to_expr(predicate_string):
- expr_map = {"and": (Expressions.and_,),
- "eq": (Expressions.equal,),
- "exists": (Expressions.not_null,),
- "gt": (Expressions.greater_than,),
- "gte": (Expressions.greater_than_or_equal,),
- "lt": (Expressions.less_than,),
- "lte": (Expressions.less_than_or_equal,),
- "missing": (Expressions.is_null,),
- "neq": (Expressions.not_equal,),
- "not": (Expressions.not_,),
- "or": (Expressions.or_,)}
-
- return parse_expr_string(predicate_string, expr_map)
-
-
-class ExpressionVisitors(object):
-
- @staticmethod
- def visit(expr, visitor):
- if isinstance(expr, Predicate):
- return visitor.predicate(expr)
-
- if expr.op() == Operation.TRUE:
- return visitor.always_true()
- elif expr.op() == Operation.FALSE:
- return visitor.always_false()
- elif expr.op() == Operation.NOT:
- return visitor.not_(ExpressionVisitors.visit(expr.child, visitor))
- elif expr.op() == Operation.AND:
- return visitor.and_(ExpressionVisitors.visit(expr.left, visitor),
- ExpressionVisitors.visit(expr.right, visitor))
- elif expr.op() == Operation.OR:
- return visitor.or_(ExpressionVisitors.visit(expr.left, visitor),
- ExpressionVisitors.visit(expr.right, visitor))
- else:
- raise RuntimeError("Unknown operation: {}".format(expr.op()))
-
- class ExpressionVisitor(object):
-
- def always_true(self):
- return NotImplementedError()
-
- def always_false(self):
- return NotImplementedError()
-
- def not_(self, result):
- return NotImplementedError()
-
- def and_(self, left_result, right_result):
- return NotImplementedError()
-
- def or_(self, left_result, right_result):
- return NotImplementedError()
-
- def predicate(self, pred):
- return NotImplementedError()
-
- class BoundExpressionVisitor(ExpressionVisitor):
-
- def __init__(self):
- super(ExpressionVisitors.BoundExpressionVisitor, self).__init__()
-
- def is_null(self, ref):
- return NotImplementedError()
-
- def not_null(self, ref):
- return NotImplementedError()
-
- def is_nan(self, ref):
- return NotImplementedError()
-
- def not_nan(self, ref):
- return NotImplementedError()
-
- def lt(self, ref, lit):
- return NotImplementedError()
-
- def lt_eq(self, ref, lit):
- return NotImplementedError()
-
- def gt(self, ref, lit):
- return NotImplementedError()
-
- def gt_eq(self, ref, lit):
- return None
-
- def eq(self, ref, lit):
- return None
-
- def not_eq(self, ref, lit):
- return None
-
- def in_(self, ref, lit):
- return None
-
- def not_in(self, ref, lit):
- return None
-
- def predicate(self, pred): # noqa
-
- if isinstance(pred, UnboundPredicate):
- raise RuntimeError("Not a bound Predicate: {}".format(pred))
-
- if pred.op == Operation.IS_NULL:
- return self.is_null(pred.ref)
- elif pred.op == Operation.NOT_NULL:
- return self.not_null(pred.ref)
- elif pred.op in [Operation.IS_NAN, Operation.NOT_NAN]:
- raise NotImplementedError("IS_NAN and NOT_NAN not fully implemented for expressions")
- elif pred.op == Operation.LT:
- return self.lt(pred.ref, pred.lit)
- elif pred.op == Operation.LT_EQ:
- return self.lt_eq(pred.ref, pred.lit)
- elif pred.op == Operation.GT:
- return self.gt(pred.ref, pred.lit)
- elif pred.op == Operation.GT_EQ:
- return self.gt_eq(pred.ref, pred.lit)
- elif pred.op == Operation.EQ:
- return self.eq(pred.ref, pred.lit)
- elif pred.op == Operation.NOT_EQ:
- return self.not_eq(pred.ref, pred.lit)
- elif pred.op == Operation.IN:
- return self.in_(pred.ref, pred.lit)
- elif pred.op == Operation.NOT_IN:
- return self.not_in(pred.ref, pred.lit)
- else:
- raise RuntimeError("Unknown operation for Predicate: {}".format(pred.op))
-
-
-class RewriteNot(ExpressionVisitors.ExpressionVisitor):
- __instance = None
-
- @staticmethod
- def get():
- if RewriteNot.__instance is None:
- RewriteNot()
- return RewriteNot.__instance
-
- def __init__(self):
- if RewriteNot.__instance is not None:
- raise Exception("Multiple RewriteNot Types created")
- RewriteNot.__instance = self
-
- def always_true(self):
- return Expressions.always_true()
-
- def always_false(self):
- return Expressions.always_false()
-
- def not_(self, result):
- return result.negate()
-
- def and_(self, left_result, right_result):
- return Expressions.and_(left_result, right_result)
-
- def or_(self, left_result, right_result):
- return Expressions.or_(left_result, right_result)
-
- def predicate(self, pred):
- return pred
diff --git a/python_legacy/iceberg/api/expressions/inclusive_manifest_evaluator.py b/python_legacy/iceberg/api/expressions/inclusive_manifest_evaluator.py
deleted file mode 100644
index 953ee99d08..0000000000
--- a/python_legacy/iceberg/api/expressions/inclusive_manifest_evaluator.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import threading
-
-from .binder import Binder
-from .expressions import Expressions, ExpressionVisitors
-from .projections import inclusive
-from ..types import Conversions
-
-ROWS_MIGHT_MATCH = True
-ROWS_CANNOT_MATCH = False
-
-
-class InclusiveManifestEvaluator(object):
-
- def __init__(self, spec, row_filter, case_sensitive=True):
- self.struct = spec.partition_type()
- self.expr = Binder.bind(self.struct,
- Expressions.rewrite_not(inclusive(spec, case_sensitive=case_sensitive)
- .project(row_filter)),
- case_sensitive=case_sensitive)
- self.thread_local_data = threading.local()
-
- def _visitor(self):
- if not hasattr(self.thread_local_data, "visitors"):
- self.thread_local_data.visitors = ManifestEvalVisitor(self.expr)
-
- return self.thread_local_data.visitors
-
- def eval(self, manifest):
- return self._visitor().eval(manifest)
-
-
-class ManifestEvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
-
- def __init__(self, expr):
- self.expr = expr
- self.stats = None
-
- def eval(self, manifest):
- self.stats = manifest.partitions
- if self.stats is None:
- return ROWS_MIGHT_MATCH
-
- return ExpressionVisitors.visit(self.expr, self)
-
- def always_true(self):
- return ROWS_MIGHT_MATCH
-
- def always_false(self):
- return ROWS_CANNOT_MATCH
-
- def not_(self, result):
- return not result
-
- def and_(self, left_result, right_result):
- return left_result and right_result
-
- def or_(self, left_result, right_result):
- return left_result or right_result
-
- def is_null(self, ref):
- if not self.stats[ref.pos].contains_null():
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def not_null(self, ref):
- lower_bound = self.stats[ref.pos].lower_bound()
- if lower_bound is None:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def lt(self, ref, lit):
- lower_bound = self.stats[ref.pos].lower_bound()
- if lower_bound is None:
- return ROWS_CANNOT_MATCH
-
- lower = Conversions.from_byte_buffer(ref.type, lower_bound)
-
- if lower >= lit.value:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def lt_eq(self, ref, lit):
- lower_bound = self.stats[ref.pos].lower_bound()
- if lower_bound is None:
- return ROWS_CANNOT_MATCH
-
- lower = Conversions.from_byte_buffer(ref.type, lower_bound)
-
- if lower > lit.value:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def gt(self, ref, lit):
- upper_bound = self.stats[ref.pos].upper_bound()
- if upper_bound is None:
- return ROWS_CANNOT_MATCH
-
- upper = Conversions.from_byte_buffer(ref.type, upper_bound)
-
- if upper <= lit.value:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def gt_eq(self, ref, lit):
- upper_bound = self.stats[ref.pos].upper_bound()
- if upper_bound is None:
- return ROWS_CANNOT_MATCH
-
- upper = Conversions.from_byte_buffer(ref.type, upper_bound)
-
- if upper < lit.value:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def eq(self, ref, lit):
- field_stats = self.stats[ref.pos]
- if field_stats.lower_bound() is None:
- return ROWS_CANNOT_MATCH
-
- lower = Conversions.from_byte_buffer(ref.type, field_stats.lower_bound())
- if lower > lit.value:
- return ROWS_CANNOT_MATCH
-
- upper = Conversions.from_byte_buffer(ref.type, field_stats.upper_bound())
-
- if upper < lit.value:
- return ROWS_CANNOT_MATCH
-
- return ROWS_MIGHT_MATCH
-
- def not_eq(self, ref, lit):
- return ROWS_MIGHT_MATCH
-
- def in_(self, ref, lit):
- return ROWS_MIGHT_MATCH
-
- def not_in(self, ref, lit):
- return ROWS_MIGHT_MATCH
diff --git a/python_legacy/iceberg/api/expressions/inclusive_metrics_evaluator.py b/python_legacy/iceberg/api/expressions/inclusive_metrics_evaluator.py
deleted file mode 100644
index 73b636fff0..0000000000
--- a/python_legacy/iceberg/api/expressions/inclusive_metrics_evaluator.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import threading
-
-from .expressions import Expressions, ExpressionVisitors
-from ..expressions.binder import Binder
-from ..types import Conversions
-
-
-class InclusiveMetricsEvaluator(object):
-
- def __init__(self, schema, unbound, case_sensitive=True):
- self.schema = schema
- self.struct = schema.as_struct()
- self.case_sensitive = case_sensitive
- self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound), case_sensitive)
- self.thread_local_data = threading.local()
-
- def _visitor(self):
- if not hasattr(self.thread_local_data, "visitors"):
- self.thread_local_data.visitors = MetricsEvalVisitor(self.expr, self.schema, self.struct)
-
- return self.thread_local_data.visitors
-
- def eval(self, file):
- return self._visitor().eval(file)
-
-
-class MetricsEvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
- ROWS_MIGHT_MATCH = True
- ROWS_CANNOT_MATCH = False
-
- def __init__(self, expr, schema, struct):
- self.expr = expr
- self.value_counts = None
- self.null_counts = None
- self.lower_bounds = None
- self.upper_bounds = None
- self.schema = schema
- self.struct = struct
-
- def eval(self, file):
- if file.record_count() <= 0:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- self.value_counts = file.value_counts()
- self.null_counts = file.null_value_counts()
- self.lower_bounds = file.lower_bounds()
- self.upper_bounds = file.upper_bounds()
-
- return ExpressionVisitors.visit(self.expr, self)
-
- def always_true(self):
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def always_false(self):
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- def not_(self, result):
- return not result
-
- def and_(self, left_result, right_result):
- return left_result and right_result
-
- def or_(self, left_result, right_result):
- return left_result or right_result
-
- def is_null(self, ref):
- id = ref.field.field_id
-
- if self.struct.field(id=id) is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.null_counts is not None and self.null_counts.get(id, -1) == 0:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def not_null(self, ref):
- id = ref.field.field_id
-
- if self.struct.field(id=id) is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.value_counts is not None and id in self.value_counts and id in self.null_counts \
- and self.value_counts.get(id) - self.null_counts.get(id) == 0:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def lt(self, ref, lit):
- id = ref.field.field_id
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower >= lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def lt_eq(self, ref, lit):
- id = ref.field.field_id
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower > lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def gt(self, ref, lit):
- id = ref.field.field_id
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
- if upper <= lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def gt_eq(self, ref, lit):
- id = ref.field.field_id
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
- if upper < lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def eq(self, ref, lit):
- id = ref.field.field_id
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower > lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
- if upper < lit.value:
- return MetricsEvalVisitor.ROWS_CANNOT_MATCH
-
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def not_eq(self, ref, lit):
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def in_(self, ref, lit):
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
-
- def not_in(self, ref, lit):
- return MetricsEvalVisitor.ROWS_MIGHT_MATCH
diff --git a/python_legacy/iceberg/api/expressions/java_variables/__init__.py b/python_legacy/iceberg/api/expressions/java_variables/__init__.py
deleted file mode 100644
index 1ee068e2f7..0000000000
--- a/python_legacy/iceberg/api/expressions/java_variables/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-JAVA_MAX_INT = 2147483647
-JAVA_MIN_INT = -2147483648
-JAVA_MAX_FLOAT = 3.4028235E38
-JAVA_MIN_FLOAT = -3.4028235E38
diff --git a/python_legacy/iceberg/api/expressions/literals.py b/python_legacy/iceberg/api/expressions/literals.py
deleted file mode 100644
index 483ff8f61e..0000000000
--- a/python_legacy/iceberg/api/expressions/literals.py
+++ /dev/null
@@ -1,594 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import datetime
-from decimal import (Decimal,
- ROUND_HALF_UP)
-import uuid
-
-import pytz
-
-
-from .expression import (FALSE,
- TRUE)
-from .java_variables import (JAVA_MAX_FLOAT,
- JAVA_MIN_FLOAT)
-from ..types.conversions import Conversions
-from ..types.type import TypeID
-
-
-class Literals(object):
-
- EPOCH = datetime.datetime.utcfromtimestamp(0)
- EPOCH_DAY = EPOCH.date()
-
- @staticmethod
- def from_(value): # noqa: C901
- if value is None:
- raise RuntimeError("Cannot create an expression literal from None")
- if isinstance(value, bool):
- return BooleanLiteral(value)
- elif isinstance(value, int):
- if Literal.JAVA_MIN_INT < value < Literal.JAVA_MAX_INT:
- return IntegerLiteral(value)
- return LongLiteral(value)
- elif isinstance(value, float):
- if Literal.JAVA_MIN_FLOAT < value < Literal.JAVA_MAX_FLOAT:
- return FloatLiteral(value)
- return DoubleLiteral(value)
- elif isinstance(value, str):
- return StringLiteral(value)
- elif isinstance(value, uuid.UUID):
- return UUIDLiteral(value)
- elif isinstance(value, bytearray):
- return BinaryLiteral(value)
- elif isinstance(value, bytes):
- return FixedLiteral(value)
- elif isinstance(value, Decimal):
- return DecimalLiteral(value)
- else:
- raise NotImplementedError("Unimplemented Type Literal for value: %s" % value)
-
- @staticmethod
- def above_max():
- return ABOVE_MAX
-
- @staticmethod
- def below_min():
- return BELOW_MIN
-
-
-class Literal(object):
- JAVA_MAX_INT = 2147483647
- JAVA_MIN_INT = -2147483648
- JAVA_MAX_FLOAT = 3.4028235E38
- JAVA_MIN_FLOAT = -3.4028235E38
-
- @staticmethod
- def of(value): # noqa: C901
-
- if isinstance(value, bool):
- return BooleanLiteral(value)
- elif isinstance(value, int):
- if value < Literal.JAVA_MIN_INT or value > Literal.JAVA_MAX_INT:
- return LongLiteral(value)
- return IntegerLiteral(value)
- elif isinstance(value, float):
- if value < Literal.JAVA_MIN_FLOAT or value > Literal.JAVA_MAX_FLOAT:
- return DoubleLiteral(value)
- return FloatLiteral(value)
- elif isinstance(value, str):
- return StringLiteral(value)
- elif isinstance(value, uuid.UUID):
- return UUIDLiteral(value)
- elif isinstance(value, bytes):
- return FixedLiteral(value)
- elif isinstance(value, bytearray):
- return BinaryLiteral(value)
- elif isinstance(value, Decimal):
- return DecimalLiteral(value)
-
- def to(self, type_var):
- raise NotImplementedError()
-
- def to_byte_buffer(self):
- raise NotImplementedError()
-
-
-class BaseLiteral(Literal):
- def __init__(self, value, type_id):
- self.value = value
- self.byte_buffer = None
- self.type_id = type_id
-
- def to(self, type_var):
- raise NotImplementedError()
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, BaseLiteral):
- return False
-
- return self.value == other.value
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __repr__(self):
- return "BaseLiteral(%s)" % str(self.value)
-
- def __str__(self):
- return str(self.value)
-
- def to_byte_buffer(self):
- if self.byte_buffer is None:
- self.byte_buffer = Conversions.to_byte_buffer(self.type_id, self.value)
-
- return self.byte_buffer
-
-
-class ComparableLiteral(BaseLiteral):
-
- def __init__(self, value, type_id):
- super(ComparableLiteral, self).__init__(value, type_id)
-
- def to(self, type):
- raise NotImplementedError()
-
- def __eq__(self, other):
- return self.value == other.value
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __lt__(self, other):
- if self.value is None:
- return True
-
- if other is None or other.value is None:
- return False
-
- return self.value < other.value
-
- def __gt__(self, other):
- if self.value is None:
- return False
-
- if other is None or other.value is None:
- return True
-
- return self.value > other.value
-
- def __le__(self, other):
- if self.value is None:
- return True
-
- if other is None or other.value is None:
- return False
-
- return self.value <= other.value
-
- def __ge__(self, other):
- if self.value is None:
- return False
-
- if other is None or other.value is None:
- return True
-
- return self.value >= other.value
-
-
-class AboveMax(Literal):
- def __init__(self):
- super(AboveMax, self).__init__()
-
- def value(self):
- raise RuntimeError("AboveMax has no value")
-
- def to(self, type):
- raise RuntimeError("Cannot change the type of AboveMax")
-
- def __str__(self):
- return "aboveMax"
-
-
-class BelowMin(Literal):
- def __init__(self):
- super(BelowMin, self).__init__()
-
- def value(self):
- raise RuntimeError("BelowMin has no value")
-
- def to(self, type):
- raise RuntimeError("Cannot change the type of BelowMin")
-
- def __str__(self):
- return "belowMin"
-
-
-class BooleanLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(BooleanLiteral, self).__init__(value, TypeID.BOOLEAN)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.BOOLEAN:
- return self
-
-
-class IntegerLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(IntegerLiteral, self).__init__(value, TypeID.INTEGER)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.INTEGER:
- return self
- elif type_var.type_id == TypeID.LONG:
- return LongLiteral(self.value)
- elif type_var.type_id == TypeID.FLOAT:
- return FloatLiteral(float(self.value))
- elif type_var.type_id == TypeID.DOUBLE:
- return DoubleLiteral(float(self.value))
- elif type_var.type_id == TypeID.DATE:
- return DateLiteral(self.value)
- elif type_var.type_id == TypeID.DECIMAL:
- if type_var.scale == 0:
- return DecimalLiteral(Decimal(self.value))
- else:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP))
-
-
-class LongLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(LongLiteral, self).__init__(value, TypeID.LONG)
-
- def to(self, type_var): # noqa: C901
- if type_var.type_id == TypeID.INTEGER:
- if Literal.JAVA_MAX_INT < self.value:
- return ABOVE_MAX
- elif Literal.JAVA_MIN_INT > self.value:
- return BELOW_MIN
-
- return IntegerLiteral(self.value)
- elif type_var.type_id == TypeID.LONG:
- return self
- elif type_var.type_id == TypeID.FLOAT:
- return FloatLiteral(float(self.value))
- elif type_var.type_id == TypeID.DOUBLE:
- return DoubleLiteral(float(self.value))
- elif type_var.type_id == TypeID.TIME:
- return TimeLiteral(self.value)
- elif type_var.type_id == TypeID.TIMESTAMP:
- return TimestampLiteral(self.value)
- elif type_var.type_id == TypeID.DECIMAL:
- if type_var.scale == 0:
- return DecimalLiteral(Decimal(self.value))
- else:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP))
-
-
-class FloatLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(FloatLiteral, self).__init__(value, TypeID.FLOAT)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.FLOAT:
- return self
- elif type_var.type_id == TypeID.DOUBLE:
- return DoubleLiteral(self.value)
- elif type_var.type_id == TypeID.DECIMAL:
- if type_var.scale == 0:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal('1.'),
- rounding=ROUND_HALF_UP))
- else:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP))
-
-
-class DoubleLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(DoubleLiteral, self).__init__(value, TypeID.DOUBLE)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.FLOAT:
- if JAVA_MAX_FLOAT < self.value:
- return ABOVE_MAX
- elif JAVA_MIN_FLOAT > self.value:
- return BELOW_MIN
-
- return FloatLiteral(self.value)
- elif type_var.type_id == TypeID.DOUBLE:
- return self
- elif type_var.type_id == TypeID.DECIMAL:
- if type_var.scale == 0:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal('1.'),
- rounding=ROUND_HALF_UP))
- else:
- return DecimalLiteral(Decimal(self.value)
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP))
-
-
-class DateLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(DateLiteral, self).__init__(value, TypeID.DATE)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.DATE:
- return self
-
-
-class TimeLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(TimeLiteral, self).__init__(value, TypeID.TIME)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.TIME:
- return self
-
-
-class TimestampLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(TimestampLiteral, self).__init__(value, TypeID.TIMESTAMP)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.TIMESTAMP:
- return self
- elif type_var.type_id == TypeID.DATE:
- return DateLiteral((datetime.datetime.fromtimestamp(self.value / 1000000) - Literals.EPOCH).days)
-
-
-class DecimalLiteral(ComparableLiteral):
-
- def __init__(self, value):
- super(DecimalLiteral, self).__init__(value, TypeID.DECIMAL)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.DECIMAL and type_var.scale == abs(self.value.as_tuple().exponent):
- return self
-
-
-class StringLiteral(BaseLiteral):
- def __init__(self, value):
- super(StringLiteral, self).__init__(value, TypeID.STRING)
-
- def to(self, type_var): # noqa: C901
- value_upper = self.value.upper()
- import dateutil.parser
- if type_var.type_id == TypeID.DATE:
- return DateLiteral((dateutil.parser.parse(self.value) - Literals.EPOCH).days)
- elif type_var.type_id == TypeID.TIME:
- return TimeLiteral(
- int((dateutil.parser.parse(Literals.EPOCH.strftime("%Y-%m-%d ") + self.value) - Literals.EPOCH)
- .total_seconds() * 1000000))
- elif type_var.type_id == TypeID.TIMESTAMP:
- timestamp = dateutil.parser.parse(self.value)
- EPOCH = Literals.EPOCH
- if bool(timestamp.tzinfo) != bool(type_var.adjust_to_utc):
- raise RuntimeError("Cannot convert to %s when string is: %s" % (type_var, self.value))
-
- if timestamp.tzinfo is not None:
- EPOCH = EPOCH.replace(tzinfo=pytz.UTC)
-
- return TimestampLiteral(int((timestamp - EPOCH).total_seconds() * 1000000))
- elif type_var.type_id == TypeID.STRING:
- return self
- elif type_var.type_id == TypeID.UUID:
- return UUIDLiteral(uuid.UUID(self.value))
- elif type_var.type_id == TypeID.DECIMAL:
- dec_val = Decimal(str(self.value))
- if abs(dec_val.as_tuple().exponent) == type_var.scale:
- if type_var.scale == 0:
- return DecimalLiteral(Decimal(str(self.value))
- .quantize(Decimal('1.'),
- rounding=ROUND_HALF_UP))
- else:
- return DecimalLiteral(Decimal(str(self.value))
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP))
- elif type_var.type_id == TypeID.BOOLEAN and value_upper in ["TRUE", "FALSE"]:
- return BooleanLiteral(value_upper == "TRUE")
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, StringLiteral):
- return False
-
- return self.value == other.value
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __lt__(self, other):
- if other is None:
- return False
-
- return self.value < other.value
-
- def __gt__(self, other):
- if other is None:
- return True
-
- return self.value > other.value
-
- def __le__(self, other):
- if other is None:
- return False
-
- return self.value <= other.value
-
- def __ge__(self, other):
- if other is None:
- return True
-
- return self.value >= other.value
-
- def __str__(self):
- return '"' + self.value + '"'
-
-
-class UUIDLiteral(ComparableLiteral):
- def __init__(self, value):
- super(UUIDLiteral, self).__init__(value, TypeID.UUID)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.UUID:
- return self
-
-
-class FixedLiteral(BaseLiteral):
- def __init__(self, value):
- super(FixedLiteral, self).__init__(value, TypeID.FIXED)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.FIXED:
- if len(self.value) == type_var.length:
- return self
- elif type_var.type_id == TypeID.BINARY:
- return BinaryLiteral(self.value)
-
- def write_replace(self):
- return FixedLiteralProxy(self.value)
-
- def __eq__(self, other):
- return self.value == other.value
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __lt__(self, other):
- if other is None:
- return False
-
- return self.value < other.value
-
- def __gt__(self, other):
- if other is None:
- return True
-
- return self.value > other.value
-
- def __le__(self, other):
- if other is None:
- return False
-
- return self.value <= other.value
-
- def __ge__(self, other):
- if other is None:
- return True
-
- return self.value >= other.value
-
-
-class BinaryLiteral(BaseLiteral):
- def __init__(self, value):
- super(BinaryLiteral, self).__init__(value, TypeID.BINARY)
-
- def to(self, type_var):
- if type_var.type_id == TypeID.FIXED:
- if type_var.length == len(self.value):
- return FixedLiteral(self.value)
- return None
- elif type_var.type_id == TypeID.BINARY:
- return self
-
- def write_replace(self):
- return BinaryLiteralProxy(self.value)
-
- def __eq__(self, other):
- return self.value == other.value
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __lt__(self, other):
- if other is None:
- return False
-
- return self.value < other.value
-
- def __gt__(self, other):
- if other is None:
- return True
-
- return self.value > other.value
-
- def __le__(self, other):
- if other is None:
- return False
-
- return self.value <= other.value
-
- def __ge__(self, other):
- if other is None:
- return True
-
- return self.value >= other.value
-
-
-class FixedLiteralProxy(object):
-
- def __init__(self, buffer=None):
- if buffer is not None:
- self.bytes = list(buffer)
-
- def read_resolve(self):
- return FixedLiteral(self.bytes)
-
-
-class ConstantExpressionProxy(object):
-
- def __init__(self, true_or_false=None):
- if true_or_false is not None:
- self.true_or_false = true_or_false
-
- def read_resolve(self):
- if self.true_or_false:
- return TRUE
- else:
- return FALSE
-
-
-class BinaryLiteralProxy(FixedLiteralProxy):
-
- def __init__(self, buffer=None):
- super(BinaryLiteralProxy, self).__init__(buffer)
-
- def read_resolve(self):
- return BinaryLiteral(self.bytes)
-
-
-ABOVE_MAX = AboveMax()
-BELOW_MIN = BelowMin()
diff --git a/python_legacy/iceberg/api/expressions/predicate.py b/python_legacy/iceberg/api/expressions/predicate.py
deleted file mode 100644
index e868a07243..0000000000
--- a/python_legacy/iceberg/api/expressions/predicate.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import annotations
-
-from math import isnan
-from typing import Any, List, Optional, TYPE_CHECKING, Union
-
-from iceberg.exceptions import ValidationException
-
-from .expression import (Expression,
- Operation)
-from .literals import (BaseLiteral,
- Literals)
-from .term import BoundTerm, UnboundTerm
-from ..types import TypeID
-
-if TYPE_CHECKING:
- from iceberg.api import StructLike
-
-
-class Predicate(Expression):
-
- def __init__(self, op: Operation, term: Union[BoundTerm, UnboundTerm]):
- if term is None:
- raise ValueError("Term cannot be None")
-
- self.op: Operation = op
- self.term: Union[BoundTerm, UnboundTerm] = term
-
- @property
- def ref(self):
- return self.term.ref
-
- @property
- def lit(self):
- raise NotImplementedError("Not Implemented for base class")
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, Predicate):
- return False
-
- return self.op == other.op and self.ref == other.ref and self.lit == other.lit
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __repr__(self):
- return "Predicate({},{},{})".format(self.op, self.ref, self.lit)
-
- def __str__(self):
- if self.op == Operation.IS_NULL:
- return "is_null({})".format(self.ref)
- elif self.op == Operation.NOT_NULL:
- return "not_null({})".format(self.ref)
- elif self.op == Operation.LT:
- return "less_than({})".format(self.ref)
- elif self.op == Operation.LT_EQ:
- return "less_than_equal({})".format(self.ref)
- elif self.op == Operation.GT:
- return "greater_than({})".format(self.ref)
- elif self.op == Operation.GT_EQ:
- return "greater_than_equal({})".format(self.ref)
- elif self.op == Operation.EQ:
- return "equal({})".format(self.ref)
- elif self.op == Operation.NOT_EQ:
- return "not_equal({})".format(self.ref)
- else:
- return "invalid predicate: operation = {}".format(self.op)
-
-
-class BoundPredicate(Predicate):
-
- def __init__(self, op: Operation, term: BoundTerm, lit: BaseLiteral = None, literals: List[BaseLiteral] = None,
- is_unary_predicate: bool = False, is_literal_predicate: bool = False,
- is_set_predicate: bool = False):
- self.is_unary_predicate = is_unary_predicate
- self.is_literal_predicate = is_literal_predicate
- self.is_set_predicate = is_set_predicate
-
- super(BoundPredicate, self).__init__(op, term)
- ValidationException.check(sum([is_unary_predicate, is_literal_predicate, is_set_predicate]) == 1,
- "Only a single predicate type may be set: %s=%s, %s=%s, %s=%s",
- ("is_unary_predicate", is_unary_predicate,
- "is_literal_predicate", is_literal_predicate,
- "is_set_predicate", is_set_predicate))
-
- self._literals: Optional[List[BaseLiteral]] = None
- if self.is_unary_predicate:
- ValidationException.check(lit is None, "Unary Predicates may not have a literal", ())
-
- elif self.is_literal_predicate:
- ValidationException.check(lit is not None, "Literal Predicates must have a literal set", ())
- self._literals = [lit] # type: ignore
-
- elif self.is_set_predicate:
- ValidationException.check(literals is not None, "Set Predicates must have literals set", ())
- self._literals = literals
- else:
- raise ValueError(f"Unable to instantiate {op} -> (lit={lit}, literal={literals}")
-
- @property
- def lit(self) -> Optional[BaseLiteral]:
- if self._literals is None or len(self._literals) == 0:
- return None
- return self._literals[0]
-
- def eval(self, struct: StructLike) -> bool:
- ValidationException.check(isinstance(self.term, BoundTerm), "Term must be bound to eval: %s", (self.term))
- return self.test(self.term.eval(struct)) # type: ignore
-
- def test(self, struct: StructLike = None, value: Any = None) -> bool:
- ValidationException.check(struct is None or value is None, "Either struct or value must be none", ())
- if struct is not None:
- ValidationException.check(isinstance(self.term, BoundTerm), "Term must be bound to eval: %s", (self.term))
- return self.test(value=self.term.eval(struct)) # type: ignore
- else:
- if self.is_unary_predicate:
- return self.test_unary_predicate(value)
- elif self.is_literal_predicate:
- return self.test_literal_predicate(value)
- else:
- return self.test_set_predicate(value)
-
- def test_unary_predicate(self, value: Any) -> bool:
-
- if self.op == Operation.IS_NULL:
- return value is None
- elif self.op == Operation.NOT_NULL:
- return value is not None
- elif self.op == Operation.IS_NAN:
- return isnan(value)
- elif self.op == Operation.NOT_NAN:
- return not isnan(value)
- else:
- raise ValueError(f"{self.op} is not a valid unary predicate")
-
- def test_literal_predicate(self, value: Any) -> bool:
- if self.lit is None:
- raise ValidationException("Literal must not be none", ())
-
- if self.op == Operation.LT:
- return value < self.lit.value
- elif self.op == Operation.LT_EQ:
- return value <= self.lit.value
- elif self.op == Operation.GT:
- return value > self.lit.value
- elif self.op == Operation.GT_EQ:
- return value >= self.lit.value
- elif self.op == Operation.EQ:
- return value == self.lit.value
- elif self.op == Operation.NOT_EQ:
- return value != self.lit.value
- else:
- raise ValueError(f"{self.op} is not a valid literal predicate")
-
- def test_set_predicate(self, value: Any) -> bool:
- if self._literals is None:
- raise ValidationException("Literals must not be none", ())
-
- if self.op == Operation.IN:
- return value in self._literals
- elif self.op == Operation.NOT_IN:
- return value not in self._literals
- else:
- raise ValueError(f"{self.op} is not a valid set predicate")
-
-
-class UnboundPredicate(Predicate):
-
- def __init__(self, op, term, value=None, lit=None, values=None, literals=None):
- self._literals = None
- num_set_args = sum([1 for x in [value, lit, values, literals] if x is not None])
-
- if num_set_args > 1:
- raise ValueError(f"Only one of value={value}, lit={lit}, values={values}, literals={literals} may be set")
- super(UnboundPredicate, self).__init__(op, term)
- if isinstance(value, BaseLiteral):
- lit = value
- value = None
- if value is not None:
- self._literals = [Literals.from_(value)]
- elif lit is not None:
- self._literals = [lit]
- elif values is not None:
- self._literals = map(Literals.from_, values)
- elif literals is not None:
- self._literals = literals
-
- @property
- def literals(self):
- return self._literals
-
- @property
- def lit(self):
- if self.op in [Operation.IN, Operation.NOT_IN]:
- raise ValueError(f"{self.op} predicate cannot return a literal")
-
- return None if self.literals is None else self.literals[0]
-
- def negate(self):
- return UnboundPredicate(self.op.negate(), self.term, literals=self.literals)
-
- def bind(self, struct, case_sensitive=True):
- bound = self.term.bind(struct, case_sensitive=case_sensitive)
-
- if self.literals is None:
- return self.bind_unary_operation(bound)
- elif self.op in [Operation.IN, Operation.NOT_IN]:
- return self.bind_in_operation(bound)
-
- return self.bind_literal_operation(bound)
-
- def bind_unary_operation(self, bound_term: BoundTerm) -> BoundPredicate:
- from .expressions import Expressions
- if self.op == Operation.IS_NULL:
- if bound_term.ref.field.is_required:
- return Expressions.always_false()
- return BoundPredicate(Operation.IS_NULL, bound_term, is_unary_predicate=True)
- elif self.op == Operation.NOT_NULL:
- if bound_term.ref.field.is_required:
- return Expressions.always_true()
- return BoundPredicate(Operation.NOT_NULL, bound_term, is_unary_predicate=True)
- elif self.op in [Operation.IS_NAN, Operation.NOT_NAN]:
- if not self.floating_type(bound_term.ref.type.type_id):
- raise ValidationException(f"{self.op} cannot be used with a non-floating column", ())
- return BoundPredicate(self.op, bound_term, is_unary_predicate=True)
-
- raise ValidationException(f"Operation must be in [IS_NULL, NOT_NULL, IS_NAN, NOT_NAN] was:{self.op}", ())
-
- def bind_in_operation(self, bound_term):
- from .expressions import Expressions
-
- def convert_literal(lit):
- converted = lit.to(bound_term)
- ValidationException.check(converted is not None,
- "Invalid Value for conversion to type %s: %s (%s)",
- (bound_term.type, lit, lit.__class__.__name__))
- return converted
-
- converted_literals = filter(lambda x: x != Literals.above_max() and x != Literals.below_min(),
- [convert_literal(lit) for lit in self.literals])
- if len(converted_literals) == 0:
- return Expressions.always_true() if Operation.NOT_IN else Expressions.always_false()
- literal_set = set(converted_literals)
- if len(literal_set) == 1:
- if self.op == Operation.IN:
- return BoundPredicate(Operation.EQ, bound_term, literal_set[0])
- elif self.op == Operation.NOT_IN:
- return BoundPredicate(Operation.NOT_EQ, bound_term, literal_set[0])
- else:
- raise ValidationException("Operation must be in or not in", ())
-
- return BoundPredicate(self.op, bound_term, literals=literal_set, is_set_predicate=True)
-
- def bind_literal_operation(self, bound_term):
- from .expressions import Expressions
-
- lit = self.lit.to(bound_term.type)
- ValidationException.check(lit is not None,
- "Invalid Value for conversion to type %s: %s (%s)",
- (bound_term.type, self.lit, self.lit.__class__.__name__))
-
- if lit == Literals.above_max():
- if self.op in [Operation.LT, Operation.LT_EQ, Operation.NOT_EQ]:
- return Expressions.always_true()
- elif self.op in [Operation.GT, Operation.GT_EQ, Operation.EQ]:
- return Expressions.always_false()
- elif lit == Literals.below_min():
- if self.op in [Operation.LT, Operation.LT_EQ, Operation.NOT_EQ]:
- return Expressions.always_false()
- elif self.op in [Operation.GT, Operation.GT_EQ, Operation.EQ]:
- return Expressions.always_true()
-
- return BoundPredicate(self.op, bound_term, lit=lit, is_literal_predicate=True)
-
- @staticmethod
- def floating_type(type_id: TypeID) -> bool:
- return type_id in [TypeID.FLOAT, TypeID.DOUBLE]
diff --git a/python_legacy/iceberg/api/expressions/projections.py b/python_legacy/iceberg/api/expressions/projections.py
deleted file mode 100644
index 7521b42b65..0000000000
--- a/python_legacy/iceberg/api/expressions/projections.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-from .expressions import Expressions, ExpressionVisitors, RewriteNot
-from .predicate import BoundPredicate, UnboundPredicate
-
-
-def inclusive(spec, case_sensitive=True):
- return InclusiveProjection(spec, case_sensitive)
-
-
-def strict(spec):
- return StrictProjection(spec)
-
-
-class ProjectionEvaluator(ExpressionVisitors.ExpressionVisitor):
-
- def project(self, expr):
- raise NotImplementedError()
-
-
-class BaseProjectionEvaluator(ProjectionEvaluator):
-
- def __init__(self, spec, case_sensitive=True):
- self.spec = spec
- self.case_sensitive = case_sensitive
-
- def project(self, expr):
- # projections assume that there are no NOT nodes in the expression tree. to ensure that this
- # is the case, the expression is rewritten to push all NOT nodes down to the expression
- # leaf nodes.
- # this is necessary to ensure that the default expression returned when a predicate can't be
- # projected is correct.
- #
- return ExpressionVisitors.visit(ExpressionVisitors.visit(expr, RewriteNot.get()), self)
-
- def always_true(self):
- return Expressions.always_true()
-
- def always_false(self):
- return Expressions.always_false()
-
- def not_(self, result):
- raise RuntimeError("[BUG] project called on expression with a not")
-
- def and_(self, left_result, right_result):
- return Expressions.and_(left_result, right_result)
-
- def or_(self, left_result, right_result):
- return Expressions.or_(left_result, right_result)
-
- def predicate(self, pred):
- bound = pred.bind(self.spec.schema.as_struct(), case_sensitive=self.case_sensitive)
-
- if isinstance(bound, BoundPredicate):
- return self.predicate(bound)
-
- return bound
-
-
-class InclusiveProjection(BaseProjectionEvaluator):
-
- def __init__(self, spec, case_sensitive=True):
- super(InclusiveProjection, self).__init__(spec,
- case_sensitive=case_sensitive)
-
- def predicate(self, pred):
- if isinstance(pred, UnboundPredicate):
- return super(InclusiveProjection, self).predicate(pred)
-
- part = self.spec.get_field_by_source_id(pred.ref.field.field_id)
-
- if part is None:
- return self.always_true()
-
- result = part.transform.project(part.name, pred)
- if result is not None:
- return result
-
- return self.always_true()
-
-
-class StrictProjection(BaseProjectionEvaluator):
-
- def __init__(self, spec):
- super(StrictProjection, self).__init__(spec)
-
- def predicate(self, pred):
- part = self.spec.get_field_by_source_id(pred.ref.field.field_id)
-
- if part is None:
- return self.always_false()
-
- result = part.transform.project_strict(part.name, pred)
-
- if result is not None:
- return result
-
- return self.always_false()
diff --git a/python_legacy/iceberg/api/expressions/reference.py b/python_legacy/iceberg/api/expressions/reference.py
deleted file mode 100644
index 9637607e56..0000000000
--- a/python_legacy/iceberg/api/expressions/reference.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import annotations
-
-from typing import Any, TYPE_CHECKING
-
-from iceberg.exceptions import ValidationException
-
-from ..types import StructType
-
-if TYPE_CHECKING:
- from iceberg.api import StructLike
-
-
-class BoundReference:
-
- def __init__(self, struct, field):
- self.field = field
- self.pos = self.find(field.field_id, struct)
- self._type = struct.fields[self.pos].type
-
- @property
- def type(self):
- return self._type
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, BoundReference):
- return False
-
- return self.field.field_id == other.field.field_id and self.pos == other.pos and self._type == other._type
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def find(self, field_id, struct):
- fields = struct.fields
- for i, field in enumerate(fields):
- if field.field_id == self.field.field_id:
- return i
-
- raise ValidationException("Cannot find top-level field id %d in struct: %s", (field_id, struct))
-
- def get(self, struct):
- return struct.get(self.pos)
-
- def __str__(self):
- return "ref(id={id}, pos={pos}, type={_type})".format(id=self.field.field_id,
- pos=self.pos,
- _type=self._type)
-
- @property
- def ref(self):
- return self
-
- def eval(self, struct: StructLike) -> Any:
- return self.get(struct)
-
-
-class NamedReference:
-
- def __init__(self, name):
- super(NamedReference, self).__init__()
- if name is None:
- raise RuntimeError("Name cannot be null")
-
- self.name = name
-
- @property
- def ref(self):
- return self
-
- def bind(self, struct: StructType, case_sensitive: bool = True) -> BoundReference:
- from iceberg.api import Schema
- schema = Schema(struct.fields)
- field = schema.find_field(self.name) if case_sensitive else schema.case_insensitive_find_field(self.name)
-
- ValidationException.check(field is not None, "Cannot find field '%s' in struct: %s", (self.name,
- schema.as_struct()))
-
- return BoundReference(struct, field)
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, NamedReference):
- return False
-
- return self.name == other.name
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __repr__(self):
- return "NamedReference({})".format(self.name)
-
- def __str__(self):
- return 'ref(name="{}")'.format(self.name)
diff --git a/python_legacy/iceberg/api/expressions/residual_evaluator.py b/python_legacy/iceberg/api/expressions/residual_evaluator.py
deleted file mode 100644
index 285abae051..0000000000
--- a/python_legacy/iceberg/api/expressions/residual_evaluator.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .expressions import Expressions, ExpressionVisitors
-from .predicate import BoundPredicate, Predicate, UnboundPredicate
-
-
-class ResidualEvaluator(object):
-
- def __init__(self, spec, expr):
- self._spec = spec
- self._expr = expr
- self.__visitor = None
-
- def _visitor(self):
- if self.__visitor is None:
- self.__visitor = ResidualVisitor()
-
- return self.__visitor
-
- def residual_for(self, partition_data):
- return self._visitor().eval(partition_data)
-
-
-class ResidualVisitor(ExpressionVisitors.BoundExpressionVisitor):
-
- def __init__(self):
- self.struct = None
-
- def eval(self, struct):
- self.struct = struct
-
- def always_true(self):
- return Expressions.always_true()
-
- def always_false(self):
- return Expressions.always_false()
-
- def is_null(self, ref):
- return self.always_true() if ref.get(self.struct) is None else self.always_false()
-
- def not_null(self, ref):
- return self.always_true() if ref.get(self.struct) is not None else self.always_false()
-
- def lt(self, ref, lit):
- return self.always_true() if ref.get(self.struct) < lit.value else self.always_false()
-
- def lt_eq(self, ref, lit):
- return self.always_true() if ref.get(self.struct) <= lit.value else self.always_false()
-
- def gt(self, ref, lit):
- return self.always_true() if ref.get(self.struct) > lit.value else self.always_false()
-
- def gt_eq(self, ref, lit):
- return self.always_true() if ref.get(self.struct) >= lit.value else self.always_false()
-
- def eq(self, ref, lit):
- return self.always_true() if ref.get(self.struct) == lit.value else self.always_false()
-
- def not_eq(self, ref, lit):
- return self.always_true() if ref.get(self.struct) != lit.value else self.always_false()
-
- def not_(self, result):
- return Expressions.not_(result)
-
- def and_(self, left_result, right_result):
- return Expressions.and_(left_result, right_result)
-
- def or_(self, left_result, right_result):
- return Expressions.or_(left_result, right_result)
-
- def predicate(self, pred):
- if isinstance(pred, BoundPredicate):
- return self.bound_predicate(pred)
- elif isinstance(pred, UnboundPredicate):
- return self.unbound_predicate(pred)
-
- raise RuntimeError("Invalid predicate argument %s" % pred)
-
- def bound_predicate(self, pred):
- part = self.spec.get_field_by_source_id(pred.ref.field_id)
- if part is None:
- return pred
-
- strict_projection = part.transform.project_strict(part.name, pred)
- if strict_projection is None:
- bound = strict_projection.bind(self.spec.partition_type())
- if isinstance(bound, BoundPredicate):
- return super(ResidualVisitor, self).predicate(bound)
- return bound
-
- return pred
-
- def unbound_predicate(self, pred):
- bound = pred.bind(self.spec.schema.as_struct())
-
- if isinstance(bound, BoundPredicate):
- bound_residual = self.predicate(bound)
- if isinstance(bound_residual, Predicate):
- return pred
- return bound_residual
-
- return bound
diff --git a/python_legacy/iceberg/api/expressions/strict_metrics_evaluator.py b/python_legacy/iceberg/api/expressions/strict_metrics_evaluator.py
deleted file mode 100644
index 16b7728eed..0000000000
--- a/python_legacy/iceberg/api/expressions/strict_metrics_evaluator.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import threading
-
-from .expressions import Expressions, ExpressionVisitors
-from ..expressions.binder import Binder
-from ..types import Conversions
-
-
-class StrictMetricsEvaluator(object):
-
- def __init__(self, schema, unbound):
- self.schema = schema
- self.struct = schema.as_struct()
- self.expr = Binder.bind(self.struct, Expressions.rewrite_not(unbound))
- self.thread_local_data = threading.local()
-
- def _visitor(self):
- if not hasattr(self.thread_local_data, "visitors"):
- self.thread_local_data.visitors = StrictMetricsEvaluator.MetricsEvalVisitor(
- self.expr,
- self.schema,
- self.struct
- )
-
- return self.thread_local_data.visitors
-
- def eval(self, file):
- return self._visitor().eval(file)
-
- class MetricsEvalVisitor(ExpressionVisitors.BoundExpressionVisitor):
- ROWS_MUST_MATCH = True
- ROWS_MIGHT_NOT_MATCH = False
-
- def __init__(self, expr, schema, struct):
- self.expr = expr
- self.schema = schema
- self.struct = struct
- self.value_counts = None
- self.null_counts = None
- self.lower_bounds = None
- self.upper_bounds = None
-
- def eval(self, file):
- if file.record_count() <= 0:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- self.value_counts = file.value_counts()
- self.null_counts = file.null_value_counts()
- self.lower_bounds = file.lower_bounds()
- self.upper_bounds = file.upper_bounds()
-
- return ExpressionVisitors.visit(self.expr, self)
-
- def always_true(self):
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- def always_false(self):
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def not_(self, result):
- return not result
-
- def and_(self, left_result, right_result):
- return left_result and right_result
-
- def or_(self, left_result, right_result):
- return left_result or right_result
-
- def is_null(self, ref):
- id = ref.field.field_id
- if self.struct.field(id=id) is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.value_counts is not None and self.value_counts.get(id) is not None \
- and self.null_counts is not None and self.null_counts.get(id) is not None \
- and self.value_counts.get(id) - self.null_counts.get(id) == 0:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def not_null(self, ref):
- id = ref.field.field_id
- if self.struct.field(id=id) is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.null_counts is not None and self.null_counts.get(id, -1) == 0:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def lt(self, ref, lit):
- # Rows must match when: <----------Min----Max---X------->
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
- if upper < lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def lt_eq(self, ref, lit):
- # Rows must match when: <----------Min----Max---X------->
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
- if upper <= lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def gt(self, ref, lit):
- # Rows must match when: <-------X---Min----Max---------->
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower > lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def gt_eq(self, ref, lit):
- # Rows must match when: <-------X---Min----Max---------->
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower >= lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def eq(self, ref, lit):
- # Rows must match when Min == X == Max
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds \
- and self.upper_bounds is not None and id in self.upper_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower != lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
-
- if upper != lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def not_eq(self, ref, lit):
- # Rows must match when X < Min or Max < X because it is not in the range
- id = ref.field.field_id
-
- field = self.struct.field(id=id)
-
- if field is None:
- raise RuntimeError("Cannot filter by nested column: %s" % self.schema.find_field(id))
-
- if self.lower_bounds is not None and id in self.lower_bounds:
- lower = Conversions.from_byte_buffer(field.type, self.lower_bounds.get(id))
- if lower > lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- if self.upper_bounds is not None and id in self.upper_bounds:
- upper = Conversions.from_byte_buffer(field.type, self.upper_bounds.get(id))
-
- if upper < lit.value:
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MUST_MATCH
-
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def in_(self, ref, lit):
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
-
- def not_in(self, ref, lit):
- return StrictMetricsEvaluator.MetricsEvalVisitor.ROWS_MIGHT_NOT_MATCH
diff --git a/python_legacy/iceberg/api/expressions/term.py b/python_legacy/iceberg/api/expressions/term.py
deleted file mode 100644
index e135783a08..0000000000
--- a/python_legacy/iceberg/api/expressions/term.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import annotations
-
-from typing import Any, TYPE_CHECKING
-
-from iceberg.api.types import StructType
-
-if TYPE_CHECKING:
- from iceberg.api import StructLike
-
-
-class Bound(object):
- def eval(self, struct: StructLike) -> Any:
- """Evaluate current object against struct"""
-
-
-class Unbound(object):
- @property
- def ref(self):
- """the underlying reference"""
-
- def bind(self, struct: StructType, case_sensitive: bool = True) -> Bound:
- """bind the current object based on the given struct and return the Bound object"""
-
-
-class Term(object):
- @property
- def ref(self):
- """Expose the reference for this term"""
-
- @property
- def type(self):
- """accessor for term type """
-
-
-class BoundTerm(Bound, Term):
-
- @property
- def ref(self):
- raise NotImplementedError("Base class does not have implementation")
-
- @property
- def type(self):
- raise NotImplementedError("Base class does not have implementation")
-
- def eval(self, struct: StructLike):
- raise NotImplementedError("Base class does not have implementation")
-
-
-class UnboundTerm(Unbound, Term):
-
- @property
- def ref(self):
- raise NotImplementedError("Base class does not have implementation")
-
- def bind(self, struct: StructType, case_sensitive: bool = True):
- raise NotImplementedError("Base class does not have implementation")
diff --git a/python_legacy/iceberg/api/expressions/transform.py b/python_legacy/iceberg/api/expressions/transform.py
deleted file mode 100644
index cf2e4563f0..0000000000
--- a/python_legacy/iceberg/api/expressions/transform.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from .reference import BoundReference, NamedReference
-from .. import StructLike
-from ..transforms import Transform, Transforms
-from ..types import StructType
-from ...exceptions import ValidationException
-
-
-class BoundTransform:
-
- def __init__(self, ref: BoundReference, transform: Transform):
- self._ref = ref
- self._transform = transform
-
- @property
- def ref(self):
- return self._ref
-
- @property
- def type(self):
- return self.transform.get_result_type(self.ref.type)
-
- @property
- def transform(self):
- return self._transform
-
- def eval(self, struct: StructLike):
- return self.transform.apply(self.ref.eval(struct))
-
- def __str__(self):
- return f"{self.transform}({self.ref})"
-
-
-class UnboundTransform:
-
- def __init__(self, ref: NamedReference, transform: Transform):
- self._ref = ref
- self._transform = transform
-
- @property
- def ref(self):
- return self._ref
-
- @property
- def transform(self):
- return self._transform
-
- def bind(self, struct: StructType, case_sensitive: bool = True):
- bound_ref = self.ref.bind(struct, case_sensitive=case_sensitive)
-
- type_transform = Transforms.from_string(bound_ref.type, str(self.transform))
- ValidationException.check(type_transform.can_transform(bound_ref.type),
- "Cannot bind: %s cannot transform %s values from '%s'", (self.transform,
- bound_ref.type,
- self.ref.name))
-
- return BoundTransform(bound_ref, type_transform)
-
- def __str__(self):
- return f"{self.transform}({self.ref})"
diff --git a/python_legacy/iceberg/api/file_format.py b/python_legacy/iceberg/api/file_format.py
deleted file mode 100644
index a1e8998b66..0000000000
--- a/python_legacy/iceberg/api/file_format.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from enum import Enum, unique
-
-
-@unique
-class FileFormat(Enum):
- ORC = {"extension": "orc", "splittable": True}
- PARQUET = {"extension": "parquet", "splittable": True}
- AVRO = {"extension": "avro", "splittable": True}
-
- def add_extension(self, filename):
- if filename.endswith(self.value["extension"]):
- return filename
- else:
- return filename + "." + self.value["extension"]
-
- def is_splittable(self):
- return self.value["splittable"]
-
- @staticmethod
- def from_file_name(filename):
- last_index_of = filename.rfind('.')
- if last_index_of < 0:
- return None
- ext = filename[last_index_of + 1:]
- for fmt in FileFormat:
- if ext == fmt.value["extension"]:
- return fmt
diff --git a/python_legacy/iceberg/api/file_scan_task.py b/python_legacy/iceberg/api/file_scan_task.py
deleted file mode 100644
index f51103b38e..0000000000
--- a/python_legacy/iceberg/api/file_scan_task.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .scan_task import ScanTask
-
-
-class FileScanTask(ScanTask):
-
- @property
- def file(self):
- raise NotImplementedError()
-
- @property
- def spec(self):
- raise NotImplementedError()
-
- @property
- def start(self):
- raise NotImplementedError()
-
- @property
- def length(self):
- raise NotImplementedError()
-
- @property
- def residual(self):
- raise NotImplementedError()
-
- def is_file_scan_task(self):
- return True
-
- def as_file_scan_task(self):
- return self
diff --git a/python_legacy/iceberg/api/files.py b/python_legacy/iceberg/api/files.py
deleted file mode 100644
index 7c58032e8e..0000000000
--- a/python_legacy/iceberg/api/files.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import gzip
-import os
-
-from iceberg.exceptions import AlreadyExistsException
-
-from .expressions import JAVA_MAX_INT
-from .io import (InputFile,
- OutputFile,
- PositionOutputStream,
- SeekableInputStream)
-
-
-class Files(object):
-
- @staticmethod
- def local_output(file):
- return LocalOutputFile(file)
-
- @staticmethod
- def local_input(file):
- return LocalInputFile(file)
-
-
-class LocalOutputFile(OutputFile):
-
- def __init__(self, file):
- self.file = file
-
- def create(self):
- if os.path.exists(self.file):
- raise AlreadyExistsException("File already exists: %s" % self.file)
-
- return PositionOutputStream(open(self.file, "w"))
-
- def create_or_overwrite(self):
- if os.path.exists(self.file):
- if not os.remove(self.file.name):
- raise RuntimeError("File deleting file: %s" % self.file)
-
- return self.create()
-
- def location(self):
- return self.file
-
- def __str__(self):
- return self.location()
-
-
-class LocalInputFile(InputFile):
-
- def __init__(self, file):
- self.file = file
-
- def get_length(self):
- return os.path.getsize(self.file)
-
- def new_stream(self, gzipped=False):
- with open(self.file, "rb") as fo:
- if gzipped:
- fo = gzip.GzipFile(fileobj=fo)
- return fo
-
- def location(self):
- return self.file
-
- def __str__(self):
- return self.location()
-
-
-class SeekableFileInputStream(SeekableInputStream):
-
- def __init__(self, stream):
- self.stream = stream
-
- def get_pos(self):
- return self.stream.tell()
-
- def seek(self, new_pos):
- return self.stream.seek(new_pos)
-
- def read(self, b=None, off=None, read_len=None):
- if b is None and off is None and read_len is None:
- return None, self.stream.read()
- if b is None:
- raise RuntimeError("supplied byte field is None")
-
- if off is None and read_len is None:
- new_b = self.stream.read(b.length)
- return len(new_b), new_b
-
- if off is not None and read_len is None or off is None and read_len is not None:
- raise RuntimeError("Invalid args: read_len(%s), off(%s)" % (read_len, off))
-
- if read_len < 0 or off < 0 or (len(b) - off < read_len):
- raise RuntimeError("Invalid args: read_len(%s), off(%s), len_b_offset(%s)" % (read_len, off, len(b) - off))
-
- new_b = bytes(self.stream.read(read_len), "utf8")
-
- if off > 0:
- new_b = b[0:off] + new_b
- if off + read_len < len(b):
- new_b = new_b + b[off + read_len:]
- return read_len, new_b
-
- def skip(self, n):
- if n > JAVA_MAX_INT:
- return self.stream.seek(JAVA_MAX_INT)
- else:
- return self.stream.seek(n)
diff --git a/python_legacy/iceberg/api/filterable.py b/python_legacy/iceberg/api/filterable.py
deleted file mode 100644
index beb8d48767..0000000000
--- a/python_legacy/iceberg/api/filterable.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Filterable(object):
- ALL_COLUMNS = ("*",)
-
- def select(self, columns):
- raise NotImplementedError()
-
- def filter_partitions(self, expr):
- raise NotImplementedError()
-
- def filter_rows(self, expr):
- raise NotImplementedError()
-
- def iterator(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/filtered_snapshot.py b/python_legacy/iceberg/api/filtered_snapshot.py
deleted file mode 100644
index 055480a283..0000000000
--- a/python_legacy/iceberg/api/filtered_snapshot.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .expressions import Expressions
-from .filterable import Filterable
-
-
-class FilteredSnapshot(Filterable):
-
- def __init__(self, snapshot, part_filter, row_filter, columns):
- self.snapshot = snapshot
- self.part_filter = part_filter
- self.row_filter = row_filter
- self.columns = columns
-
- def select(self, columns):
- return FilteredSnapshot(self.snapshot, self.part_filter, self.row_filter, columns)
-
- def filter_partitions(self, expr):
- return FilteredSnapshot(self.snapshot, Expressions.and_(self.part_filter, expr), self.row_filter, self.columns)
-
- def filter_rows(self, expr):
- return FilteredSnapshot(self.snapshot, self.part_filter, Expressions.and_(self.row_filter, expr), self.columns)
-
- def iterator(self):
- return self.snapshot.iterator(self.part_filter, self.row_filter, self.columns)
diff --git a/python_legacy/iceberg/api/io/__init__.py b/python_legacy/iceberg/api/io/__init__.py
deleted file mode 100644
index 0fdac354b9..0000000000
--- a/python_legacy/iceberg/api/io/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-__all__ = ["CloseableGroup",
- "FileAppender",
- "InputFile",
- "OutputFile",
- "PositionOutputStream",
- "SeekableInputStream"]
-
-from .closeable_group import CloseableGroup
-from .file_appender import FileAppender
-from .input_file import InputFile
-from .output_file import OutputFile
-from .position_output_stream import PositionOutputStream
-from .seekable_input_stream import SeekableInputStream
diff --git a/python_legacy/iceberg/api/io/closeable_group.py b/python_legacy/iceberg/api/io/closeable_group.py
deleted file mode 100644
index 4abe1b2fd1..0000000000
--- a/python_legacy/iceberg/api/io/closeable_group.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class CloseableGroup(object):
-
- def __init__(self):
- self.closeables = list()
-
- def add_closeable(self, closeable):
- self.closeables.append(closeable)
-
- def close(self):
- while self.closeables:
- to_close = self.closeables.pop(0)
- if to_close is not None:
- to_close.close()
diff --git a/python_legacy/iceberg/api/io/closeable_iterable.py b/python_legacy/iceberg/api/io/closeable_iterable.py
deleted file mode 100644
index 2ea80effa1..0000000000
--- a/python_legacy/iceberg/api/io/closeable_iterable.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import collections
-
-
-class CloseableIterable(collections.Iterator):
-
- def __next__(self):
- raise NotImplementedError()
-
- def close(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/delegating_input_stream.py b/python_legacy/iceberg/api/io/delegating_input_stream.py
deleted file mode 100644
index 40ebe34dc2..0000000000
--- a/python_legacy/iceberg/api/io/delegating_input_stream.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class DelegatingInputStream(object):
-
- def get_delegate(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/delegating_output_stream.py b/python_legacy/iceberg/api/io/delegating_output_stream.py
deleted file mode 100644
index 84064d0f3b..0000000000
--- a/python_legacy/iceberg/api/io/delegating_output_stream.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class DelegatingOutputStream(object):
-
- def get_delegate(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/file_appender.py b/python_legacy/iceberg/api/io/file_appender.py
deleted file mode 100644
index fabb05d068..0000000000
--- a/python_legacy/iceberg/api/io/file_appender.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class FileAppender(object):
-
- def add(self, d):
- raise NotImplementedError()
-
- def add_all(self, values):
- for value in values:
- self.add(value)
-
- def metrics(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/input_file.py b/python_legacy/iceberg/api/io/input_file.py
deleted file mode 100644
index 08b68ad0a2..0000000000
--- a/python_legacy/iceberg/api/io/input_file.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from iceberg.core.filesystem import FileSystem
-
-
-class InputFile(object):
- fs: 'FileSystem'
- path: str
-
- def get_length(self):
- raise NotImplementedError()
-
- def new_stream(self):
- raise NotImplementedError()
-
- def location(self):
- raise NotImplementedError()
-
- def new_fo(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/output_file.py b/python_legacy/iceberg/api/io/output_file.py
deleted file mode 100644
index f091459b82..0000000000
--- a/python_legacy/iceberg/api/io/output_file.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class OutputFile(object):
-
- def create(self):
- raise NotImplementedError()
-
- def create_or_overwrite(self):
- raise NotImplementedError()
-
- def location(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/position_output_stream.py b/python_legacy/iceberg/api/io/position_output_stream.py
deleted file mode 100644
index fb1e758043..0000000000
--- a/python_legacy/iceberg/api/io/position_output_stream.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class PositionOutputStream(object):
-
- # OutputStream methods
- def close(self):
- raise NotImplementedError()
-
- def flush(self):
- raise NotImplementedError()
-
- def write(self, b, off=None, len=None):
- raise NotImplementedError()
-
- def get_pos(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/io/seekable_input_stream.py b/python_legacy/iceberg/api/io/seekable_input_stream.py
deleted file mode 100644
index 60808a8e38..0000000000
--- a/python_legacy/iceberg/api/io/seekable_input_stream.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class SeekableInputStream(object):
-
- def available(self):
- raise NotImplementedError()
-
- def close(self):
- raise NotImplementedError()
-
- def mark(self, read_limit):
- raise NotImplementedError()
-
- def mark_supported(self):
- raise NotImplementedError()
-
- def read(self, b=None, off=None, len=None):
- raise NotImplementedError()
-
- def reset(self):
- raise NotImplementedError()
-
- def skip(self, n):
- raise NotImplementedError()
-
- def get_pos(self):
- raise NotImplementedError()
-
- def seek(self, new_pos):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/manifest_file.py b/python_legacy/iceberg/api/manifest_file.py
deleted file mode 100644
index a4a0274da1..0000000000
--- a/python_legacy/iceberg/api/manifest_file.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .schema import Schema
-from .types import (BinaryType,
- BooleanType,
- IntegerType,
- ListType,
- LongType,
- NestedField,
- StringType,
- StructType)
-
-
-class ManifestFile(object):
- SCHEMA = Schema(NestedField.required(500, "manifest_path", StringType.get()),
- NestedField.required(501, "manifest_length", LongType.get()),
- NestedField.required(502, "partition_spec_id", IntegerType.get()),
- NestedField.optional(503, "added_snapshot_id", LongType.get()),
- NestedField.optional(504, "added_data_files_count", IntegerType.get()),
- NestedField.optional(505, "existing_data_files_count", IntegerType.get()),
- NestedField.optional(506, "deleted_data_files_count", IntegerType.get()),
- NestedField
- .optional(507, "partitions",
- ListType.of_required(508, StructType.of([NestedField.required(509,
- "contains_null",
- BooleanType.get()),
- NestedField.optional(510,
- "lower_bound",
- BinaryType.get()),
- NestedField.optional(511,
- "upper_bound",
- BinaryType.get())]))))
-
- @staticmethod
- def schema():
- return ManifestFile.SCHEMA
-
- @property
- def added_files_count(self):
- raise NotImplementedError()
-
- @property
- def existing_files_count(self):
- raise NotImplementedError()
-
- @property
- def deleted_files_count(self):
- raise NotImplementedError()
-
- def copy(self):
- raise NotImplementedError()
-
- def has_added_files(self):
- return self.added_files_count is None or self.added_files_count > 0
-
- def has_existing_files(self):
- return self.existing_files_count is None or self.existing_files_count > 0
-
- def has_deleted_files(self):
- return self.deleted_files_count is None or self.deleted_files_count > 0
-
-
-class PartitionFieldSummary(object):
- TYPE = ManifestFile.schema().find_type("partitions").as_list_type().element_type.as_struct_type()
-
- @staticmethod
- def get_type():
- return PartitionFieldSummary.TYPE
-
- def contains_null(self):
- raise NotImplementedError()
-
- def lower_bound(self):
- raise NotImplementedError()
-
- def upper_bound(self):
- raise NotImplementedError()
-
- def copy(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/metrics.py b/python_legacy/iceberg/api/metrics.py
deleted file mode 100644
index 231388a256..0000000000
--- a/python_legacy/iceberg/api/metrics.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Metrics(object):
-
- def __init__(self, row_count=None,
- column_sizes=None,
- value_counts=None, null_value_counts=None,
- lower_bounds=None, upper_bounds=None):
- self.row_count = row_count
- self.column_sizes = column_sizes
- self.value_counts = value_counts
- self.null_value_counts = null_value_counts
- self.lower_bounds = lower_bounds
- self.upper_bounds = upper_bounds
diff --git a/python_legacy/iceberg/api/overwrite_files.py b/python_legacy/iceberg/api/overwrite_files.py
deleted file mode 100644
index 3bcf0cec88..0000000000
--- a/python_legacy/iceberg/api/overwrite_files.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class OverwriteFiles(PendingUpdate):
-
- def overwrite_by_row_filter(self, expr):
- raise NotImplementedError()
-
- def add_file(self, file):
- raise NotImplementedError()
-
- def validate_added_files(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/partition_field.py b/python_legacy/iceberg/api/partition_field.py
deleted file mode 100644
index 8e789a1089..0000000000
--- a/python_legacy/iceberg/api/partition_field.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class PartitionField(object):
-
- def __init__(self, source_id, field_id, name, transform):
- self.source_id = source_id
- self.field_id = field_id
- self.name = name
- self.transform = transform
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, PartitionField):
- return False
-
- return self.source_id == other.source_id and self.field_id == other.field_id and \
- self.name == other.name and self.transform == other.transform
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return PartitionField.__class__, self.source_id, self.field_id, self.name, self.transform
diff --git a/python_legacy/iceberg/api/partition_spec.py b/python_legacy/iceberg/api/partition_spec.py
deleted file mode 100644
index e2f106d76b..0000000000
--- a/python_legacy/iceberg/api/partition_spec.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from urllib.parse import quote_plus
-
-from iceberg.exceptions import ValidationException
-
-from .partition_field import PartitionField
-from .schema import Schema
-from .transforms import Transform, Transforms
-from .types import (NestedField,
- StructType)
-
-
-class PartitionSpec(object):
-
- PARTITION_DATA_ID_START = 1000
-
- @staticmethod
- def UNPARTITIONED_SPEC():
- return PartitionSpec(Schema(), 0, [], PartitionSpec.PARTITION_DATA_ID_START - 1)
-
- @staticmethod
- def unpartitioned():
- return PartitionSpec.UNPARTITIONED_SPEC()
-
- def __init__(self, schema, spec_id, fields, last_assigned_field_id):
- self.fields_by_source_id = None
- self.fields_by_name = None
- self.__java_classes = None
- self.field_list = None
-
- self.schema = schema
- self.spec_id = spec_id
- self.__fields = list()
- for field in fields:
- self.__fields.append(field)
- self.last_assigned_field_id = last_assigned_field_id
-
- @property
- def fields(self):
- return self.lazy_field_list()
-
- @property
- def java_classes(self):
- if self.__java_classes is None:
- self.__java_classes
- for field in self.__fields:
- source_type = self.schema.find_type(field.source_id)
- result = field.transform().get_result_by_type(source_type)
- self.__java_classes.append(result.type_id.java_class())
-
- return self.__java_classes
-
- def get_field_by_source_id(self, field_id):
- return self.lazy_fields_by_source_id().get(field_id)
-
- def partition_type(self):
- struct_fields = list()
- for _i, field in enumerate(self.__fields):
- source_type = self.schema.find_type(field.source_id)
- result_type = field.transform.get_result_type(source_type)
- struct_fields.append(NestedField.optional(field.field_id,
- field.name,
- result_type))
-
- return StructType.of(struct_fields)
-
- def get(self, data, pos, java_class):
- data.get(pos, java_class)
-
- def escape(self, string):
- return quote_plus(string, encoding="UTF-8")
-
- def partition_to_path(self, data):
- sb = list()
- java_classes = self.java_classes
- for i, jclass in enumerate(java_classes):
- field = self.__fields[i]
- value_string = field.transform().to_human_string(self.get(data, i, jclass))
-
- if i > 0:
- sb.append("/")
- sb.append(field.name)
- sb.append("=")
- sb.append(self.escape(value_string))
-
- return "".join(sb)
-
- def compatible_with(self, other):
- if self.__eq__(other):
- return True
-
- if len(self.__fields) != len(other.__fields):
- return False
-
- for i, field in enumerate(self.__fields):
- that_field = other.__fields[i]
- if field.source_id != that_field.source_id or str(field.transform) != str(that_field.transform):
- return False
-
- return True
-
- def lazy_fields_by_source_id(self):
- if self.fields_by_source_id is None:
- self.fields_by_source_id = dict()
- for field in self.fields:
- self.fields_by_source_id[field.source_id] = field
-
- return self.fields_by_source_id
-
- def identity_source_ids(self):
- source_ids = set()
- fields = self.fields
- for field in fields:
- if "identity" == str(field.transform()):
- source_ids.add(field)
-
- return source_ids
-
- def lazy_field_list(self):
- if self.field_list is None:
- self.field_list = list(self.__fields)
-
- return self.field_list
-
- def lazy_fields_by_source_name(self):
- if self.fields_by_name is None:
- self.fields_by_name = dict()
- for field in self.__fields:
- self.fields_by_name[field.name] = field
-
- return self.fields_by_name
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, PartitionSpec):
- return False
-
- return self.__fields == other.__fields
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return PartitionSpec.__class__, tuple(self.fields)
-
- def __str__(self):
- return self.__repr__()
-
- def __repr__(self):
- sb = ["["]
-
- for field in self.__fields:
- sb.append("\n {field_id}: {name}: {transform}({source_id})".format(field_id=field.field_id,
- name=field.name,
- transform=str(field.transform),
- source_id=field.source_id))
-
- if len(self.__fields) > 0:
- sb.append("\n")
- sb.append("]")
-
- return "".join(sb)
-
- @staticmethod
- def builder_for(schema: Schema) -> "PartitionSpecBuilder":
- return PartitionSpecBuilder(schema)
-
- @staticmethod
- def check_compatibility(spec, schema):
- for field in spec.fields:
- src_type = schema.find_type(field.source_id)
- if not src_type.is_primitive_type():
- raise ValidationException("Cannot partition by non-primitive source field: %s", src_type)
- if not field.transform.can_transform(src_type):
- raise ValidationException("Invalid source type %s for transform: %s", (src_type, field.transform))
-
-
-class PartitionSpecBuilder(object):
-
- def __init__(self, schema):
- self.schema = schema
- self.fields = list()
- self.partition_names = set()
- self.dedup_fields = dict()
- self.spec_id = 0
- self.last_assigned_field_id = PartitionSpec.PARTITION_DATA_ID_START - 1
-
- def __next_field_id(self):
- self.last_assigned_field_id = self.last_assigned_field_id + 1
- return self.last_assigned_field_id
-
- def with_spec_id(self, spec_id):
- self.spec_id = spec_id
- return self
-
- def check_and_add_partition_name(self, name, source_column_id=None):
- schema_field = self.schema.find_field(name)
- if source_column_id is not None:
- if schema_field is not None and schema_field.field_id != source_column_id:
- raise ValueError("Cannot create identity partition sourced from different field in schema: %s" % name)
- else:
- if schema_field is not None:
- raise ValueError("Cannot create partition from name that exists in schema: %s" % name)
-
- if name is None or name == "":
- raise ValueError("Cannot use empty or null partition name: %s" % name)
- if name in self.partition_names:
- raise ValueError("Cannot use partition names more than once: %s" % name)
-
- self.partition_names.add(name)
- return self
-
- def check_redundant_and_add_field(self, field_id: int, name: str, transform: Transform) -> None:
- field = PartitionField(field_id,
- self.__next_field_id(),
- name,
- transform)
- dedup_key = (field.source_id, field.transform.dedup_name())
- partition_field = self.dedup_fields.get(dedup_key)
- if partition_field is not None:
- raise ValueError("Cannot add redundant partition: %s conflicts with %s" % (partition_field, field))
- self.dedup_fields[dedup_key] = field
- self.fields.append(field)
-
- def find_source_column(self, source_name):
- source_column = self.schema.find_field(source_name)
- if source_column is None:
- raise RuntimeError("Cannot find source column: %s" % source_name)
-
- return source_column
-
- def identity(self, source_name, target_name=None):
- if target_name is None:
- target_name = source_name
-
- source_column = self.find_source_column(source_name)
- self.check_and_add_partition_name(target_name, source_column.field_id)
- self.check_redundant_and_add_field(source_column.field_id,
- target_name,
- Transforms.identity(source_column.type))
- return self
-
- def year(self, source_name, target_name=None):
- if target_name is None:
- target_name = "{}_year".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.check_redundant_and_add_field(source_column.field_id,
- target_name,
- Transforms.year(source_column.type))
- return self
-
- def month(self, source_name, target_name=None):
- if target_name is None:
- target_name = "{}_month".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.check_redundant_and_add_field(source_column.field_id,
- target_name,
- Transforms.month(source_column.type))
- return self
-
- def day(self, source_name, target_name=None):
- if target_name is None:
- target_name = "{}_day".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.check_redundant_and_add_field(source_column.field_id,
- target_name,
- Transforms.day(source_column.type))
- return self
-
- def hour(self, source_name, target_name=None):
- if target_name is None:
- target_name = "{}_hour".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.check_redundant_and_add_field(source_column.field_id,
- target_name,
- Transforms.hour(source_column.type))
- return self
-
- def bucket(self, source_name, num_buckets, target_name=None):
- if target_name is None:
- target_name = "{}_bucket".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.fields.append(PartitionField(source_column.field_id,
- self.__next_field_id(),
- target_name,
- Transforms.bucket(source_column.type, num_buckets)))
- return self
-
- def truncate(self, source_name, width, target_name=None):
- if target_name is None:
- target_name = "{}_truncate".format(source_name)
-
- self.check_and_add_partition_name(target_name)
- source_column = self.find_source_column(source_name)
- self.fields.append(PartitionField(source_column.field_id,
- self.__next_field_id(),
- target_name,
- Transforms.truncate(source_column.type, width)))
- return self
-
- def add_without_field_id(self, source_id, name, transform):
- return self.add(source_id, self.__next_field_id(), name, transform)
-
- def add(self, source_id: int, field_id: int, name: str, transform: str) -> "PartitionSpecBuilder":
- column = self.schema.find_field(source_id)
- if column is None:
- raise ValueError("Cannot find source column: %s" % source_id)
-
- transform_obj = Transforms.from_string(column.type, transform)
- self.check_and_add_partition_name(name, source_id)
- self.fields.append(PartitionField(source_id,
- field_id,
- name,
- transform_obj))
- self.last_assigned_field_id = max(self.last_assigned_field_id, field_id)
- return self
-
- def build(self):
- spec = PartitionSpec(self.schema, self.spec_id, self.fields, self.last_assigned_field_id)
- PartitionSpec.check_compatibility(spec, self.schema)
- return spec
diff --git a/python_legacy/iceberg/api/pending_update.py b/python_legacy/iceberg/api/pending_update.py
deleted file mode 100644
index 6569895e90..0000000000
--- a/python_legacy/iceberg/api/pending_update.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class PendingUpdate(object):
-
- def apply(self):
- raise NotImplementedError()
-
- def commit(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/replace_partitions.py b/python_legacy/iceberg/api/replace_partitions.py
deleted file mode 100644
index ca7199daba..0000000000
--- a/python_legacy/iceberg/api/replace_partitions.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class ReplacePartitions(PendingUpdate):
-
- def __init__(self):
- raise NotImplementedError()
-
- def apply(self):
- raise NotImplementedError()
-
- def commit(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/rewrite_files.py b/python_legacy/iceberg/api/rewrite_files.py
deleted file mode 100644
index fd120985bc..0000000000
--- a/python_legacy/iceberg/api/rewrite_files.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class RewriteFiles(PendingUpdate):
-
- def rewrite_files(self, files_to_delete=None, files_to_add=None):
- raise NotImplementedError()
-
- def apply(self):
- raise NotImplementedError()
-
- def commit(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/rollback.py b/python_legacy/iceberg/api/rollback.py
deleted file mode 100644
index 5b19261238..0000000000
--- a/python_legacy/iceberg/api/rollback.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class Rollback(PendingUpdate):
-
- def to_snapshot_id(self, snapshot_id):
- raise NotImplementedError()
-
- def to_snapshot_at_time(self, timestamp_millis):
- raise NotImplementedError()
-
- def apply(self):
- raise NotImplementedError()
-
- def commit(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/scan_task.py b/python_legacy/iceberg/api/scan_task.py
deleted file mode 100644
index 7b13560def..0000000000
--- a/python_legacy/iceberg/api/scan_task.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class ScanTask(object):
-
- def is_file_scan_task(self):
- return False
-
- def as_file_scan_task(self):
- raise RuntimeError("Not a FileScanTask: %s" % self)
-
- def as_combined_scan_task(self):
- RuntimeError("Not a CombinedScanTask: %s" % self)
diff --git a/python_legacy/iceberg/api/schema.py b/python_legacy/iceberg/api/schema.py
deleted file mode 100644
index 50d99e73ab..0000000000
--- a/python_legacy/iceberg/api/schema.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .types import StructType
-from .types import type_util
-"""
- The schema of a data table.
-
-"""
-
-
-class Schema(object):
- NEWLINE = '\n'
- ALL_COLUMNS = "*"
-
- def __init__(self, *argv):
- aliases = None
- if len(argv) == 1 and isinstance(argv[0], (list, tuple)):
- columns = argv[0]
- elif len(argv) == 2 and isinstance(argv[0], list) and isinstance(argv[1], dict):
- columns = argv[0]
- aliases = argv[1]
- else:
- columns = argv
-
- self.struct = StructType.of(columns)
- self._alias_to_id = None
- self._id_to_alias = None
- if aliases is not None:
- self._alias_to_id = dict(aliases)
- self._id_to_alias = {v: k for k, v in self._alias_to_id.items()}
-
- self._id_to_field = None
- self._name_to_id = type_util.index_by_name(self.struct)
- self._id_to_name = {v: k for k, v in self._name_to_id.items()}
- self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
-
- def as_struct(self):
- return self.struct
-
- def get_aliases(self):
- return self._alias_to_id
-
- def lazy_id_to_field(self):
- from .types import index_by_id
- if self._id_to_field is None:
- self._id_to_field = index_by_id(self.struct) # noqa
-
- return self._id_to_field
-
- def lazy_name_to_id(self):
- from .types import index_by_name
- if self._name_to_id is None:
- self._name_to_id = index_by_name(self.struct)
- self._id_to_name = {v: k for k, v in self._name_to_id.items()}
- self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
-
- return self._name_to_id
-
- def lazy_lowercase_name_to_id(self):
- from .types import index_by_name
- if self._lowercase_name_to_id is None:
- self._name_to_id = index_by_name(self.struct)
- self._id_to_name = {v: k for k, v in self._name_to_id.items()}
- self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
-
- return self._lowercase_name_to_id
-
- def columns(self):
- return self.struct.fields
-
- def find_type(self, name):
- if isinstance(name, int):
- field = self.lazy_id_to_field().get(name)
- if field:
- return field.type
- elif isinstance(name, str):
- id = self.lazy_name_to_id().get(name)
- if id:
- return self.find_type(id)
- else:
- raise RuntimeError("Invalid Column (could not find): %s" % name)
-
- def find_field(self, id):
- if isinstance(id, int):
- return self.lazy_id_to_field().get(id)
-
- if not id:
- raise ValueError("Invalid Column Name (empty)")
-
- id = self.lazy_name_to_id().get(id)
- if id is not None:
- return self.lazy_id_to_field().get(id)
-
- def case_insensitive_find_field(self, name):
- if name is None:
- raise ValueError("Invalid Column Name (empty)")
-
- id = self.lazy_lowercase_name_to_id().get(name.lower())
- if id is not None:
- return self.lazy_id_to_field().get(id)
-
- return None
-
- def find_column_name(self, id):
- if isinstance(id, int):
- field = self.lazy_id_to_field().get(id)
-
- return None if field is None else field.name
-
- def alias_to_id(self, alias):
- if self._alias_to_id:
- return self._alias_to_id.get(alias)
-
- def id_to_alias(self, field_id):
- if self._id_to_alias:
- return self._id_to_alias.get(field_id)
-
- def select(self, cols):
- return self._internal_select(True, cols)
-
- def case_insensitive_select(self, cols):
- return self._internal_select(False, cols)
-
- def _internal_select(self, case_sensitive, cols):
- from .types import select
-
- if Schema.ALL_COLUMNS in cols:
- return self
-
- selected = set()
- for name in cols:
- if case_sensitive:
- field_id = self.lazy_name_to_id().get(name)
- else:
- field_id = self.lazy_lowercase_name_to_id().get(name.lower())
-
- if field_id is not None:
- selected.add(field_id)
-
- return select(self, selected)
-
- def __len__(self):
- return len(self.struct.fields)
-
- def __repr__(self):
- return "Schema(%s)" % ",".join([str(field) for field in self.struct.fields])
-
- def __str__(self):
- return "table {\n%s\n}" % Schema.NEWLINE.join([" " + str(field) for field in self.struct.fields])
diff --git a/python_legacy/iceberg/api/snapshot.py b/python_legacy/iceberg/api/snapshot.py
deleted file mode 100644
index 8a049e996e..0000000000
--- a/python_legacy/iceberg/api/snapshot.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Snapshot(object):
-
- @property
- def snapshot_id(self):
- raise NotImplementedError()
-
- @property
- def parent_id(self):
- raise NotImplementedError()
-
- @property
- def timestamp_millis(self):
- raise NotImplementedError()
-
- @property
- def manifests(self):
- raise NotImplementedError()
-
- @property
- def manifest_location(self):
- raise NotImplementedError()
-
- @property
- def summary(self):
- raise NotImplementedError()
-
- @property
- def operation(self):
- raise NotImplementedError()
-
- def added_files(self):
- raise NotImplementedError()
-
- def deleted_files(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/snapshot_iterable.py b/python_legacy/iceberg/api/snapshot_iterable.py
deleted file mode 100644
index ef8a0fb0c7..0000000000
--- a/python_legacy/iceberg/api/snapshot_iterable.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class SnapshotIterable(object):
- def iterator(self, part_filter, row_filter, columns):
- raise RuntimeError("Interface Implementation")
diff --git a/python_legacy/iceberg/api/struct_like.py b/python_legacy/iceberg/api/struct_like.py
deleted file mode 100644
index e421c2eb9f..0000000000
--- a/python_legacy/iceberg/api/struct_like.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class StructLike(object):
-
- def __init__(self):
- raise NotImplementedError()
-
- def get(self, pos):
- raise NotImplementedError()
-
- def set(self, pos, value):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/table.py b/python_legacy/iceberg/api/table.py
deleted file mode 100644
index 2152121a50..0000000000
--- a/python_legacy/iceberg/api/table.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Table(object):
-
- def __init__(self):
- raise NotImplementedError()
-
- def refresh(self):
- raise NotImplementedError()
-
- def new_scan(self):
- raise NotImplementedError()
-
- def schema(self):
- raise NotImplementedError()
-
- def spec(self):
- raise NotImplementedError()
-
- def properties(self):
- raise NotImplementedError()
-
- def location(self):
- raise NotImplementedError()
-
- def snapshots(self):
- raise NotImplementedError()
-
- def update_schema(self):
- raise NotImplementedError()
-
- def update_properties(self):
- raise NotImplementedError()
-
- def new_append(self):
- raise NotImplementedError()
-
- def new_fast_append(self):
- return self.new_append()
-
- def new_rewrite(self):
- raise NotImplementedError()
-
- def new_overwrite(self):
- raise NotImplementedError()
-
- def new_replace_partitions(self):
- raise NotImplementedError()
-
- def new_delete(self):
- raise NotImplementedError()
-
- def expire_snapshots(self):
- raise NotImplementedError()
-
- def rollback(self):
- raise NotImplementedError()
-
- def new_transaction(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/table_scan.py b/python_legacy/iceberg/api/table_scan.py
deleted file mode 100644
index d93d857a02..0000000000
--- a/python_legacy/iceberg/api/table_scan.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class TableScan(object):
-
- def __init__(self):
- raise NotImplementedError()
-
- @property
- def row_filter(self):
- raise NotImplementedError()
-
- def use_snapshot(self, snapshot_id):
- raise NotImplementedError()
-
- def as_of_time(self, timestamp_millis):
- raise NotImplementedError()
-
- def project(self, schema):
- raise NotImplementedError()
-
- def select(self, columns):
- raise NotImplementedError()
-
- def select_except(self, columns):
- raise NotImplementedError()
-
- def filter(self, expr):
- raise NotImplementedError()
-
- def plan_files(self):
- raise NotImplementedError()
-
- def plan_tasks(self):
- raise NotImplementedError()
-
- def is_case_sensitive(self):
- raise NotImplementedError()
-
- def options(self):
- raise NotImplementedError()
-
- def to_arrow_table(self):
- raise NotImplementedError()
-
- def to_pandas(self):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/tables.py b/python_legacy/iceberg/api/tables.py
deleted file mode 100644
index 0210437dfc..0000000000
--- a/python_legacy/iceberg/api/tables.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .partition_spec import PartitionSpec
-
-
-class Tables(object):
-
- def create(self, schema, table_identifier, spec=None, properties=None, location=None):
- raise NotImplementedError()
-
- def load(self, table_identifier):
- raise NotImplementedError()
-
- @staticmethod
- def default_args(spec=None, properties=None):
- spec = spec if spec is not None else PartitionSpec.unpartitioned()
- properties = properties if properties is not None else dict()
-
- return spec, properties
diff --git a/python_legacy/iceberg/api/transaction.py b/python_legacy/iceberg/api/transaction.py
deleted file mode 100644
index 1860c8af88..0000000000
--- a/python_legacy/iceberg/api/transaction.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Transaction(object):
-
- def table(self):
- raise NotImplementedError()
-
- def update_schema(self):
- raise NotImplementedError()
-
- def update_properties(self):
- raise NotImplementedError()
-
- def update_location(self):
- raise NotImplementedError()
-
- def new_append(self):
- raise NotImplementedError()
-
- def new_fast_append(self):
- raise NotImplementedError()
-
- def new_rewrite(self):
- raise NotImplementedError()
-
- def new_overwrite(self):
- raise NotImplementedError()
-
- def new_replace_partitions(self):
- raise NotImplementedError()
-
- def new_delete(self):
- raise RuntimeError("Interface implementation")
-
- def expire_snapshots(self):
- raise RuntimeError("Interface implementation")
-
- def commit_transaction(self):
- raise RuntimeError("Interface implementation")
diff --git a/python_legacy/iceberg/api/transforms/__init__.py b/python_legacy/iceberg/api/transforms/__init__.py
deleted file mode 100644
index c0f29d778e..0000000000
--- a/python_legacy/iceberg/api/transforms/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-__all__ = ["Bucket",
- "BucketByteBuffer",
- "BucketDecimal",
- "BucketDouble",
- "BucketFloat",
- "BucketInteger",
- "BucketLong",
- "BucketString",
- "BucketUUID",
- "Dates",
- "Identity",
- "Timestamps",
- "Transform",
- "Transforms",
- "Truncate"]
-
-from .bucket import (Bucket,
- BucketByteBuffer,
- BucketDecimal,
- BucketDouble,
- BucketFloat,
- BucketInteger,
- BucketLong,
- BucketString,
- BucketUUID)
-from .dates import Dates
-from .identity import Identity
-from .timestamps import Timestamps
-from .transform import Transform
-from .transforms import Transforms
-from .truncate import Truncate
diff --git a/python_legacy/iceberg/api/transforms/bucket.py b/python_legacy/iceberg/api/transforms/bucket.py
deleted file mode 100644
index 2884b81e17..0000000000
--- a/python_legacy/iceberg/api/transforms/bucket.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import math
-import struct
-import sys
-
-import mmh3
-
-from .transform import Transform
-from .transform_util import TransformUtil
-from ..expressions import (Expressions,
- JAVA_MAX_INT,
- Operation)
-from ..types.types import (IntegerType,
- TypeID)
-from ...api.types.conversions import Conversions
-
-
-class Bucket(Transform):
- MURMUR3 = mmh3
-
- BUCKET_TYPE = {TypeID.DATE: lambda n: BucketInteger(n),
- TypeID.INTEGER: lambda n: BucketInteger(n),
- TypeID.TIME: lambda n: BucketLong(n),
- TypeID.TIMESTAMP: lambda n: BucketLong(n),
- TypeID.LONG: lambda n: BucketLong(n),
- TypeID.DECIMAL: lambda n: BucketDecimal(n),
- TypeID.STRING: lambda n: BucketString(n),
- TypeID.FIXED: lambda n: BucketByteBuffer(n),
- TypeID.BINARY: lambda n: BucketByteBuffer(n),
- TypeID.UUID: lambda n: BucketUUID(n)}
-
- @staticmethod
- def get(type_var, n):
- bucket_type_func = Bucket.BUCKET_TYPE.get(type_var.type_id)
- if not bucket_type_func:
- raise RuntimeError("Cannot bucket by type: %s" % type_var)
- return bucket_type_func(n)
-
- def __init__(self, n):
- self.n = n
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- if other is None or not isinstance(other, Bucket):
- return False
-
- return self.n == other.n
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return Bucket.__class__, self.n
-
- def __repr__(self):
- return "Bucket[%s]" % self.n
-
- def __str__(self):
- return "bucket[%s]" % self.n
-
- def apply(self, value):
- return (self.hash(value) & JAVA_MAX_INT) % self.n
-
- def hash(self):
- raise NotImplementedError()
-
- def project(self, name, predicate):
- if predicate.op == Operation.EQ:
- return Expressions.predicate(predicate.op, name, self.apply(predicate.lit.value))
-
- def project_strict(self, name, predicate):
- if predicate.op == Operation.NOT_EQ:
- return Expressions.predicate(predicate.op, name, self.apply(predicate.lit.value))
-
- def get_result_type(self, source_type):
- return IntegerType.get()
-
-
-class BucketInteger(Bucket):
-
- def __init__(self, n):
- super(BucketInteger, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(struct.pack("q", value))
-
- def can_transform(self, type_var):
- return type_var.type_id in [TypeID.INTEGER, TypeID.DATE]
-
-
-class BucketLong(Bucket):
- def __init__(self, n):
- super(BucketLong, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(struct.pack("q", value))
-
- def can_transform(self, type_var):
- return type_var.type_id in [TypeID.LONG,
- TypeID.TIME,
- TypeID.TIMESTAMP]
-
-
-class BucketFloat(Bucket):
- def __init__(self, n):
- super(BucketFloat, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(struct.pack("d", value))
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.FLOAT
-
-
-class BucketDouble(Bucket):
- def __init__(self, n):
- super(BucketDouble, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(struct.pack("d", value))
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.DOUBLE
-
-
-class BucketDecimal(Bucket):
-
- def __init__(self, n):
- super(BucketDecimal, self).__init__(n)
-
- def hash(self, value):
- # to-do: unwrap to_bytes func since python2 support is being removed
- unscaled_value = TransformUtil.unscale_decimal(value)
- number_of_bytes = int(math.ceil(unscaled_value.bit_length() / 8))
- return Bucket.MURMUR3.hash(to_bytes(unscaled_value, number_of_bytes, byteorder='big'))
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.DECIMAL
-
-
-class BucketString(Bucket):
- def __init__(self, n):
- super(BucketString, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(value)
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.STRING
-
-
-class BucketByteBuffer(Bucket):
- def __init__(self, n):
- super(BucketByteBuffer, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(value)
-
- def can_transform(self, type_var):
- return type_var.type_id in [TypeID.BINARY, TypeID.FIXED]
-
-
-class BucketUUID(Bucket):
- def __init__(self, n):
- super(BucketUUID, self).__init__(n)
-
- def hash(self, value):
- return Bucket.MURMUR3.hash(Conversions.to_byte_buffer(TypeID.UUID, value))
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.UUID
-
-
-def to_bytes(n, length, byteorder='big'):
- if sys.version_info >= (3, 0):
- return n.to_bytes(length, byteorder=byteorder)
- h = '%x' % n
- s = ('0' * (len(h) % 2) + h).zfill(length * 2).decode('hex')
- return s if byteorder == 'big' else s[::-1]
diff --git a/python_legacy/iceberg/api/transforms/dates.py b/python_legacy/iceberg/api/transforms/dates.py
deleted file mode 100644
index dfc6b9a2cd..0000000000
--- a/python_legacy/iceberg/api/transforms/dates.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import datetime
-
-from .projection_util import ProjectionUtil
-from .transform import Transform
-from .transform_util import TransformUtil
-from ..expressions import (Expressions,
- Operation)
-from ..types.types import (IntegerType,
- TypeID)
-
-
-class Dates(Transform):
- YEAR = "year"
- MONTH = "month"
- DAY = "day"
-
- EPOCH = datetime.datetime.utcfromtimestamp(0)
- SECONDS_IN_DAY = 86400
-
- HUMAN_FUNCS = {"year": lambda x: TransformUtil.human_year(x),
- "month": lambda x: TransformUtil.human_month(x),
- "day": lambda x: TransformUtil.human_day(x)}
-
- def __init__(self, granularity, name):
- if granularity not in (Dates.YEAR, Dates.MONTH, Dates.DAY):
- raise RuntimeError("Invalid Granularity: %s" % granularity)
- self.granularity = granularity
- self.name = name
-
- def apply(self, days):
- if self.granularity == Dates.DAY:
- return days
- else:
- apply_func = getattr(TransformUtil, "diff_{}".format(self.granularity))
- return apply_func(datetime.datetime.utcfromtimestamp(days * Dates.SECONDS_IN_DAY), Dates.EPOCH)
-
- def can_transform(self, type):
- return type.type_id == TypeID.DATE
-
- def get_result_type(self, source_type):
- return IntegerType.get()
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, name)
-
- return ProjectionUtil.truncate_integer(name, predicate, self)
-
- def project_strict(self, name, predicate):
- return None
-
- def to_human_string(self, value):
- if value is None:
- return "null"
-
- return Dates.HUMAN_FUNCS[self.granularity](value)
-
- def __str__(self):
- return self.name
-
- def dedup_name(self):
- return "time"
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- if other is None or not isinstance(other, Dates):
- return False
-
- return self.granularity == other.granularity and self.name == other.name
diff --git a/python_legacy/iceberg/api/transforms/identity.py b/python_legacy/iceberg/api/transforms/identity.py
deleted file mode 100644
index f57524eb8a..0000000000
--- a/python_legacy/iceberg/api/transforms/identity.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .transform import Transform
-from .transform_util import TransformUtil
-from ..expressions import Expressions
-from ..types import TypeID
-
-
-class Identity(Transform):
-
- @staticmethod
- def get(type_var):
- return Identity(type_var)
-
- def __init__(self, type_var):
- self.type_var = type_var
-
- def apply(self, value):
- return value
-
- def can_transform(self, type_var):
- return type_var.is_primitive_type()
-
- def get_result_type(self, source_type):
- return source_type
-
- def project(self, name, predicate):
- return self.project_strict(name, predicate)
-
- def project_strict(self, name, predicate):
- if predicate.lit is not None:
- return Expressions.predicate(predicate.op, name, predicate.lit.value)
- else:
- return Expressions.predicate(predicate.op, name)
-
- def to_human_string(self, value):
- if value is None:
- return "null"
-
- if self.type_var.type_id == TypeID.DATE:
- return TransformUtil.human_day(value)
- elif self.type_var.type_id == TypeID.TIME:
- return TransformUtil.human_time(value)
- elif self.type_var.type_id == TypeID.TIMESTAMP:
- if self.type_var.adjust_to_utc:
- return TransformUtil.human_timestamp_with_timezone(value)
- else:
- return TransformUtil.human_timestamp_without_timezone(value)
- elif self.type_var.type_id in (TypeID.BINARY, TypeID.FIXED):
- raise NotImplementedError()
- # if isinstance(value, bytearray):
- # return base64.b64encode(value)
- # elif isinstance(value, bytes):
- # return base64.b64encode(bytes(value))
- # else:
- # raise RuntimeError("Unsupported binary type: %s" % value.__class__.__name__)
- else:
- return str(value)
-
- def __str__(self):
- return "identity"
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, Identity):
- return False
-
- return self.type_var == other.type_var
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return Identity.__class__, self.type_var
diff --git a/python_legacy/iceberg/api/transforms/projection_util.py b/python_legacy/iceberg/api/transforms/projection_util.py
deleted file mode 100644
index f9b0bb547e..0000000000
--- a/python_legacy/iceberg/api/transforms/projection_util.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import decimal
-
-from iceberg.api.expressions import Expressions, Operation
-
-
-class ProjectionUtil(object):
- @staticmethod
- def truncate_integer(name, pred, transform):
- boundary = pred.lit.value
- if pred.op == Operation.LT:
- return Expressions.predicate(Operation.LT_EQ, name, transform.apply(boundary - 1))
- elif pred.op == Operation.LT_EQ:
- return Expressions.predicate(Operation.LT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.GT:
- return Expressions.predicate(Operation.GT_EQ, name, transform.apply(boundary + 1))
- elif pred.op == Operation.GT_EQ:
- return Expressions.predicate(Operation.GT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.EQ:
- return Expressions.predicate(pred.op, name, transform.apply(boundary))
-
- def truncate_long(name, pred, transform):
- return ProjectionUtil.truncate_integer(name, pred, transform)
-
- def truncate_decimal(name, pred, transform):
- boundary = pred.lit.value
-
- if pred.op == Operation.LT:
- minus_one = boundary - decimal.Decimal(1)
- return Expressions.predicate(Operation.LT_EQ, name, transform.apply(minus_one))
- elif pred.op == Operation.LT_EQ:
- return Expressions.predicate(Operation.LT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.GT:
- plus_one = boundary + decimal.Decimal(1)
- return Expressions.predicate(Operation.GT_EQ, name, transform.apply(plus_one))
- elif pred.op == Operation.GT_EQ:
- return Expressions.predicate(Operation.GT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.EQ:
- return Expressions.predicate(pred.op, name, transform.apply(boundary))
-
- def truncate_array(name, pred, transform):
- boundary = pred.lit.value
-
- if pred.op == Operation.LT or pred.op == Operation.LT_EQ:
- return Expressions.predicate(Operation.LT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.GT or pred.op == Operation.GT_EQ:
- return Expressions.predicate(Operation.GT_EQ, name, transform.apply(boundary))
- elif pred.op == Operation.EQ:
- return Expressions.predicate(pred.op, name, transform.apply(boundary))
diff --git a/python_legacy/iceberg/api/transforms/timestamps.py b/python_legacy/iceberg/api/transforms/timestamps.py
deleted file mode 100644
index ca38a1c3be..0000000000
--- a/python_legacy/iceberg/api/transforms/timestamps.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import datetime
-
-from .transform import Transform
-from .transform_util import TransformUtil
-from ..expressions import (Expressions,
- Operation)
-from ..types.types import (IntegerType,
- TypeID)
-
-
-class Timestamps(Transform):
- YEAR = "year"
- MONTH = "month"
- DAY = "day"
- HOUR = "hour"
-
- EPOCH = datetime.datetime.utcfromtimestamp(0)
- HUMAN_FUNCS = {"year": lambda x: TransformUtil.human_year(x),
- "month": lambda x: TransformUtil.human_month(x),
- "day": lambda x: TransformUtil.human_day(x),
- "hour": lambda x: TransformUtil.human_hour(x)}
-
- def __init__(self, granularity, name):
- if granularity not in (Timestamps.YEAR, Timestamps.MONTH, Timestamps.DAY, Timestamps.HOUR):
- raise RuntimeError("Invalid Granularity: %s" % granularity)
-
- self.granularity = granularity
- self.name = name
-
- def apply(self, value):
- apply_func = getattr(TransformUtil, "diff_{}".format(self.granularity))
- return apply_func(datetime.datetime.utcfromtimestamp(value / 1000000), Timestamps.EPOCH)
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.TIMESTAMP
-
- def get_result_type(self, source_type):
- return IntegerType.get()
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, self.name)
-
- def project_strict(self, name, predicate):
- return None
-
- def to_human_string(self, value):
- if value is None:
- return "null"
-
- return Timestamps.HUMAN_FUNCS[self.granularity](value)
-
- def __str__(self):
- return self.name
-
- def dedup_name(self):
- return "time"
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- if other is None or not isinstance(other, Timestamps):
- return False
-
- return self.granularity == other.granularity and self.name == other.name
diff --git a/python_legacy/iceberg/api/transforms/transform.py b/python_legacy/iceberg/api/transforms/transform.py
deleted file mode 100644
index 0ccbf2e3c2..0000000000
--- a/python_legacy/iceberg/api/transforms/transform.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class Transform(object):
-
- def __init__(self):
- raise NotImplementedError()
-
- def apply(self, value):
- raise NotImplementedError()
-
- def can_transform(self, type_var):
- raise NotImplementedError()
-
- def get_result_type(self, source_type):
- raise NotImplementedError()
-
- def project(self, name, predicate):
- raise NotImplementedError()
-
- def project_strict(self, name, predicate):
- raise NotImplementedError()
-
- def to_human_string(self, value):
- return str(value)
-
- def dedup_name(self):
- return self.__str__()
diff --git a/python_legacy/iceberg/api/transforms/transform_util.py b/python_legacy/iceberg/api/transforms/transform_util.py
deleted file mode 100644
index 9042fcc40f..0000000000
--- a/python_legacy/iceberg/api/transforms/transform_util.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from datetime import datetime, timedelta
-
-import pytz
-
-
-class TransformUtil(object):
- EPOCH = datetime.utcfromtimestamp(0)
- EPOCH_YEAR = datetime.utcfromtimestamp(0).year
-
- @staticmethod
- def human_year(year_ordinal):
- return "{0:0=4d}".format(TransformUtil.EPOCH_YEAR + year_ordinal)
-
- @staticmethod
- def human_month(month_ordinal):
- return "{0:0=4d}-{1:0=2d}".format(TransformUtil.EPOCH_YEAR + int(month_ordinal / 12), 1 + int(month_ordinal % 12))
-
- @staticmethod
- def human_day(day_ordinal):
- day = TransformUtil.EPOCH + timedelta(days=day_ordinal)
- return "{0:0=4d}-{1:0=2d}-{2:0=2d}".format(day.year, day.month, day.day)
-
- @staticmethod
- def human_time(micros_from_midnight):
- day = TransformUtil.EPOCH + timedelta(microseconds=micros_from_midnight)
- return "{}".format(day.time())
-
- @staticmethod
- def human_timestamp_with_timezone(timestamp_micros):
- day = TransformUtil.EPOCH + timedelta(microseconds=timestamp_micros)
- return pytz.timezone("UTC").localize(day).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-
- @staticmethod
- def human_timestamp_without_timezone(timestamp_micros):
- day = TransformUtil.EPOCH + timedelta(microseconds=timestamp_micros)
- return day.isoformat()
-
- @staticmethod
- def human_hour(hour_ordinal):
- time = TransformUtil.EPOCH + timedelta(hours=hour_ordinal)
- return "{0:0=4d}-{1:0=2d}-{2:0=2d}-{3:0=2d}".format(time.year, time.month, time.day, time.hour)
-
- @staticmethod
- def base_64_encode(buffer):
- raise NotImplementedError()
-
- @staticmethod
- def diff_hour(date1, date2):
- return int((date1 - date2).total_seconds() / 3600)
-
- @staticmethod
- def diff_day(date1, date2):
- return (date1 - date2).days
-
- @staticmethod
- def diff_month(date1, date2):
- return (date1.year - date2.year) * 12 + (date1.month - date2.month) - (1 if date1.day < date2.day else 0)
-
- @staticmethod
- def diff_year(date1, date2):
- return (date1.year - date2.year) - \
- (1 if date1.month < date2.month or (date1.month == date2.month and date1.day < date2.day) else 0)
-
- @staticmethod
- def unscale_decimal(decimal_value):
- value_tuple = decimal_value.as_tuple()
- return int(("-" if value_tuple.sign else "") + "".join([str(d) for d in value_tuple.digits]))
diff --git a/python_legacy/iceberg/api/transforms/transforms.py b/python_legacy/iceberg/api/transforms/transforms.py
deleted file mode 100644
index 205a9de737..0000000000
--- a/python_legacy/iceberg/api/transforms/transforms.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-
-from .bucket import Bucket
-from .dates import Dates
-from .identity import Identity
-from .timestamps import Timestamps
-from .truncate import Truncate
-from .unknown_transform import UnknownTransform
-from .void_transform import VoidTransform
-from ..types import (TypeID)
-
-
-"""
- Factory methods for transforms.
- <p>
- Most users should create transforms using a
- {@link PartitionSpec.Builder#builderFor(Schema)} partition spec builder}.
-
- @see PartitionSpec#builderFor(Schema) The partition spec builder.
-"""
-
-
-class Transforms(object):
- HAS_WIDTH = re.compile("(\\w+)\\[(\\d+)\\]")
-
- def __init__(self):
- pass
-
- @staticmethod
- def from_string(type_var, transform):
- match = Transforms.HAS_WIDTH.match(transform)
-
- if match is not None:
- name = match.group(1)
- w = int(match.group(2))
- if name.lower() == "truncate":
- return Truncate.get(type_var, w)
- elif name.lower() == "bucket":
- return Bucket.get(type_var, w)
-
- if transform.lower() == "identity":
- return Identity.get(type_var)
- elif type_var.type_id == TypeID.TIMESTAMP:
- return Timestamps(transform.lower(), transform.lower())
- elif type_var.type_id == TypeID.DATE:
- return Dates(transform.lower(), transform.lower())
-
- if transform.lower() == "void":
- return VoidTransform.get()
-
- return UnknownTransform(type_var, transform)
-
- @staticmethod
- def identity(type_var):
- return Identity.get(type_var)
-
- @staticmethod
- def year(type_var):
- if type_var.type_id == TypeID.DATE:
- return Dates("year", "year")
- elif type_var.type_id == TypeID.TIMESTAMP:
- return Timestamps("year", "year")
- else:
- raise RuntimeError("Cannot partition type %s by year" % type_var)
-
- @staticmethod
- def month(type_var):
- if type_var.type_id == TypeID.DATE:
- return Dates("month", "month")
- elif type_var.type_id == TypeID.TIMESTAMP:
- return Timestamps("month", "month")
- else:
- raise RuntimeError("Cannot partition type %s by month" % type_var)
-
- @staticmethod
- def day(type_var):
- if type_var.type_id == TypeID.DATE:
- return Dates("day", "day")
- elif type_var.type_id == TypeID.TIMESTAMP:
- return Timestamps("day", "day")
- else:
- raise RuntimeError("Cannot partition type %s by day" % type_var)
-
- @staticmethod
- def hour(type_var):
- if type_var.type_id == TypeID.DATE:
- return Dates("hour", "hour")
- elif type_var.type_id == TypeID.TIMESTAMP:
- return Timestamps("hour", "hour")
- else:
- raise RuntimeError("Cannot partition type %s by hour" % type_var)
-
- @staticmethod
- def bucket(type_var, num_buckets):
- return Bucket.get(type_var, num_buckets)
-
- @staticmethod
- def truncate(type_var, width):
- return Truncate.get(type_var, width)
-
- @staticmethod
- def always_null():
- return VoidTransform.get()
diff --git a/python_legacy/iceberg/api/transforms/truncate.py b/python_legacy/iceberg/api/transforms/truncate.py
deleted file mode 100644
index 98cd85f810..0000000000
--- a/python_legacy/iceberg/api/transforms/truncate.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from decimal import Decimal
-
-from .projection_util import ProjectionUtil
-from .transform import Transform
-from .transform_util import TransformUtil
-from ..expressions import (Expressions,
- Operation)
-from ..types import TypeID
-
-
-class Truncate(Transform):
-
- @staticmethod
- def get(type_var, width):
- if type_var.type_id == TypeID.INTEGER:
- return TruncateInteger(width)
- elif type_var.type_id == TypeID.LONG:
- return TruncateLong(width)
- elif type_var.type_id == TypeID.DECIMAL:
- return TruncateDecimal(width)
- elif type_var.type_id == TypeID.STRING:
- return TruncateString(width)
-
- def __init__(self):
- raise NotImplementedError()
-
- def apply(self, value):
- raise NotImplementedError()
-
- def can_transform(self, type_var):
- raise NotImplementedError()
-
- def get_result_type(self, source_type):
- return source_type
-
- def project(self, name, predicate):
- raise NotImplementedError()
-
- def project_strict(self, name, predicate):
- raise NotImplementedError()
-
-
-class TruncateInteger(Truncate):
-
- def __init__(self, width):
- self.W = width
-
- def apply(self, value):
- return value - (((value % self.W) + self.W) % self.W)
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.INTEGER
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, name)
-
- return ProjectionUtil.truncate_integer(name, predicate, self)
-
- def project_strict(self, name, predicate):
- if predicate.op == Operation.LT:
- _in = predicate.lit.value - 1
- _out = predicate.lit.value
- in_image = self.apply(_in)
- out_image = self.apply(_out)
- if in_image != out_image:
- return Expressions.predicate(Operation.LT_EQ, name, in_image)
- else:
- return Expressions.predicate(Operation.LT, name, in_image)
- elif predicate.op == Operation.LT_EQ:
- _in = predicate.lit.value
- _out = predicate.lit.value + 1
- in_image = self.apply(_in)
- out_image = self.apply(_out)
- if in_image != out_image:
- return Expressions.predicate(Operation.LT_EQ, name, in_image)
- else:
- return Expressions.predicate(Operation.LT, name, in_image)
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, TruncateInteger):
- return False
-
- return self.W == other.W
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return TruncateInteger.__class__, self.W
-
- def __str__(self):
- return "truncate[%s]" % self.W
-
-
-class TruncateLong(Truncate):
-
- def __init__(self, width):
- self.W = width
-
- def apply(self, value):
- return value - (((value % self.W) + self.W) % self.W)
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.LONG
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, name)
-
- return ProjectionUtil.truncate_long(name, predicate, self)
-
- def project_strict(self, name, predicate):
- return None
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, TruncateLong):
- return False
-
- return self.W == other.W
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return TruncateLong.__class__, self.W
-
- def __str__(self):
- return "truncate[%s]" % self.W
-
-
-class TruncateDecimal(Truncate):
-
- def __init__(self, unscaled_width):
- self.unscaled_width = unscaled_width
-
- def apply(self, value):
- unscaled_value = TransformUtil.unscale_decimal(value)
- applied_value = unscaled_value - (((unscaled_value % self.unscaled_width) + self.unscaled_width) % self.unscaled_width)
- return Decimal("{}e{}".format(applied_value, value.as_tuple().exponent))
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.DECIMAL
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, name)
-
- return ProjectionUtil.truncate_decimal(name, predicate, self)
-
- def project_strict(self, name, predicate):
- return None
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, TruncateDecimal):
- return False
-
- return self.unscaled_width == other.unscaled_width
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return TruncateDecimal.__class__, self.unscaled_width
-
- def __str__(self):
- return "truncate[%s]" % self.unscaled_width
-
-
-class TruncateString(Truncate):
- def __init__(self, length):
- self.L = length
-
- def apply(self, value):
- return value[0:min(self.L, len(value))]
-
- def can_transform(self, type_var):
- return type_var.type_id == TypeID.STRING
-
- def project(self, name, predicate):
- if predicate.op == Operation.NOT_NULL or predicate.op == Operation.IS_NULL:
- return Expressions.predicate(predicate.op, name)
-
- return ProjectionUtil.truncate_array(name, predicate, self)
-
- def project_strict(self, name, predicate):
- return None
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
-
- if other is None or not isinstance(other, TruncateString):
- return False
-
- return self.L == other.L
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return TruncateString.__class__, self.L
-
- def __str__(self):
- return "truncate[%s]" % self.L
diff --git a/python_legacy/iceberg/api/transforms/unknown_transform.py b/python_legacy/iceberg/api/transforms/unknown_transform.py
deleted file mode 100644
index de326445de..0000000000
--- a/python_legacy/iceberg/api/transforms/unknown_transform.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import Union
-
-from iceberg.api.types import StringType, Type
-
-from .transform import Transform
-
-
-class UnknownTransform(Transform):
-
- def __init__(self, source_type: Type, transform: str):
- self.source_type = source_type
- self.transform = transform
-
- def apply(self, value):
- raise AttributeError(f"Cannot apply unsupported transform: {self.transform}")
-
- def can_transform(self, type_var) -> bool:
- # assume the transform function can be applied for this type because unknown transform is only used when parsing
- # a transform in an existing table. a different Iceberg version must have already validated it.
- return self.source_type == type_var
-
- def get_result_type(self, source_type):
- # the actual result type is not known
- return StringType.get()
-
- def project(self, name, predicate):
- return None
-
- def project_strict(self, name, predicate):
- return None
-
- def __str__(self):
- return self.transform
-
- def __eq__(self, other: Union['UnknownTransform', Transform, object]):
- if id(self) == id(other):
- return True
- elif not isinstance(other, UnknownTransform):
- return False
-
- return self.source_type == other.source_type and self.transform == other.transform
-
- def __hash__(self):
- return hash((self.source_type, self.transform))
diff --git a/python_legacy/iceberg/api/transforms/void_transform.py b/python_legacy/iceberg/api/transforms/void_transform.py
deleted file mode 100644
index ea859641ad..0000000000
--- a/python_legacy/iceberg/api/transforms/void_transform.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .transform import Transform
-
-
-class VoidTransform(Transform):
- _INSTANCE = None
-
- @staticmethod
- def get():
- if VoidTransform._INSTANCE is None:
- VoidTransform._INSTANCE = VoidTransform()
- return VoidTransform._INSTANCE
-
- def __init__(self):
- pass
-
- def apply(self, value):
- return None
-
- def can_transform(self, type_var):
- return True
-
- def get_result_type(self, source_type):
- return source_type
-
- def project(self, name, predicate):
- return None
-
- def project_strict(self, name, predicate):
- return None
-
- def to_human_string(self, value):
- return "null"
-
- def __str__(self):
- return "void"
diff --git a/python_legacy/iceberg/api/types/__init__.py b/python_legacy/iceberg/api/types/__init__.py
deleted file mode 100644
index cf3fe0167b..0000000000
--- a/python_legacy/iceberg/api/types/__init__.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-__all__ = ["AssignFreshIds",
- "assign_fresh_ids",
- "from_primitive_string",
- "get_projected_ids",
- "Conversions",
- "CustomOrderSchemaVisitor",
- "NestedType",
- "PrimitiveType",
- "Type",
- "TypeID",
- "BinaryType",
- "BooleanType",
- "DateType",
- "DecimalType",
- "DoubleType",
- "FixedType",
- "FloatType",
- "GetProjectedIds",
- "IntegerType",
- "IndexById",
- "IndexByName",
- "index_by_id",
- "index_by_name",
- "join",
- "ListType",
- "LongType",
- "MapType",
- "NestedField",
- "PruneColumns",
- "SchemaVisitor",
- "select",
- "select_not",
- "StringType",
- "StructType",
- "TimeType",
- "TimestampType",
- "UUIDType",
- "visit",
- "visit_custom_order",
- "VisitFieldFuture",
- "VisitFuture"
- ]
-
-import re
-
-from .conversions import Conversions
-from .type import (NestedType,
- PrimitiveType,
- Type,
- TypeID)
-from .type_util import (assign_fresh_ids,
- AssignFreshIds,
- CustomOrderSchemaVisitor,
- get_projected_ids,
- GetProjectedIds,
- index_by_id,
- index_by_name,
- IndexById,
- IndexByName,
- join,
- PruneColumns,
- SchemaVisitor,
- select,
- select_not,
- visit,
- visit_custom_order,
- VisitFieldFuture,
- VisitFuture)
-from .types import (BinaryType,
- BooleanType,
- DateType,
- DecimalType,
- DoubleType,
- FixedType,
- FloatType,
- IntegerType,
- ListType,
- LongType,
- MapType,
- NestedField,
- StringType,
- StructType,
- TimestampType,
- TimeType,
- UUIDType)
-
-TYPES = {str(BooleanType.get()): BooleanType.get(),
- str(IntegerType.get()): IntegerType.get(),
- str(LongType.get()): LongType.get(),
- str(FloatType.get()): FloatType.get(),
- str(DoubleType.get()): DoubleType.get(),
- str(DateType.get()): DateType.get(),
- str(TimeType.get()): TimeType.get(),
- str(TimestampType.with_timezone()): TimestampType.with_timezone(),
- str(TimestampType.without_timezone()): TimestampType.without_timezone(),
- str(StringType.get()): StringType.get(),
- str(UUIDType.get()): UUIDType.get(),
- str(BinaryType.get()): BinaryType.get()}
-
-FIXED = re.compile("fixed\\[(\\d+)\\]")
-DECIMAL = re.compile("decimal\\((\\d+),\\s+(\\d+)\\)")
-
-
-def from_primitive_string(type_string):
- lower_type_string = type_string.lower()
- if lower_type_string in TYPES.keys():
- return TYPES[lower_type_string]
-
- matches = FIXED.match(type_string)
- if matches:
- return FixedType.of_length(matches.group(1))
-
- matches = DECIMAL.match(type_string)
- if matches:
- return DecimalType.of(matches.group(1), matches.group(2))
-
- raise RuntimeError("Cannot parse type string to primitive: %s", type_string)
diff --git a/python_legacy/iceberg/api/types/conversions.py b/python_legacy/iceberg/api/types/conversions.py
deleted file mode 100644
index 4399f6d1e6..0000000000
--- a/python_legacy/iceberg/api/types/conversions.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from decimal import Decimal, ROUND_HALF_UP
-import struct
-import sys
-import uuid
-
-from .type import TypeID
-from .type_util import decimal_to_bytes
-
-
-class Conversions(object):
- HIVE_NULL = "__HIVE_DEFAULT_PARTITION__"
- value_mapping = {TypeID.BOOLEAN: lambda as_str: as_str.lower() == "true" if as_str is not None else False,
- TypeID.INTEGER: lambda as_str: int(float(as_str)),
- TypeID.LONG: lambda as_str: int(float(as_str)),
- TypeID.FLOAT: lambda as_str: float(as_str),
- TypeID.DOUBLE: lambda as_str: float(as_str),
- TypeID.STRING: lambda as_str: as_str,
- TypeID.UUID: lambda as_str: uuid.UUID(as_str),
- TypeID.FIXED: lambda as_str: bytearray(bytes(as_str, "UTF-8")
- if sys.version_info >= (3, 0)
- else bytes(as_str)),
- TypeID.BINARY: lambda as_str: bytes(as_str, "UTF-8") if sys.version_info >= (3, 0) else bytes(as_str),
- TypeID.DECIMAL: lambda as_str: Decimal(as_str),
- }
-
- to_byte_buff_mapping = {TypeID.BOOLEAN: lambda type_id, value: struct.pack("<?", 1 if value else 0),
- TypeID.INTEGER: lambda type_id, value: struct.pack("<i", value),
- TypeID.DATE: lambda type_id, value: struct.pack("<i", value),
- TypeID.LONG: lambda type_id, value: struct.pack("<q", value),
- TypeID.TIME: lambda type_id, value: struct.pack("<q", value),
- TypeID.TIMESTAMP: lambda type_id, value: struct.pack("<q", value),
- TypeID.FLOAT: lambda type_id, value: struct.pack("<f", value),
- TypeID.DOUBLE: lambda type_id, value: struct.pack("<d", value),
- TypeID.STRING: lambda type_id, value: value.encode('UTF-8'),
- TypeID.UUID: lambda type_id, value: struct.pack('>QQ', (value.int >> 64)
- & 0xFFFFFFFFFFFFFFFF, value.int
- & 0xFFFFFFFFFFFFFFFF),
- TypeID.FIXED: lambda type_id, value: value,
- TypeID.BINARY: lambda type_id, value: value,
- TypeID.DECIMAL: decimal_to_bytes
- }
-
- from_byte_buff_mapping = {TypeID.BOOLEAN: lambda type_var, value: struct.unpack('<?', value)[0] != 0,
- TypeID.INTEGER: lambda type_var, value: struct.unpack('<i', value)[0],
- TypeID.DATE: lambda type_var, value: struct.unpack('<i', value)[0],
- TypeID.LONG: lambda type_var, value: struct.unpack('<q', value)[0],
- TypeID.TIME: lambda type_var, value: struct.unpack('<q', value)[0],
- TypeID.TIMESTAMP: lambda type_var, value: struct.unpack('<q', value)[0],
- TypeID.FLOAT: lambda type_var, value: struct.unpack('<f', value)[0],
- TypeID.DOUBLE: lambda type_var, value: struct.unpack('<d', value)[0],
- TypeID.STRING: lambda type_var, value: bytes(value).decode("utf-8"),
- TypeID.UUID: lambda type_var, value:
- uuid.UUID(int=struct.unpack('>QQ', value)[0] << 64 | struct.unpack('>QQ', value)[1]),
- TypeID.FIXED: lambda type_var, value: value,
- TypeID.BINARY: lambda type_var, value: value,
- TypeID.DECIMAL: lambda type_var, value:
- Decimal(int.from_bytes(value, 'big', signed=True) * 10**-type_var.scale)
- .quantize(Decimal("." + "".join(["0" for i in range(1, type_var.scale)]) + "1"),
- rounding=ROUND_HALF_UP)
- }
-
- @staticmethod
- def from_partition_string(type_var, as_string):
- if as_string is None or Conversions.HIVE_NULL == as_string:
- return None
- part_func = Conversions.value_mapping.get(type_var.type_id)
- if part_func is None:
- raise RuntimeError(f"Unsupported type for from_partition_string: {type_var}")
-
- return part_func(as_string)
-
- @staticmethod
- def to_byte_buffer(type_id, value):
- try:
- return Conversions.to_byte_buff_mapping[type_id](type_id, value)
- except KeyError:
- raise TypeError(f"Cannot serialize type, no conversion mapping found for TypeID: {type_id}")
-
- @staticmethod
- def from_byte_buffer(type_var, buffer_var):
- try:
- return Conversions.from_byte_buff_mapping[type_var.type_id](type_var, buffer_var)
- except KeyError:
- raise TypeError(f"Cannot deserialize type, no conversion mapping found for TypeID: {type_var.type_id}")
diff --git a/python_legacy/iceberg/api/types/type.py b/python_legacy/iceberg/api/types/type.py
deleted file mode 100644
index 28f000e5f4..0000000000
--- a/python_legacy/iceberg/api/types/type.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from decimal import Decimal
-from enum import Enum, unique
-import uuid
-
-
-@unique
-class TypeID(Enum):
- BOOLEAN = {"java_class": "Boolean.class", "python_class": bool, "id": 1}
- INTEGER = {"java_class": "Integer.class", "python_class": int, "id": 2}
- LONG = {"java_class": "Long.class", "python_class": int, "id": 3}
- FLOAT = {"java_class": "Float.class", "python_class": float, "id": 4}
- DOUBLE = {"java_class": "Double.class", "python_class": float, "id": 5}
- DATE = {"java_class": "Integer.class", "python_class": int, "id": 6}
- TIME = {"java_class": "Long.class", "python_class": int, "id": 7}
- TIMESTAMP = {"java_class": "Long.class", "python_class": int, "id": 8}
- STRING = {"java_class": "CharSequence.class", "python_class": str, "id": 9}
- UUID = {"java_class": "java.util.UUID.class", "python_class": uuid.UUID, "id": 10}
- FIXED = {"java_class": "ByteBuffer.class", "python_class": bytes, "id": 11}
- BINARY = {"java_class": "ByteBuffer.class", "python_class": bytearray, "id": 12}
- DECIMAL = {"java_class": "BigDecimal.class", "python_class": Decimal, "id": 13}
- STRUCT = {"java_class": "Void.class", "python_class": None, "id": 14}
- LIST = {"java_class": "Void.class", "python_class": None, "id": 15}
- MAP = {"java_class": "Void.class", "python_class": None, "id": 16}
-
-
-class Type(object):
- length: int
- scale: int
- precision: int
-
- def __init__(self):
- pass
-
- def type_id(self):
- pass
-
- def is_primitive_type(self):
- return False
-
- def as_primitive_type(self):
- raise ValueError("Not a primitive type: " + self)
-
- def as_struct_type(self):
- raise ValueError("Not a struct type: " + self)
-
- def as_list_type(self):
- raise ValueError("Not a list type: " + self)
-
- def as_map_type(self):
- raise ValueError("Not a map type: " + self)
-
- def is_nested_type(self):
- return False
-
- def is_struct_type(self):
- return False
-
- def is_list_type(self):
- return False
-
- def is_map_type(self):
- return False
-
- def as_nested_type(self):
- raise ValueError("Not a nested type: " + self)
-
-
-class PrimitiveType(Type):
-
- def __eq__(self, other):
- return type(self) == type(other)
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def is_primitive_type(self):
- return True
-
- def as_primitive_type(self):
- return self
-
-
-class NestedType(Type):
-
- def __init__(self):
- super(NestedType, self).__init__()
-
- def is_nested_type(self):
- return True
-
- def as_nested_type(self):
- return self
-
- def fields(self):
- pass
-
- def field_type(self, name):
- pass
-
- def field(self, id):
- pass
diff --git a/python_legacy/iceberg/api/types/type_util.py b/python_legacy/iceberg/api/types/type_util.py
deleted file mode 100644
index 1407967d7a..0000000000
--- a/python_legacy/iceberg/api/types/type_util.py
+++ /dev/null
@@ -1,575 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from decimal import Decimal
-import math
-from typing import List
-
-from .type import (Type,
- TypeID)
-from .types import (ListType,
- MapType,
- NestedField,
- StructType)
-from ...exceptions import ValidationException
-
-MAX_PRECISION = list()
-REQUIRED_LENGTH = [-1 for item in range(40)]
-
-MAX_PRECISION.append(0)
-for i in range(1, 24):
- MAX_PRECISION.append(int(math.floor(math.log10(math.pow(2, 8 * i - 1) - 1))))
-
-for i in range(len(REQUIRED_LENGTH)):
- for j in range(len(MAX_PRECISION)):
- if i <= MAX_PRECISION[j]:
- REQUIRED_LENGTH[i] = j
- break
- if REQUIRED_LENGTH[i] < 0:
- raise RuntimeError("Could not find required length for precision %s" % i)
-
-
-def select(schema, field_ids):
- import iceberg.api.schema
- if schema is None:
- raise RuntimeError("Schema cannot be None")
- if field_ids is None:
- raise RuntimeError("Field ids cannot be None")
-
- result = visit(schema, PruneColumns(field_ids))
- if schema.as_struct() == result:
- return schema
- elif result is not None:
- if schema.get_aliases() is not None:
- return iceberg.api.schema.Schema(result.as_nested_type().fields, schema.get_aliases())
- else:
- return iceberg.api.schema.Schema(result.as_nested_type().fields)
-
- return iceberg.api.schema.Schema(list(), schema.get_aliases())
-
-
-def get_projected_ids(schema):
- import iceberg.api.schema
- if isinstance(schema, iceberg.api.schema.Schema):
- return visit(schema, GetProjectedIds())
- elif isinstance(schema, Type):
- if schema.is_primitive_type():
- return set()
-
- return set(visit(schema, GetProjectedIds))
-
- else:
- raise RuntimeError("Argument %s must be Schema or a Type" % schema)
-
-
-def select_not(schema, field_ids):
- projected_ids = get_projected_ids(schema)
- projected_ids.difference(field_ids)
-
- return select(schema, projected_ids)
-
-
-def join(left, right):
- import iceberg.api.schema
- return iceberg.api.schema.Schema(left + right)
-
-
-def index_by_name(struct):
- return visit(struct, IndexByName())
-
-
-def index_by_id(struct):
- return visit(struct, IndexById())
-
-
-def assign_fresh_ids(type_var, next_id):
- from ..schema import Schema
- if isinstance(type_var, Type):
- return visit(type_var, AssignFreshIds(next_id))
- elif isinstance(type_var, Schema):
- schema = type_var
- return Schema(list(visit(schema.as_struct(), AssignFreshIds(next_id))
- .as_nested_type().fields))
-
-
-def decimal_to_bytes(_, value):
- scale = abs(value.as_tuple().exponent)
- quantized_value = value.quantize(Decimal("10")**-scale)
- unscaled_value = int((quantized_value * 10**scale).to_integral_value())
- min_num_bytes = (unscaled_value.bit_length() + 7) // 8
- return unscaled_value.to_bytes(min_num_bytes, 'big', signed=True)
-
-
-def visit(arg, visitor): # noqa: ignore=C901
- from ..schema import Schema
- if isinstance(visitor, CustomOrderSchemaVisitor):
- return visit_custom_order(arg, visitor)
- elif isinstance(arg, Schema):
- return visitor.schema(arg, visit(arg.as_struct(), visitor))
- elif isinstance(arg, Type):
- type_var = arg
- if type_var.type_id == TypeID.STRUCT:
- struct = type_var.as_nested_type().as_struct_type()
- results = list()
- for field in struct.fields:
- visitor.field_ids.append(field.field_id)
- visitor.field_names.append(field.name)
- result = None
- try:
- result = visit(field.type, visitor)
- except NotImplementedError:
- # will remove it after missing functions are implemented.
- pass
- finally:
- visitor.field_ids.pop()
- visitor.field_names.pop()
- results.append(visitor.field(field, result))
- return visitor.struct(struct, results)
- elif type_var.type_id == TypeID.LIST:
- list_var = type_var.as_nested_type().as_list_type()
- visitor.field_ids.append(list_var.element_id)
- try:
- element_result = visit(list_var.element_type, visitor)
- except NotImplementedError:
- # will remove it after missing functions are implemented.
- pass
- finally:
- visitor.field_ids.pop()
-
- return visitor.list(list_var, element_result)
- elif type_var.type_id == TypeID.MAP:
- map_var = type_var.as_nested_type().as_map_type()
- visitor.field_ids.append(map_var.key_field.field_id)
- visitor.field_names.append(map_var.key_field.name)
- try:
- key_result = visit(map_var.key_type(), visitor)
- except NotImplementedError:
- # will remove it after missing functions are implemented.
- pass
- finally:
- visitor.field_ids.pop()
- visitor.field_names.pop()
-
- visitor.field_ids.append(map_var.value_field.field_id)
- visitor.field_names.append(map_var.value_field.name)
- try:
- value_result = visit(map_var.value_type(), visitor)
- except NotImplementedError:
- # will remove it after missing functions are implemented.
- pass
- finally:
- visitor.field_ids.pop()
- visitor.field_names.pop()
- return visitor.map(map_var, key_result, value_result)
- else:
- return visitor.primitive(arg.as_primitive_type())
- else:
- raise RuntimeError("Invalid type for arg: %s" % arg)
-
-
-def visit_custom_order(arg, visitor):
- from ..schema import Schema
- if isinstance(arg, Schema):
- schema = arg
- return visitor.schema(arg, VisitFuture(schema.as_struct(), visitor))
- elif isinstance(arg, Type):
- type_var = arg
- if type_var.type_id == TypeID.STRUCT:
- struct = type_var.as_nested_type().as_struct_type()
- results = list()
- fields = struct.fields
- for field in fields:
- results.append(VisitFieldFuture(field, visitor))
- struct = visitor.struct(struct, [x.get() for x in results])
- return struct
- elif type_var.type_id == TypeID.LIST:
- list_var = type_var.as_nested_type().as_list_type()
- return visitor.list(list_var, VisitFuture(list_var.element_type, visitor))
- elif type_var.type_id == TypeID.MAP:
- raise NotImplementedError()
-
- return visitor.primitive(type_var.as_primitive_type())
-
-
-class SchemaVisitor(object):
-
- def __init__(self):
- self.field_names = list()
- self.field_ids = list()
-
- def schema(self, schema, struct_result):
- return None
-
- def struct(self, struct, field_results):
- return None
-
- def field(self, field, field_result):
- return None
-
- def list(self, list_var, element_result):
- return None
-
- def map(self, map_var, key_result, value_result):
- return None
-
- def primitive(self, primitive_var):
- return None
-
-
-class CustomOrderSchemaVisitor(object):
- def __init__(self):
- super(CustomOrderSchemaVisitor, self).__init__()
-
- def schema(self, schema, struct_result):
- return None
-
- def struct(self, struct, field_results):
- return None
-
- def field(self, field, field_result):
- return None
-
- def list(self, list_var, element_result):
- return None
-
- def map(self, map_var, key_result, value_result):
- return None
-
- def primitive(self, primitive_var):
- return None
-
-
-class VisitFuture(object):
-
- def __init__(self, type, visitor):
- self.type = type
- self.visitor = visitor
-
- def get(self):
- return visit(self.type, self.visitor)
-
-
-class VisitFieldFuture(object):
-
- def __init__(self, field, visitor):
- self.field = field
- self.visitor = visitor
-
- def get(self):
- return self.visitor.field(self.field, VisitFuture(self.field.type, self.visitor).get)
-
-
-def decimal_required_bytes(precision):
- if precision < 0 or precision > 40:
- raise RuntimeError("Unsupported decimal precision: %s" % precision)
-
- return REQUIRED_LENGTH[precision]
-
-
-class GetProjectedIds(SchemaVisitor):
-
- def __init__(self):
- super(GetProjectedIds, self).__init__()
- self.field_ids = list()
-
- def schema(self, schema, struct_result):
- return self.field_ids
-
- def struct(self, struct, field_results):
- return self.field_ids
-
- def field(self, field, field_result):
- if field_result is None:
- self.field_ids.append(field.field_id)
-
- return self.field_ids
-
- def list(self, list_var, element_result):
- if element_result is None:
- for field in list_var.fields():
- self.field_ids.append(field.field_id)
-
- return self.field_ids
-
- def map(self, map_var, key_result, value_result):
- if value_result is None:
- for field in map_var.fields():
- self.field_ids.append(field.field_id)
-
- return self.field_ids
-
-
-class PruneColumns(SchemaVisitor):
-
- def __init__(self, selected):
- super(PruneColumns, self).__init__()
- self.selected = list(selected)
-
- def schema(self, schema, struct_result):
- return struct_result
-
- def struct(self, struct, field_results):
- fields = struct.fields
- selected_fields = list()
- same_types = True
-
- for i, projected_type in enumerate(field_results):
- field = fields[i]
- if projected_type is not None:
- if field.type == projected_type:
- selected_fields.append(field)
- elif projected_type is not None:
- same_types = False
- if field.is_optional:
- selected_fields.append(NestedField.optional(field.field_id,
- field.name,
- projected_type))
- else:
- selected_fields.append(NestedField.required(field.field_id,
- field.name,
- projected_type))
-
- if len(selected_fields) != 0:
- if len(selected_fields) == len(fields) and same_types:
- return struct
- else:
- return StructType.of(selected_fields)
-
- def field(self, field, field_result):
- if field.field_id in self.selected:
- return field.type
- elif field_result is not None:
- return field_result
-
- def primitive(self, primitive_var):
- return None
-
-
-class IndexByName(SchemaVisitor):
-
- DOT = "."
-
- def __init__(self):
- super(IndexByName, self).__init__()
- self.name_to_id = dict()
-
- def schema(self, schema, struct_result):
- return self.name_to_id
-
- def struct(self, struct, field_results):
- return self.name_to_id
-
- def field(self, field, field_result):
- self.add_field(field.name, field.field_id)
-
- def list(self, list_var, element_result):
- for field in list_var.fields():
- self.add_field(field.name, field.field_id)
-
- def map(self, map_var, key_result, value_result):
- for field in map_var.fields():
- self.add_field(field.name, field.field_id)
-
- def add_field(self, name, field_id):
- full_name = name
- if self.field_names is not None and len(self.field_names) > 0:
- full_name = IndexByName.DOT.join([IndexByName.DOT.join(self.field_names), name])
-
- existing_field_id = self.name_to_id.get(full_name)
- ValidationException.check(existing_field_id is None, "Invalid schema: multiple fields for name %s: %s and %s",
- (full_name, existing_field_id, field_id))
-
- self.name_to_id[full_name] = field_id
-
-
-class IndexById(SchemaVisitor):
-
- def __init__(self):
- super(IndexById, self).__init__()
- self.index = dict()
-
- def schema(self, schema, struct_result):
- return self.index
-
- def struct(self, struct, field_results):
- return self.index
-
- def field(self, field, field_result):
- self.index[field.field_id] = field
-
- def list(self, list_var, element_result):
- for field in list_var.fields():
- self.index[field.field_id] = field
-
- def map(self, map_var, key_result, value_result):
- for field in map_var.fields():
- self.index[field.field_id] = field
-
-
-class NextID(object):
- def __init__(self):
- raise NotImplementedError()
-
- def get(self):
- raise NotImplementedError()
-
-
-class AssignFreshIds(CustomOrderSchemaVisitor):
- def __init__(self, next_id):
- super(AssignFreshIds, self).__init__()
- self.next_id = next_id
-
- def schema(self, schema, struct_result):
- return self.next_id()
-
- def struct(self, struct, field_results):
- fields = struct.fields
- length = len(struct.fields)
- new_ids = list()
-
- for _ in range(length):
- new_ids.append(self.next_id())
-
- new_fields = list()
- types = iter(field_results)
- for i in range(length):
- field = fields[i]
- type = next(types)
- if field.is_optional:
- new_fields.append(NestedField.optional(new_ids[i], field.name, type))
- else:
- new_fields.append(NestedField.required(new_ids[i], field.name, type))
-
- return StructType.of(new_fields)
-
- def field(self, field, field_result):
- return field_result()
-
- def list(self, list_var, element_result):
- new_id = self.next_id()
- if list_var.is_element_optional():
- return ListType.of_optional(new_id, element_result.get())
- else:
- return ListType.of_required(new_id, element_result.get())
-
- def map(self, map_var, key_result, value_result):
- new_key_id = self.next_id()
- new_value_id = self.next_id()
-
- if map_var.is_value_optional():
- return MapType.of_optional(new_key_id, new_value_id, key_result(), value_result())
- else:
- return MapType.of_required(new_key_id, new_value_id, key_result(), value_result())
-
- def primitive(self, primitive_var):
- return primitive_var
-
-
-class CheckCompatibility(CustomOrderSchemaVisitor):
-
- @staticmethod
- def write_compatibility_errors(read_schema, write_schema):
- visit(read_schema, CheckCompatibility(write_schema, True))
-
- @staticmethod
- def read_compatibility_errors(read_schema, write_schema):
- visit(write_schema, CheckCompatibility(read_schema, False))
-
- NO_ERRORS: List[str] = []
-
- def __init__(self, schema, check_ordering):
- self.schema = schema
- self.check_ordering = check_ordering
- self.current_type = None
-
- def schema(self, schema, struct_result):
- self.current_type = self.schema.as_struct()
- try:
- struct_result.get()
- finally:
- self.current_type = None
-
- def struct(self, struct, field_results):
- if struct is None:
- raise RuntimeError("Evaluation must start with a schema.")
-
- if not self.current_type.is_struct_type():
- return [": %s cannot be read as a struct" % self.current_type]
-
- errors = []
-
- for field_errors in field_results:
- errors = errors + field_errors
-
- if self.check_ordering:
- new_struct = self.current_type.as_struct_type()
- id_to_ord = {}
- for i, val in enumerate(new_struct.fields):
- id_to_ord[val.field_id] = i
-
- last_ordinal = -1
-
- for read_field in self.struct.fields:
- id_var = read_field.field_id
-
- field = struct.field(id=id_var)
- if field is not None:
- ordinal = id_to_ord[id]
- if last_ordinal >= ordinal:
- errors.append("%s is out of order before %s" % (read_field.name,
- new_struct.fields[last_ordinal].name))
- last_ordinal = ordinal
-
- return errors
-
- def field(self, field, field_result) -> List[str]:
- struct = self.current_type.as_struct_type()
- curr_field = struct.field(field.field_id)
- errors = []
-
- if curr_field is None:
- if not field.is_optional:
- errors.append("{} is required, but is missing".format(field.name))
- return self.NO_ERRORS
-
- self.current_type = curr_field.type
-
- try:
- if not field.is_optional and curr_field.is_optional:
- errors.append(field.name + " should be required, but is optional")
-
- for error in field_result:
- if error.startswith(":"):
- errors.append("{}{}".format(field.field_name, error))
- else:
- errors.append("{}.{}".format(field.field_name, error))
-
- return errors
- except RuntimeError:
- pass
- finally:
- self.current_type = struct
- return errors
-
- def list(self, list_var, element_result):
- raise NotImplementedError()
-
- def map(self, map_var, key_result, value_result):
- raise NotImplementedError()
-
- def primitive(self, primitive_var):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/types/types.py b/python_legacy/iceberg/api/types/types.py
deleted file mode 100644
index 2bc8d5bde7..0000000000
--- a/python_legacy/iceberg/api/types/types.py
+++ /dev/null
@@ -1,732 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .type import (NestedType,
- PrimitiveType,
- TypeID)
-
-
-class BooleanType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if BooleanType.__instance is None:
- BooleanType()
- return BooleanType.__instance
-
- def __init__(self):
- if BooleanType.__instance is not None:
- raise Exception("Multiple Boolean Types created")
- BooleanType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.BOOLEAN
-
- def __repr__(self):
- return "boolean"
-
- def __str__(self):
- return "boolean"
-
-
-class IntegerType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if IntegerType.__instance is None:
- IntegerType()
- return IntegerType.__instance
-
- def __init__(self):
- if IntegerType.__instance is not None:
- raise Exception("Multiple Integer Types created")
- IntegerType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.INTEGER
-
- def __repr__(self):
- return "int"
-
- def __str__(self):
- return "int"
-
-
-class LongType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if LongType.__instance is None:
- LongType()
- return LongType.__instance
-
- def __init__(self):
- if LongType.__instance is not None:
- raise Exception("Multiple Long Types created")
- LongType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.LONG
-
- def __repr__(self):
- return "long"
-
- def __str__(self):
- return "long"
-
-
-class FloatType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if FloatType.__instance is None:
- FloatType()
- return FloatType.__instance
-
- def __init__(self):
- if FloatType.__instance is not None:
- raise Exception("Multiple Float Types created")
- FloatType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.FLOAT
-
- def __repr__(self):
- return "float"
-
- def __str__(self):
- return "float"
-
-
-class DoubleType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if DoubleType.__instance is None:
- DoubleType()
- return DoubleType.__instance
-
- def __init__(self):
- if DoubleType.__instance is not None:
- raise Exception("Multiple Double Types created")
- DoubleType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.DOUBLE
-
- def __repr__(self):
- return "double"
-
- def __str__(self):
- return "double"
-
-
-class DateType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if DateType.__instance is None:
- DateType()
- return DateType.__instance
-
- def __init__(self):
- if DateType.__instance is not None:
- raise Exception("Multiple Date Types created")
- DateType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.DATE
-
- def __repr__(self):
- return "date"
-
- def __str__(self):
- return "date"
-
-
-class TimeType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if TimeType.__instance is None:
- TimeType()
- return TimeType.__instance
-
- def __init__(self):
- if TimeType.__instance is not None:
- raise Exception("Multiple Time Types created")
- TimeType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.TIME
-
- def __repr__(self):
- return "time"
-
- def __str__(self):
- return "time"
-
-
-class TimestampType(PrimitiveType):
- __instance_with_tz = None
- __instance_without_tz = None
-
- @staticmethod
- def with_timezone():
- if not TimestampType.__instance_with_tz:
- TimestampType()
- return TimestampType.__instance_with_tz
-
- @staticmethod
- def without_timezone():
- if not TimestampType.__instance_without_tz:
- TimestampType(False)
- return TimestampType.__instance_without_tz
-
- def __init__(self, with_timezone=True):
- self.adjust_to_utc = with_timezone
- if (with_timezone and TimestampType.__instance_with_tz is not None)\
- or (not with_timezone and TimestampType.__instance_without_tz is not None):
- raise Exception("Multiple Timestamp Types created")
-
- if with_timezone:
- TimestampType.__instance_with_tz = self
- else:
- TimestampType.__instance_without_tz = self
-
- @property
- def type_id(self):
- return TypeID.TIMESTAMP
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, TimestampType):
- return False
-
- return self.adjust_to_utc == other.adjust_to_utc
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return TimestampType.__class__, self.adjust_to_utc
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __repr__(self):
- if self.adjust_to_utc:
- return "timestamptz"
- else:
- return "timestamp"
-
- def __str__(self):
- return self.__repr__()
-
-
-class StringType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if StringType.__instance is None:
- StringType()
- return StringType.__instance
-
- def __init__(self):
- if StringType.__instance is not None:
- raise Exception("Multiple String Types created")
- StringType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.STRING
-
- def __repr__(self):
- return "string"
-
- def __str__(self):
- return "string"
-
-
-class UUIDType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if UUIDType.__instance is None:
- UUIDType()
- return UUIDType.__instance
-
- def __init__(self):
- if UUIDType.__instance is not None:
- raise Exception("Multiple UUID Types created")
- UUIDType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.UUID
-
- def __repr__(self):
- return "uuid"
-
- def __str__(self):
- return "uuid"
-
-
-class FixedType(PrimitiveType):
-
- @staticmethod
- def of_length(length):
- return FixedType(length)
-
- def __init__(self, length):
- self._length = length
-
- @property
- def length(self):
- return self._length
-
- @property
- def type_id(self):
- return TypeID.FIXED
-
- def __hash__(self):
- return hash(self.__key())
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, FixedType):
- return False
-
- return self.length == other.length
-
- def __key(self):
- return FixedType.__class__, self.length
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __repr__(self):
- return "fixed[%s]" % (self.length)
-
- def __str__(self):
- return self.__repr__()
-
-
-class BinaryType(PrimitiveType):
- __instance = None
-
- @staticmethod
- def get():
- if BinaryType.__instance is None:
- BinaryType()
- return BinaryType.__instance
-
- def __init__(self):
- if BinaryType.__instance is not None:
- raise Exception("Multiple Binary Types created")
- BinaryType.__instance = self
-
- @property
- def type_id(self):
- return TypeID.BINARY
-
- def __repr__(self):
- return "binary"
-
- def __str__(self):
- return "binary"
-
-
-class DecimalType(PrimitiveType):
-
- @staticmethod
- def of(precision, scale):
- return DecimalType(precision, scale)
-
- def __init__(self, precision, scale):
- if int(precision) > 38:
- raise RuntimeError("Decimals with precision larger than 38 are not supported: %s", precision)
- self.precision = int(precision)
- self.scale = int(scale)
-
- @property
- def type_id(self):
- return TypeID.DECIMAL
-
- def __repr__(self):
- return "decimal(%s, %s)" % (self.precision, self.scale)
-
- def __str__(self):
- return self.__repr__()
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, DecimalType):
- return False
-
- return self.precision == other.precision and self.scale == other.scale
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return DecimalType.__class__, self.precision, self.scale
-
-
-class NestedField():
- length: int
-
- @staticmethod
- def optional(id, name, type_var, doc=None):
- return NestedField(True, id, name, type_var, doc=doc)
-
- @staticmethod
- def required(id, name, type, doc=None):
- return NestedField(False, id, name, type, doc=doc)
-
- def __init__(self, is_optional, id, name, type, doc=None):
- self.is_optional = is_optional
- self.id = id
- self.name = name
- self.type = type
- self.doc = doc
-
- @property
- def is_required(self):
- return not self.is_optional
-
- @property
- def field_id(self):
- return self.id
-
- def __repr__(self):
- return "%s: %s: %s %s(%s)" % (self.id,
- self.name,
- "optional" if self.is_optional else "required",
- self.type,
- self.doc)
-
- def __str__(self):
- return self.__repr__()
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, NestedField):
- return False
-
- return self.is_optional == other.is_optional \
- and self.id == other.id \
- and self.name == other.name and self.type == other.type \
- and self.doc == other.doc
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- type_name = self.type.type_id.name
- return NestedField.__class__, self.is_optional, self.id, self.name, self.doc, type_name
-
-
-class StructType(NestedType):
- FIELD_SEP = ", "
-
- @staticmethod
- def of(fields):
- return StructType(fields)
-
- def __init__(self, fields):
- if fields is None:
- raise RuntimeError("Field list cannot be None")
-
- self._fields = list()
- for i in range(0, len(fields)):
- self._fields.append(fields[i])
-
- self._fieldList = None
- self._fieldsByName = None
- self._fieldsByLowercaseName = None
- self._fieldsById = None
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, StructType):
- return False
-
- return self._fields == other._fields
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- @property
- def fields(self):
- return self._lazy_field_list()
-
- def field(self, name=None, id=None):
- if name:
- return self._lazy_fields_by_name().get(name)
- elif id:
- return self._lazy_fields_by_id()[id]
-
- raise RuntimeError("No valid field info passed in ")
-
- def case_insensitive_field(self, name):
- return self._lazy_fields_by_lowercase_name().get(name)
-
- @property
- def type_id(self):
- return TypeID.STRUCT
-
- def is_struct_type(self):
- return True
-
- def as_struct_type(self):
- return self
-
- def __str__(self):
- return "struct<{}>".format(StructType.FIELD_SEP.join(str(x) for x in self.fields))
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return StructType.__class__, self.fields
-
- def _lazy_field_list(self):
- if self._fieldList is None:
- self._fieldList = tuple(self._fields)
- return self._fieldList
-
- def _lazy_fields_by_name(self):
- if self._fieldsByName is None:
- self.index_fields()
- return self._fieldsByName
-
- def _lazy_fields_by_lowercase_name(self):
- if self._fieldsByName is None:
- self.index_fields()
- return self._fieldsByName
-
- def _lazy_fields_by_id(self):
- if self._fieldsById is None:
- self.index_fields()
- return self._fieldsById
-
- def index_fields(self):
- self._fieldsByName = dict()
- self._fieldsByLowercaseName = dict()
- self._fieldsById = dict()
-
- for field in self.fields:
- self._fieldsByName[field.name] = field
- self._fieldsByLowercaseName[field.name.lower()] = field
- self._fieldsById[field.id] = field
-
-
-class ListType(NestedType):
- @staticmethod
- def of_optional(element_id, element_type):
- if element_type is None:
- raise RuntimeError("Element type cannot be null")
- return ListType(NestedField.optional(element_id, "element", element_type))
-
- @staticmethod
- def of_required(element_id, element_type):
- if element_type is None:
- raise RuntimeError("Element type cannot be null")
- return ListType(NestedField.required(element_id, "element", element_type))
-
- def __init__(self, element_field):
- self.element_field = element_field
- self._fields = None
-
- @property
- def type_id(self):
- return TypeID.LIST
-
- @property
- def element_type(self):
- return self.element_field.type
-
- def field_type(self, name):
- if "element" == name:
- return self.element_type
-
- def field(self, id):
- if self.element_field.id == id:
- return self.element_field
-
- def fields(self):
- return self._lazyFieldsList()
-
- @property
- def element_id(self):
- return self.element_field.id
-
- def is_element_required(self):
- return not self.element_field.is_optional
-
- def is_element_optional(self):
- return self.element_field.is_optional
-
- def is_list_type(self):
- return True
-
- def as_list_type(self):
- return self
-
- def __str__(self):
- return "list<%s>" % self.element_field.type
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, ListType):
- return False
-
- return self.element_field == other.element_field
-
- def __ne__(self, other):
- return self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return StructType.__class__, self.element_field
-
- def _lazyFieldsList(self):
- if self._fields is None:
- self._fields = [self.element_field]
-
- return self._fields
-
-
-class MapType(NestedType):
-
- @staticmethod
- def of_optional(key_id, value_id, key_type, value_type):
- if value_type is None:
- raise RuntimeError("Value type cannot be null")
-
- return MapType(NestedField.required(key_id, 'key', key_type),
- NestedField.optional(value_id, 'value', value_type))
-
- @staticmethod
- def of_required(key_id, value_id, key_type, value_type):
- if value_type is None:
- raise RuntimeError("Value type cannot be null")
-
- return MapType(NestedField.required(key_id, 'key', key_type),
- NestedField.required(value_id, 'value', value_type))
-
- def __init__(self, key_field, value_field):
- self.key_field = key_field
- self.value_field = value_field
- self._fields = None
-
- @property
- def type_id(self):
- return TypeID.MAP
-
- def key_type(self):
- return self.key_field.type
-
- def value_type(self):
- return self.value_field.type
-
- def field_type(self, name):
- if "key" == name:
- return self.key_field.type
- elif "value" == name:
- return self.value_field.type
-
- def field(self, id):
- if self.key_field.id == id:
- return self.key_field
- elif self.value_field.id == id:
- return self.value_field
-
- def fields(self):
- return self._lazy_field_list()
-
- def key_id(self):
- return self.key_field.field_id
-
- def value_id(self):
- return self.value_field.field_id
-
- def as_map_type(self):
- return self
-
- def is_map_type(self):
- return True
-
- def is_value_optional(self):
- return self.value_field.is_optional
-
- def is_value_required(self):
- return not self.is_value_optional()
-
- def __str__(self):
- return "map<%s, %s>" % (self.key_field.type, self.value_field.type)
-
- def __eq__(self, other):
- if id(self) == id(other):
- return True
- elif other is None or not isinstance(other, MapType):
- return False
-
- return self.key_field == other.key_field and self.value_field == other.value_field
-
- def __ne__(self, other):
- return not self.__eq__(other)
-
- def __hash__(self):
- return hash(self.__key())
-
- def __key(self):
- return MapType.__class__, self.key_field, self.value_field
-
- def _lazy_field_list(self):
- return (self.key_field, self.value_field)
diff --git a/python_legacy/iceberg/api/update_properties.py b/python_legacy/iceberg/api/update_properties.py
deleted file mode 100644
index 763bb06bcd..0000000000
--- a/python_legacy/iceberg/api/update_properties.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class UpdateProperties(PendingUpdate):
-
- def set(self, key, value):
- raise NotImplementedError()
-
- def remove(self, key):
- raise NotImplementedError()
-
- def default_format(self, file_format):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/api/update_schema.py b/python_legacy/iceberg/api/update_schema.py
deleted file mode 100644
index 22ef05e64a..0000000000
--- a/python_legacy/iceberg/api/update_schema.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .pending_update import PendingUpdate
-
-
-class UpdateSchema(PendingUpdate):
-
- def add_column(self, name, type, parent=None):
- raise NotImplementedError()
-
- def rename_column(self, name, new_name):
- raise NotImplementedError()
-
- def update_column(self, name, new_type):
- raise NotImplementedError()
-
- def delete_column(self, name):
- raise NotImplementedError()
diff --git a/python_legacy/iceberg/core/__init__.py b/python_legacy/iceberg/core/__init__.py
deleted file mode 100644
index d6079a5816..0000000000
--- a/python_legacy/iceberg/core/__init__.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# flake8: noqa
-
-__all__ = ["BaseMetastoreTableOperations",
- "BaseMetastoreTables",
- "BaseSnapshot",
- "BaseTable",
- "ConfigProperties",
- "DataFiles",
- "GenericDataFile",
- "GenericManifestFile",
- "ManifestEntry",
- "ManifestListWriter",
- "ManifestReader",
- "PartitionSpecParser",
- "PartitionData",
- "SchemaParser",
- "SchemaUpdate",
- "SnapshotParser",
- "SnapshotLogEntry",
- "TableMetadata",
- "TableMetadataParser",
- "TableOperations",
- "TableProperties"]
-
-from .table_operations import TableOperations # out of order import to avoid circular deps
-from .base_metastore_table_operations import BaseMetastoreTableOperations
-from .base_metastore_tables import BaseMetastoreTables
-from .base_snapshot import BaseSnapshot
-from .base_table import BaseTable
-from .config_properties import ConfigProperties
-from .data_files import DataFiles
-from .generic_data_file import GenericDataFile
-from .generic_manifest_file import GenericManifestFile
-from .manifest_entry import ManifestEntry
-from .manifest_list_writer import ManifestListWriter
-from .manifest_reader import ManifestReader
-from .partition_data import PartitionData
-from .partition_spec_parser import PartitionSpecParser
-from .schema_parser import SchemaParser
-from .schema_update import SchemaUpdate
-from .snapshot_parser import SnapshotParser
-from .table_metadata import (SnapshotLogEntry,
- TableMetadata)
-from .table_metadata_parser import TableMetadataParser
-from .table_properties import TableProperties
diff --git a/python_legacy/iceberg/core/avro/__init__.py b/python_legacy/iceberg/core/avro/__init__.py
deleted file mode 100644
index 3d07a5b387..0000000000
--- a/python_legacy/iceberg/core/avro/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-__all__ = ["AvroToIceberg", "IcebergToAvro"]
-
-from .avro_to_iceberg import AvroToIceberg
-from .iceberg_to_avro import IcebergToAvro
diff --git a/python_legacy/iceberg/core/avro/avro_schema_util.py b/python_legacy/iceberg/core/avro/avro_schema_util.py
deleted file mode 100644
index f513a9cf8c..0000000000
--- a/python_legacy/iceberg/core/avro/avro_schema_util.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-class AvroSchemaUtil(object):
- FIELD_ID_PROP = "field-id"
- KEY_ID_PROP = "key-id"
- VALUE_ID_PROP = "value-id"
- ELEMENT_ID_PROP = "element-id"
- ADJUST_TO_UTC_PROP = "adjust-to_utc"
-
- # NULL = PrimitiveSchema(NULL)
-
- @staticmethod
- def convert(iceberg_schema=None, avro_schema=None, table_name=None, names=None,
- type_var=None, name=None):
- if iceberg_schema is not None and table_name is not None:
- return AvroSchemaUtil.convert(iceberg_schema=iceberg_schema,
- names={iceberg_schema.as_struct(): table_name})
- elif iceberg_schema is not None and names is not None:
- raise RuntimeError("Not yet implemented")
diff --git a/python_legacy/iceberg/core/avro/avro_to_iceberg.py b/python_legacy/iceberg/core/avro/avro_to_iceberg.py
deleted file mode 100644
index 14dc9ce9d4..0000000000
--- a/python_legacy/iceberg/core/avro/avro_to_iceberg.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import fastavro
-from iceberg.api import Schema
-from iceberg.api.types import (BinaryType,
- BooleanType,
- DateType,
- DoubleType,
- FloatType,
- IntegerType,
- ListType,
- LongType,
- MapType,
- NestedField,
- StringType,
- StructType,
- TimestampType,
- TimeType,
- TypeID)
-
-
-class AvroToIceberg(object):
- FIELD_ID_PROP = "field-id"
- FIELD_TYPE_PROP = "type"
- FIELD_NAME_PROP = "name"
- FIELD_LOGICAL_TYPE_PROP = "logicalType"
- FIELD_FIELDS_PROP = "fields"
- FIELD_ITEMS_PROP = "items"
- FIELD_ELEMENT_ID_PROP = "element-id"
-
- AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
- AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")
-
- TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
- dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
- list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}
-
- COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
- "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
- "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}
-
- PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
- "bytes": BinaryType.get(),
- "date": DateType.get(),
- "double": DoubleType.get(),
- "float": FloatType.get(),
- "int": IntegerType.get(),
- "long": LongType.get(),
- "string": StringType.get(),
- "time-millis": TimeType.get(),
- "timestamp-millis": TimestampType.without_timezone()}
-
- PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
- TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
- TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}
-
- @staticmethod
- def convert_avro_schema_to_iceberg(avro_schema):
- if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
- raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)
-
- struct = AvroToIceberg.convert_type(avro_schema, None)
-
- return Schema(struct[0].fields)
-
- @staticmethod
- def convert_record_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
-
- if avro_field_type != "record":
- raise RuntimeError("Field type muse be 'record': %s" % avro_field_type)
-
- fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
-
- iceberg_fields = []
- if next_id is None:
- next_id = len(fields)
- for field in fields:
- iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
- iceberg_fields.append(iceberg_field)
-
- return StructType.of(iceberg_fields), next_id
-
- @staticmethod
- def convert_avro_field_to_iceberg(field, next_id):
- field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)
-
- if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
- return field_type, next_id
-
- if is_optional:
- return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
- field.get(AvroToIceberg.FIELD_NAME_PROP),
- field_type), next_id
- else:
- return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
- field.get(AvroToIceberg.FIELD_NAME_PROP),
- field_type), next_id
-
- @staticmethod
- def convert_type(field, next_id=None):
- avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)
-
- optional = AvroToIceberg.is_option_schema(avro_field_type)
-
- processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
- if processing_func is None:
- raise RuntimeError("No function found to process %s" % avro_field_type)
-
- iceberg_type, next_id = processing_func(field, next_id)
-
- return iceberg_type, optional, next_id
-
- @staticmethod
- def convert_str_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
- logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
- if not isinstance(avro_field_type, str):
- raise RuntimeError("Field type must be of type str: %s" % avro_field_type)
-
- if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
- if logical_type is not None:
- return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(logical_type), next_id
- else:
- return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(avro_field_type), next_id
-
- elif avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
- if logical_type is not None:
- processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(logical_type)
- else:
- processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(avro_field_type)
-
- if processing_func is None:
- raise RuntimeError("No function found to process %s" % avro_field_type)
-
- return processing_func(avro_field, next_id)
- else:
- raise RuntimeError("Unknown type %s" % avro_field_type)
-
- @staticmethod
- def convert_complex_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
- if not isinstance(avro_field_type, dict):
- raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)
-
- return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)
-
- @staticmethod
- def convert_union_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
- if not isinstance(avro_field_type, list):
- raise RuntimeError("Union field type must be of type list: %s" % avro_field_type)
-
- if len(avro_field_type) > 2:
- raise RuntimeError("Cannot process unions larger than 2 items: %s" % avro_field_type)
- for item in avro_field_type:
- if isinstance(item, str) and item == "null":
- continue
- avro_field_type = item
- avro_field[AvroToIceberg.FIELD_TYPE_PROP] = avro_field_type
- items = AvroToIceberg.convert_type(avro_field, next_id)
- return items[0], items[2]
-
- @staticmethod
- def convert_array_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
- if avro_field_type != "array":
- raise RuntimeError("Avro type must be array: %s" % avro_field_type)
- element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
- items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
-
- is_optional = AvroToIceberg.is_option_schema(items)
-
- if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
- item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
- if item_type is None:
- raise RuntimeError("No mapping found for type %s" % items)
- else:
- raise RuntimeError("Complex list types not yet implemented")
-
- if is_optional:
- return ListType.of_optional(element_id, item_type), next_id
- else:
- return ListType.of_required(element_id, item_type), next_id
-
- @staticmethod
- def convert_map_type(avro_field, next_id=None):
- avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
- avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
- if avro_field_type != "array" or avro_logical_type != "map":
- raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)
- is_optional = False
- items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
- for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, list()):
- if field.get(AvroToIceberg.FIELD_NAME_PROP) == "key":
- key_id = field.get(AvroToIceberg.FIELD_ID_PROP)
- if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
- raise RuntimeError("Support for complex map keys not yet implemented")
- key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
- elif field.get(AvroToIceberg.FIELD_NAME_PROP) == "value":
- value_id = field.get(AvroToIceberg.FIELD_ID_PROP)
- if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
- raise RuntimeError("Support for complex map values not yet imeplemented")
- value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
-
- if is_optional:
- return MapType.of_optional(key_id, value_id, key_type, value_type), next_id
- else:
- return MapType.of_required(key_id, value_id, key_type, value_type), next_id
-
- @staticmethod
- def is_option_schema(field_type):
- if isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type:
- return True
-
- return False
-
- @staticmethod
- def read_avro_file(iceberg_schema, data_file):
- fo = data_file.new_fo()
- avro_reader = fastavro.reader(fo)
- for avro_row in avro_reader:
- iceberg_row = dict()
- for field in iceberg_schema.as_struct().fields:
- iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
- yield iceberg_row
- fo.close()
-
- @staticmethod
- def read_avro_row(iceberg_schema, avro_reader):
- try:
- for avro_row in avro_reader:
- iceberg_row = dict()
- for field in iceberg_schema.as_struct().fields:
- iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
- yield iceberg_row
- except StopIteration:
- return
-
- @staticmethod
- def get_field_from_avro(avro_row, field):
- try:
- return AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
- AvroToIceberg.get_field_from_primitive)(avro_row, field)
- except KeyError:
- raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)
-
- @staticmethod
- def get_field_from_primitive(avro_row, field):
- try:
- return avro_row[field.name]
- except KeyError:
- if field.is_required:
- raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
-
- @staticmethod
- def get_field_from_struct(avro_row, field):
- field_obj = {}
- for nested_field in field.type.fields:
- field_obj[nested_field.name] = AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
- return field_obj
-
- @staticmethod
- def get_field_from_list(avro_row, field):
- try:
- return avro_row[field.name]
- except KeyError:
- if field.is_required:
- raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
-
- @staticmethod
- def get_field_from_map(avro_row, field):
- val_map = dict()
-
- try:
- avro_value = avro_row[field.name]
- except KeyError:
- if field.is_required:
- raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
- else:
- return None
-
- for val in avro_value:
- val_map[val['key']] = val['value']
-
- return val_map
diff --git a/python_legacy/iceberg/core/avro/iceberg_to_avro.py b/python_legacy/iceberg/core/avro/iceberg_to_avro.py
deleted file mode 100644
index 29e4330b1b..0000000000
--- a/python_legacy/iceberg/core/avro/iceberg_to_avro.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from iceberg.api.types import BinaryType, TypeID
-
-
-class IcebergToAvro(object):
-
- @staticmethod
- def type_to_schema(struct_type, name):
- struct_fields = list()
- for field in struct_type.fields:
- struct_fields.append({"field-id": field.id,
- "name": field.name,
- "type": IcebergToAvro.get_field(field)})
- return {"type": "record",
- "name": name,
- "fields": struct_fields}
-
- @staticmethod
- def get_field(field):
- if field.type.is_primitive_type():
- return IcebergToAvro.to_option(field)
-
- elif field.type.type_id == TypeID.STRUCT:
- struct_fields = list()
- for struct_field in field.type.fields:
- field_dict = {"field-id": struct_field.id,
- "name": struct_field.name,
- "type": IcebergToAvro.get_field(struct_field)}
- if struct_field.is_optional:
- field_dict["default"] = None
-
- struct_fields.append(field_dict)
-
- return {"fields": struct_fields,
- "name": field.name,
- "type": "record"}
-
- elif field.type.type_id == TypeID.LIST:
- array_obj = {'element-id': field.type.element_id,
- "items": IcebergToAvro.get_field(field.type.element_field),
- "type": 'array'}
- if field.is_optional:
- return ['null', array_obj]
- return array_obj
-
- elif field.type.type_id == TypeID.MAP:
- key = field.type.key_field
- value = field.type.value_field
- array_obj = {"items": {"fields": [{"field-id": key.field_id,
- "name": key.name,
- "type": IcebergToAvro.get_field(key)},
- {"field-id": value.field_id,
- "name": value.name,
- "type": IcebergToAvro.get_field(value)}],
- "name": "k{}_v{}".format(key.field_id, value.field_id),
- "type": "record"},
- "logicalType": "map",
- "type": "array"}
- if field.is_optional:
- return ["null", array_obj]
-
- return array_obj
-
- @staticmethod
- def to_option(field):
- if field.type == BinaryType.get():
- type_name = "bytes"
- else:
- type_name = str(field.type)
-
- if field.is_optional:
- return ["null", type_name]
- else:
- return type_name
diff --git a/python_legacy/iceberg/core/base_combined_scan_task.py b/python_legacy/iceberg/core/base_combined_scan_task.py
deleted file mode 100644
index 8cb209e2f9..0000000000
--- a/python_legacy/iceberg/core/base_combined_scan_task.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from copy import deepcopy
-
-from iceberg.api import CombinedScanTask
-
-
-class BaseCombinedScanTask(CombinedScanTask):
-
- def __init__(self, tasks):
- self.tasks = deepcopy(tasks)
-
- @property
- def files(self):
- return self.tasks
-
- def __repr__(self):
- return "BaseCombinedScanTask([{}])".format(self.tasks)
-
- def __str__(self):
- total_size = sum([task.length for task in self.tasks])
- return "BaseCombinedScanTask(num_tasks={}, total_size={})".format(len(self.tasks), total_size)
diff --git a/python_legacy/iceberg/core/base_file_scan_task.py b/python_legacy/iceberg/core/base_file_scan_task.py
deleted file mode 100644
index f36fcf86f7..0000000000
--- a/python_legacy/iceberg/core/base_file_scan_task.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from iceberg.api import FileScanTask
-
-from .partition_spec_parser import PartitionSpecParser
-from .schema_parser import SchemaParser
-
-
-class BaseFileScanTask(FileScanTask):
-
- def __init__(self, file, schema_str, spec_str, residuals):
- self._file = file
- self._schema_str = schema_str
- self._spec_str = spec_str
- self._spec = None
- self._residuals = residuals
-
- @property
- def file(self):
- return self._file
-
- @property
- def spec(self):
- if self._spec is None:
- self._spec = PartitionSpecParser.from_json(SchemaParser.from_json(self._schema_str), self._spec_str)
-
- return self._spec
-
- @property
- def start(self):
- return 0
-
- @property
- def length(self):
- return self._file.file_size_in_bytes()
-
- @property
- def residual(self):
- return self._residuals.residual_for(self._file.partition())
-
- def split(self, split_size):
- if self.file.format().is_splittable():
- return [task for task in SplitScanTaskIterator(split_size, self)]
- else:
- return self
-
- def __repr__(self):
- fields = ["file: {}".format(self._file.path()),
- "partition_data: {}".format(self._file.partition()),
- "residual: {}".format(self.residual)]
-
- return "BaseFileScanTask({})".format(", ".join(fields))
-
- def __str__(self):
- return self.__repr__()
-
-
-class SplitScanTaskIterator(object):
-
- def __init__(self, split_size, file_scan_task):
- self._offset = 0
- self._remaining_len = file_scan_task.length
- self._split_size = split_size
- self._file_scan_task = file_scan_task
-
- def has_next(self):
- return self._remaining_len > 0
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if self.has_next():
- len = min(self._split_size, self._remaining_len)
- split_task = SplitScanTask(self._offset, len, self._file_scan_task)
- self._offset += len
- self._remaining_len -= len
- return split_task
-
- raise StopIteration
-
-
-class SplitScanTask(FileScanTask):
-
- def __init__(self, offset, len, file_scan_task):
- self._offset = offset
- self._len = len
- self._file_scan_task = file_scan_task
-
- @property
- def file(self):
- return self._file_scan_task.file
-
- @property
- def spec(self):
- return self._file_scan_task.spec
-
- @property
- def start(self):
- return self._offset
-
- @property
- def length(self):
- return self._len
-
- @property
- def residual(self):
- return self._file_scan_task.residual()
-
- def split(self):
- raise RuntimeError("Cannot split a task which is already split")
diff --git a/python_legacy/iceberg/core/base_metastore_table_operations.py b/python_legacy/iceberg/core/base_metastore_table_operations.py
deleted file mode 100644
index 97bdae2e0c..0000000000
--- a/python_legacy/iceberg/core/base_metastore_table_operations.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import logging
-import uuid
-
-from retrying import retry
-
-from .table_metadata_parser import TableMetadataParser
-from .table_operations import TableOperations
-from .table_properties import TableProperties
-
-_logger = logging.getLogger(__name__)
-
-
-class BaseMetastoreTableOperations(TableOperations):
-
- TABLE_TYPE_PROP = "table_type"
- ICEBERG_TABLE_TYPE_VALUE = "iceberg"
- METADATA_LOCATION_PROP = "metadata_location"
- PARTITION_SPEC_PROP = "partition_spec"
- PREVIOUS_METADATA_LOCATION_PROP = "previous_metadata_location"
-
- METADATA_FOLDER_NAME = "metadata"
- DATA_FOLDER_NAME = "data"
- HIVE_LOCATION_FOLDER_NAME = "empty"
-
- def __init__(self, conf):
- self.conf = conf
-
- self.current_metadata = None
- self.current_metadata_location = None
- self.base_location = None
- self.should_refresh = True
- self.version = -1
-
- def current(self):
- return self.current_metadata
-
- def hive_table_location(self):
- return "{base_location}/{hive}".format(base_location=self.base_location,
- hive=BaseMetastoreTableOperations.HIVE_LOCATION_FOLDER_NAME)
-
- def data_location(self):
- return "{base_location}/{data}".format(base_location=self.base_location,
- data=BaseMetastoreTableOperations.DATA_FOLDER_NAME)
-
- def request_refresh(self):
- self.should_refresh = True
-
- def write_new_metadata(self, metadata, version):
- from .filesystem import FileSystemOutputFile
-
- if self.base_location is None:
- self.base_location = metadata.location
-
- new_filename = BaseMetastoreTableOperations.new_table_metadata_filename(self.base_location,
- version)
- new_metadata_location = FileSystemOutputFile.from_path(new_filename, self.conf)
-
- TableMetadataParser.write(metadata, new_metadata_location)
- return new_filename
-
- def refresh_from_metadata_location(self, new_location, num_retries=20):
- if not self.current_metadata_location == new_location:
- _logger.info("Refreshing table metadata from new version: %s" % new_location)
- self.retryable_refresh(new_location)
-
- self.should_refresh = False
-
- def new_input_file(self, path):
- from .filesystem import FileSystemInputFile
-
- return FileSystemInputFile.from_location(path, self.conf)
-
- def new_metadata_file(self, filename):
- from .filesystem import FileSystemOutputFile
-
- return FileSystemOutputFile.from_path(BaseMetastoreTableOperations.new_metadata_location(self.base_location,
- filename),
- self.conf)
-
- def metadata_file_location(self, file_name, metadata=None):
- if metadata is None:
- return self.metadata_file_location(file_name, metadata=self.current())
-
- metadata_location = metadata.properties.get(TableProperties.WRITE_METADATA_LOCATION)
-
- if metadata_location is not None:
- return "{}/{}".format(metadata_location, file_name)
- else:
- return "{}/{}/{}".format(metadata.location, BaseMetastoreTableOperations.METADATA_FOLDER_NAME, file_name)
-
- def delete_file(self, path):
- from .filesystem import get_fs
- get_fs(path, self.conf).delete(path)
-
- @retry(wait_incrementing_start=100, wait_exponential_multiplier=4,
- wait_exponential_max=5000, stop_max_delay=600000, stop_max_attempt_number=2)
- def retryable_refresh(self, location):
- from .filesystem import FileSystemInputFile
-
- self.current_metadata = TableMetadataParser.read(self, FileSystemInputFile.from_location(location, self.conf))
- self.current_metadata_location = location
- self.base_location = self.current_metadata.location
- self.version = BaseMetastoreTableOperations.parse_version(location)
-
- @staticmethod
- def parse_version(metadata_location):
- version_start = metadata_location.rfind("/") + 1
- version_end = version_start + metadata_location[version_start:].find("-")
- return int(metadata_location[version_start:version_end])
-
- @staticmethod
- def new_metadata_location(base_location, filename):
- return "{}/{}/{}".format(base_location, BaseMetastoreTableOperations.METADATA_FOLDER_NAME, filename)
-
- @staticmethod
- def new_table_metadata_filename(base_location, new_version):
- return "{}/{}/{}-{}.metadata.json".format(base_location,
- BaseMetastoreTableOperations.METADATA_FOLDER_NAME,
- '%05d' % new_version,
- uuid.uuid4())
diff --git a/python_legacy/iceberg/core/base_metastore_tables.py b/python_legacy/iceberg/core/base_metastore_tables.py
deleted file mode 100644
index a78041c650..0000000000
--- a/python_legacy/iceberg/core/base_metastore_tables.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Tuple
-
-from . import TableOperations
-from .base_table import BaseTable
-from .table_metadata import TableMetadata
-from ..api import PartitionSpec, Schema, Table, Tables
-from ..exceptions import AlreadyExistsException, CommitFailedException, NoSuchTableException
-
-
-class BaseMetastoreTables(Tables):
-
- def __init__(self: "BaseMetastoreTables", conf: dict) -> None:
- self.conf = conf
-
- def new_table_ops(self: "BaseMetastoreTables", conf: dict, database: str, table: str) -> "TableOperations":
- raise RuntimeError("Abstract Implementation")
-
- def load(self: "BaseMetastoreTables", table_identifier: str) -> Table:
- database, table = _parse_table_identifier(table_identifier)
- ops = self.new_table_ops(self.conf, database, table)
- if ops.current():
- return BaseTable(ops, "{}.{}".format(database, table))
- raise NoSuchTableException("Table does not exist: {}.{}".format(database, table))
-
- def create(self: "BaseMetastoreTables", schema: Schema, table_identifier: str, spec: PartitionSpec = None,
- properties: dict = None, location: str = None) -> Table:
- database, table = _parse_table_identifier(table_identifier)
- ops = self.new_table_ops(self.conf, database, table)
- if ops.current(): # not None check here to ensure MagicMocks aren't treated as None
- raise AlreadyExistsException("Table already exists: " + table_identifier)
-
- base_location = location if location else self.default_warehouse_location(self.conf, database, table)
- full_spec, properties = super(BaseMetastoreTables, self).default_args(spec, properties)
- metadata = TableMetadata.new_table_metadata(ops, schema, full_spec, base_location, properties)
-
- try:
- ops.commit(None, metadata)
- except CommitFailedException:
- raise AlreadyExistsException("Table was created concurrently: " + table_identifier)
-
- return BaseTable(ops, "{}.{}".format(database, table))
-
- def begin_create(self: "BaseMetastoreTables", schema: Schema, spec: PartitionSpec, database: str, table_name: str,
- properties: dict = None):
- raise RuntimeError("Not Yet Implemented")
-
- def begin_replace(self: "BaseMetastoreTables", schema: Schema, spec: PartitionSpec, database: str, table: str,
- properties: dict = None):
- raise RuntimeError("Not Yet Implemented")
-
- def default_warehouse_location(self: "BaseMetastoreTables", conf: dict, database: str, table: str) -> str:
- warehouse_location = conf.get("hive.metastore.warehouse.dir")
- if warehouse_location:
- return f"{warehouse_location}/{database}.db/{table}"
- raise RuntimeError("Warehouse location is not set: hive.metastore.warehouse.dir=null")
-
-
-_DOT = '.'
-
-
-def _parse_table_identifier(table_identifier: str) -> Tuple[str, str]:
- parts = table_identifier.rsplit(_DOT, 1)
- if len(parts) > 1:
- database = parts[0]
- table = parts[1]
- else:
- database = "default"
- table = parts[0]
- return database, table
diff --git a/python_legacy/iceberg/core/base_snapshot.py b/python_legacy/iceberg/core/base_snapshot.py
deleted file mode 100644
index c849825f6e..0000000000
--- a/python_legacy/iceberg/core/base_snapshot.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import time
-
-from iceberg.api import (Filterable,
- FilteredSnapshot,
- ManifestFile,
- Snapshot,
- SnapshotIterable)
-from iceberg.api.expressions import Expressions
-from iceberg.api.io import CloseableGroup
-from iceberg.core.avro import AvroToIceberg
-
-from .generic_manifest_file import GenericManifestFile
-from .manifest_reader import ManifestReader
-
-
-class BaseSnapshot(Snapshot, SnapshotIterable, CloseableGroup):
-
- @staticmethod
- def snapshot_from_files(ops, snapshot_id, files):
- return BaseSnapshot(ops, snapshot_id, None,
- manifests=[GenericManifestFile(file=ops.new_input_file(path), spec_id=0)
- for path in files])
-
- def __init__(self, ops, snapshot_id, parent_id=None, manifests=None, manifest_list=None, timestamp_millis=None,
- operation=None, summary=None):
- super(BaseSnapshot, self).__init__()
- if timestamp_millis is None:
- timestamp_millis = int(time.time() * 1000)
-
- self._ops = ops
- self._snapshot_id = snapshot_id
- self._parent_id = parent_id
- self._timestamp_millis = timestamp_millis
- if manifests is not None:
- self._manifests = [manifest if isinstance(manifest, GenericManifestFile)
- else GenericManifestFile(file=ops.new_input_file(manifest), spec_id=0)
- for manifest in manifests]
- else:
- self._manifests = None
- self._manifest_list = manifest_list
- self._operation = operation
- self._summary = summary
-
- self._adds = None
- self._deletes = None
-
- @property
- def snapshot_id(self):
- return self._snapshot_id
-
- @property
- def timestamp_millis(self):
- return self._timestamp_millis
-
- @property
- def parent_id(self):
- return self._parent_id
-
- @property
- def manifests(self):
- if self._manifests is None:
- # if manifest isn't set then the snapshot_file is set and should be read to get the list
- return (GenericManifestFile.from_avro_record_json(manifest)
- for manifest in AvroToIceberg.read_avro_file(ManifestFile.schema(), self._manifest_list))
-
- return self._manifests
-
- @property
- def manifest_location(self):
- return self._manifest_list.location() if self._manifest_list is not None else None
-
- @property
- def summary(self):
- return self._summary
-
- @property
- def operation(self):
- return self._operation
-
- def select(self, columns):
- return FilteredSnapshot(self, Expressions.always_true(), Expressions.always_true(), columns)
-
- def filter_partitions(self, expr):
- return FilteredSnapshot(self, expr, Expressions.always_true(), Filterable.ALL_COLUMNS)
-
- def filter_rows(self, expr):
- return FilteredSnapshot(self, Expressions.always_true(), expr, Filterable.ALL_COLUMNS)
-
- def iterator(self, part_filter=None, row_filter=None, columns=None):
- if part_filter is None and row_filter is None and columns is None:
- return self.iterator(Expressions.always_true(), Expressions.always_true(), Filterable.ALL_COLUMNS)
- return iter([self.get_filtered_manifest(path, part_filter, row_filter, columns)
- for path in self._manifest_files])
-
- def added_files(self):
- raise NotImplementedError()
-
- def deleted_files(self):
- raise NotImplementedError()
-
- def cache_changes(self):
- raise NotImplementedError
-
- def __repr__(self):
- return "BaseSnapshot(id={id},timestamp_ms={ts_ms},manifests={manifests}".format(id=self._snapshot_id,
- ts_ms=self._timestamp_millis,
- manifests=self._manifests)
-
- def __str__(self):
- return self.__repr__()
-
- def get_filtered_manifest(self, path, part_filter=None, row_filter=None, columns=None):
- reader = ManifestReader.read(self._ops.new_input_file(path))
- self.add_closeable(reader)
- return reader
diff --git a/python_legacy/iceberg/core/base_table.py b/python_legacy/iceberg/core/base_table.py
deleted file mode 100644
index a86d9960b3..0000000000
--- a/python_legacy/iceberg/core/base_table.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from iceberg.api import Table
-
-from .data_table_scan import DataTableScan
-from .schema_update import SchemaUpdate
-
-
-class BaseTable(Table):
-
- def __init__(self, ops, name):
- self.ops = ops
- self.name = name
-
- def refresh(self):
- self.ops.refresh()
-
- def new_scan(self):
- return DataTableScan(self.ops, self)
-
- def schema(self):
- return self.ops.current().schema
-
- def spec(self):
- return self.ops.current().spec
-
- def properties(self):
- return self.ops.current().properties
-
- def location(self):
- return self.ops.current().location
-
- def current_snapshot(self):
- return self.ops.current().current_snapshot()
-
- def snapshots(self):
- return self.ops.current().snapshots
-
- def snapshots_with_summary_property(self, prop_key, prop_val):
- if prop_key is None:
- raise RuntimeError("Property Key cannot be None: (%s, %s)" % (prop_key, prop_val))
-
- for snapshot in self.ops.current().snapshots:
- if prop_key in snapshot.summary.keys() and snapshot.summary.get(prop_key) == prop_val:
- yield snapshot
-
- def update_schema(self):
- return SchemaUpdate(self.ops)
... 11208 lines suppressed ...