You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2022/11/08 13:55:11 UTC
[iceberg] branch master updated: Spark: Remove Spark 3.0 (#6094)
This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new bc9c686758 Spark: Remove Spark 3.0 (#6094)
bc9c686758 is described below
commit bc9c68675807bc6d5bf802d188d9aa37fde2e733
Author: Ajantha Bhat <aj...@gmail.com>
AuthorDate: Tue Nov 8 19:25:02 2022 +0530
Spark: Remove Spark 3.0 (#6094)
---
spark/v3.0/build.gradle | 278 ---
.../IcebergSqlExtensions.g4 | 293 ----
.../extensions/IcebergSparkSessionExtensions.scala | 57 -
.../analysis/AlignRowLevelOperations.scala | 111 --
.../analysis/AssignmentAlignmentSupport.scala | 208 ---
.../analysis/ProcedureArgumentCoercion.scala | 56 -
.../sql/catalyst/analysis/ResolveProcedures.scala | 190 --
.../RowLevelOperationsPredicateCheck.scala | 90 -
.../sql/catalyst/expressions/AccumulateFiles.scala | 43 -
.../OptimizeConditionsInRowLevelOperations.scala | 59 -
...pCorrelatedPredicatesInRowLevelOperations.scala | 51 -
.../sql/catalyst/optimizer/RewriteDelete.scala | 117 --
.../sql/catalyst/optimizer/RewriteMergeInto.scala | 255 ---
.../sql/catalyst/optimizer/RewriteUpdate.scala | 126 --
.../IcebergSparkSqlExtensionsParser.scala | 303 ----
.../IcebergSqlExtensionsAstBuilder.scala | 286 ---
.../catalyst/plans/logical/AddPartitionField.scala | 33 -
.../spark/sql/catalyst/plans/logical/Call.scala | 34 -
.../plans/logical/DropIdentifierFields.scala | 34 -
.../plans/logical/DropPartitionField.scala | 33 -
.../catalyst/plans/logical/DynamicFileFilter.scala | 62 -
.../sql/catalyst/plans/logical/MergeInto.scala | 38 -
.../sql/catalyst/plans/logical/ReplaceData.scala | 36 -
.../plans/logical/ReplacePartitionField.scala | 38 -
.../plans/logical/SetIdentifierFields.scala | 35 -
.../sql/catalyst/plans/logical/statements.scala | 44 -
.../utils/RewriteRowLevelOperationHelper.scala | 282 ---
.../spark/sql/catalyst/utils/SetAccumulator.scala | 45 -
.../datasources/v2/AddPartitionFieldExec.scala | 56 -
.../sql/execution/datasources/v2/CallExec.scala | 39 -
.../datasources/v2/DropIdentifierFieldsExec.scala | 65 -
.../datasources/v2/DropPartitionFieldExec.scala | 67 -
.../datasources/v2/DynamicFileFilterExec.scala | 130 --
.../datasources/v2/ExtendedBatchScanExec.scala | 60 -
.../v2/ExtendedDataSourceV2Implicits.scala | 51 -
.../v2/ExtendedDataSourceV2Strategy.scala | 147 --
.../execution/datasources/v2/MergeIntoExec.scala | 118 --
.../execution/datasources/v2/ReplaceDataExec.scala | 38 -
.../datasources/v2/ReplacePartitionFieldExec.scala | 72 -
.../datasources/v2/SetIdentifierFieldsExec.scala | 52 -
.../v2/SetWriteDistributionAndOrderingExec.scala | 78 -
.../apache/iceberg/spark/extensions/Employee.java | 66 -
.../spark/extensions/SparkExtensionsTestBase.java | 64 -
.../SparkRowLevelOperationsTestBase.java | 227 ---
.../spark/extensions/TestAddFilesProcedure.java | 1048 -----------
.../extensions/TestAlterTablePartitionFields.java | 439 -----
.../spark/extensions/TestAlterTableSchema.java | 162 --
.../spark/extensions/TestAncestorsOfProcedure.java | 168 --
.../spark/extensions/TestCallStatementParser.java | 214 ---
.../TestCherrypickSnapshotProcedure.java | 203 ---
.../spark/extensions/TestCopyOnWriteDelete.java | 43 -
.../spark/extensions/TestCopyOnWriteMerge.java | 43 -
.../spark/extensions/TestCopyOnWriteUpdate.java | 43 -
.../iceberg/spark/extensions/TestDelete.java | 853 ---------
.../extensions/TestExpireSnapshotsProcedure.java | 316 ----
.../spark/extensions/TestIcebergExpressions.java | 74 -
.../apache/iceberg/spark/extensions/TestMerge.java | 1837 --------------------
.../extensions/TestMigrateTableProcedure.java | 187 --
.../extensions/TestPublishChangesProcedure.java | 193 --
.../extensions/TestRemoveOrphanFilesProcedure.java | 465 -----
.../extensions/TestRewriteDataFilesProcedure.java | 416 -----
.../extensions/TestRewriteManifestsProcedure.java | 196 ---
.../TestRollbackToSnapshotProcedure.java | 297 ----
.../TestRollbackToTimestampProcedure.java | 304 ----
.../TestSetCurrentSnapshotProcedure.java | 256 ---
.../TestSetWriteDistributionAndOrdering.java | 302 ----
.../extensions/TestSnapshotTableProcedure.java | 231 ---
.../iceberg/spark/extensions/TestUpdate.java | 1036 -----------
spark/v3.0/spark-runtime/LICENSE | 630 -------
spark/v3.0/spark-runtime/NOTICE | 508 ------
.../java/org/apache/iceberg/spark/SmokeTest.java | 179 --
.../apache/iceberg/spark/SparkBenchmarkUtil.java | 57 -
.../SparkParquetReadersFlatDataBenchmark.java | 222 ---
.../SparkParquetReadersNestedDataBenchmark.java | 220 ---
.../SparkParquetWritersFlatDataBenchmark.java | 128 --
.../SparkParquetWritersNestedDataBenchmark.java | 128 --
.../org/apache/iceberg/spark/source/Action.java | 24 -
.../spark/source/IcebergSourceBenchmark.java | 199 ---
.../source/IcebergSourceFlatDataBenchmark.java | 59 -
.../source/IcebergSourceNestedDataBenchmark.java | 59 -
.../IcebergSourceNestedListDataBenchmark.java | 62 -
.../iceberg/spark/source/WritersBenchmark.java | 366 ----
.../spark/source/avro/AvroWritersBenchmark.java | 39 -
.../IcebergSourceFlatAvroDataReadBenchmark.java | 142 --
.../IcebergSourceNestedAvroDataReadBenchmark.java | 142 --
.../orc/IcebergSourceFlatORCDataBenchmark.java | 68 -
.../orc/IcebergSourceFlatORCDataReadBenchmark.java | 210 ---
...ebergSourceNestedListORCDataWriteBenchmark.java | 109 --
.../IcebergSourceNestedORCDataReadBenchmark.java | 183 --
...cebergSourceFlatParquetDataFilterBenchmark.java | 129 --
.../IcebergSourceFlatParquetDataReadBenchmark.java | 165 --
...IcebergSourceFlatParquetDataWriteBenchmark.java | 89 -
...gSourceNestedListParquetDataWriteBenchmark.java | 89 -
...bergSourceNestedParquetDataFilterBenchmark.java | 128 --
...cebergSourceNestedParquetDataReadBenchmark.java | 166 --
...ebergSourceNestedParquetDataWriteBenchmark.java | 88 -
.../source/parquet/ParquetWritersBenchmark.java | 39 -
...dDictionaryEncodedFlatParquetDataBenchmark.java | 137 --
.../VectorizedReadFlatParquetDataBenchmark.java | 333 ----
.../java/org/apache/iceberg/spark/BaseCatalog.java | 48 -
.../org/apache/iceberg/spark/CommitMetadata.java | 58 -
.../iceberg/spark/FileRewriteCoordinator.java | 96 -
.../iceberg/spark/FileScanTaskSetManager.java | 77 -
.../org/apache/iceberg/spark/IcebergSpark.java | 43 -
.../org/apache/iceberg/spark/JobGroupInfo.java | 44 -
.../org/apache/iceberg/spark/JobGroupUtils.java | 46 -
.../java/org/apache/iceberg/spark/OrderField.java | 117 --
.../org/apache/iceberg/spark/PathIdentifier.java | 57 -
.../iceberg/spark/PruneColumnsWithReordering.java | 275 ---
.../spark/PruneColumnsWithoutReordering.java | 240 ---
.../apache/iceberg/spark/RollbackStagedTable.java | 142 --
.../org/apache/iceberg/spark/SortOrderToSpark.java | 62 -
.../java/org/apache/iceberg/spark/Spark3Util.java | 991 -----------
.../apache/iceberg/spark/Spark3VersionUtil.java | 34 -
.../org/apache/iceberg/spark/SparkCatalog.java | 620 -------
.../org/apache/iceberg/spark/SparkConfParser.java | 186 --
.../org/apache/iceberg/spark/SparkDataFile.java | 208 ---
.../apache/iceberg/spark/SparkExceptionUtil.java | 64 -
.../org/apache/iceberg/spark/SparkFilters.java | 221 ---
.../iceberg/spark/SparkFixupTimestampType.java | 57 -
.../org/apache/iceberg/spark/SparkFixupTypes.java | 63 -
.../org/apache/iceberg/spark/SparkReadConf.java | 201 ---
.../org/apache/iceberg/spark/SparkReadOptions.java | 74 -
.../apache/iceberg/spark/SparkSQLProperties.java | 45 -
.../org/apache/iceberg/spark/SparkSchemaUtil.java | 308 ----
.../apache/iceberg/spark/SparkSessionCatalog.java | 311 ----
.../org/apache/iceberg/spark/SparkStructLike.java | 54 -
.../org/apache/iceberg/spark/SparkTableUtil.java | 794 ---------
.../org/apache/iceberg/spark/SparkTypeToType.java | 158 --
.../org/apache/iceberg/spark/SparkTypeVisitor.java | 78 -
.../java/org/apache/iceberg/spark/SparkUtil.java | 202 ---
.../apache/iceberg/spark/SparkValueConverter.java | 121 --
.../org/apache/iceberg/spark/SparkWriteConf.java | 152 --
.../apache/iceberg/spark/SparkWriteOptions.java | 53 -
.../org/apache/iceberg/spark/TypeToSparkType.java | 122 --
.../actions/BaseDeleteOrphanFilesSparkAction.java | 299 ----
.../BaseDeleteReachableFilesSparkAction.java | 212 ---
.../actions/BaseExpireSnapshotsSparkAction.java | 290 ---
.../spark/actions/BaseMigrateTableSparkAction.java | 243 ---
.../actions/BaseRewriteDataFilesSpark3Action.java | 47 -
.../actions/BaseRewriteDataFilesSparkAction.java | 520 ------
.../actions/BaseRewriteManifestsSparkAction.java | 399 -----
.../actions/BaseSnapshotTableSparkAction.java | 227 ---
.../actions/BaseSnapshotUpdateSparkAction.java | 45 -
.../iceberg/spark/actions/BaseSparkAction.java | 176 --
.../iceberg/spark/actions/BaseSparkActions.java | 60 -
.../actions/BaseTableCreationSparkAction.java | 182 --
.../iceberg/spark/actions/ManifestFileBean.java | 143 --
.../spark/actions/Spark3BinPackStrategy.java | 87 -
.../iceberg/spark/actions/Spark3SortStrategy.java | 165 --
.../apache/iceberg/spark/actions/SparkActions.java | 75 -
.../spark/data/AvroWithSparkSchemaVisitor.java | 76 -
.../spark/data/ParquetWithSparkSchemaVisitor.java | 231 ---
.../apache/iceberg/spark/data/SparkAvroReader.java | 168 --
.../apache/iceberg/spark/data/SparkAvroWriter.java | 165 --
.../apache/iceberg/spark/data/SparkOrcReader.java | 132 --
.../iceberg/spark/data/SparkOrcValueReaders.java | 241 ---
.../iceberg/spark/data/SparkOrcValueWriters.java | 201 ---
.../apache/iceberg/spark/data/SparkOrcWriter.java | 226 ---
.../iceberg/spark/data/SparkParquetReaders.java | 765 --------
.../iceberg/spark/data/SparkParquetWriters.java | 457 -----
.../iceberg/spark/data/SparkValueReaders.java | 288 ---
.../iceberg/spark/data/SparkValueWriters.java | 258 ---
.../vectorized/ArrowVectorAccessorFactory.java | 125 --
.../data/vectorized/ArrowVectorAccessors.java | 38 -
.../spark/data/vectorized/ColumnarBatchReader.java | 64 -
.../data/vectorized/ConstantColumnVector.java | 122 --
.../data/vectorized/IcebergArrowColumnVector.java | 159 --
.../data/vectorized/RowPositionColumnVector.java | 120 --
.../data/vectorized/VectorizedSparkOrcReaders.java | 459 -----
.../vectorized/VectorizedSparkParquetReaders.java | 53 -
.../spark/procedures/AddFilesProcedure.java | 286 ---
.../spark/procedures/AncestorsOfProcedure.java | 111 --
.../iceberg/spark/procedures/BaseProcedure.java | 164 --
.../procedures/CherrypickSnapshotProcedure.java | 100 --
.../spark/procedures/ExpireSnapshotsProcedure.java | 153 --
.../spark/procedures/MigrateTableProcedure.java | 111 --
.../spark/procedures/PublishChangesProcedure.java | 115 --
.../procedures/RemoveOrphanFilesProcedure.java | 172 --
.../procedures/RewriteDataFilesProcedure.java | 199 ---
.../procedures/RewriteManifestsProcedure.java | 114 --
.../procedures/RollbackToSnapshotProcedure.java | 99 --
.../procedures/RollbackToTimestampProcedure.java | 104 --
.../procedures/SetCurrentSnapshotProcedure.java | 100 --
.../spark/procedures/SnapshotTableProcedure.java | 113 --
.../iceberg/spark/procedures/SparkProcedures.java | 63 -
.../iceberg/spark/source/BaseDataReader.java | 205 ---
.../iceberg/spark/source/BatchDataReader.java | 130 --
.../spark/source/EqualityDeleteRowReader.java | 54 -
.../apache/iceberg/spark/source/IcebergSource.java | 198 ---
.../iceberg/spark/source/InternalRowWrapper.java | 91 -
.../apache/iceberg/spark/source/RowDataReader.java | 197 ---
.../iceberg/spark/source/RowDataRewriter.java | 179 --
.../spark/source/SerializableTableWithSize.java | 64 -
.../iceberg/spark/source/SparkAppenderFactory.java | 318 ----
.../iceberg/spark/source/SparkBatchQueryScan.java | 179 --
.../iceberg/spark/source/SparkBatchScan.java | 362 ----
.../spark/source/SparkFileWriterFactory.java | 276 ---
.../iceberg/spark/source/SparkFilesScan.java | 111 --
.../spark/source/SparkFilesScanBuilder.java | 48 -
.../iceberg/spark/source/SparkMergeBuilder.java | 112 --
.../iceberg/spark/source/SparkMergeScan.java | 194 ---
.../spark/source/SparkMicroBatchStream.java | 326 ----
.../spark/source/SparkPartitionedFanoutWriter.java | 55 -
.../spark/source/SparkPartitionedWriter.java | 55 -
.../iceberg/spark/source/SparkRewriteBuilder.java | 71 -
.../iceberg/spark/source/SparkScanBuilder.java | 204 ---
.../apache/iceberg/spark/source/SparkTable.java | 323 ----
.../apache/iceberg/spark/source/SparkWrite.java | 733 --------
.../iceberg/spark/source/SparkWriteBuilder.java | 164 --
.../iceberg/spark/source/StagedSparkTable.java | 41 -
.../org/apache/iceberg/spark/source/Stats.java | 42 -
.../iceberg/spark/source/StreamingOffset.java | 157 --
.../iceberg/spark/source/StructInternalRow.java | 359 ----
.../analysis/NoSuchProcedureException.java | 34 -
.../iceberg/catalog/ExtendedSupportsDelete.java | 43 -
.../sql/connector/iceberg/catalog/Procedure.java | 50 -
.../iceberg/catalog/ProcedureCatalog.java | 40 -
.../iceberg/catalog/ProcedureParameter.java | 56 -
.../iceberg/catalog/ProcedureParameterImpl.java | 75 -
.../connector/iceberg/catalog/SupportsMerge.java | 41 -
.../distributions/ClusteredDistribution.java | 34 -
.../iceberg/distributions/Distribution.java | 29 -
.../iceberg/distributions/Distributions.java | 57 -
.../iceberg/distributions/OrderedDistribution.java | 34 -
.../distributions/UnspecifiedDistribution.java | 29 -
.../impl/ClusterDistributionImpl.java | 35 -
.../impl/OrderedDistributionImpl.java | 35 -
.../impl/UnspecifiedDistributionImpl.java | 23 -
.../iceberg/expressions/NullOrdering.java | 44 -
.../iceberg/expressions/SortDirection.java | 44 -
.../connector/iceberg/expressions/SortOrder.java | 39 -
.../connector/iceberg/read/SupportsFileFilter.java | 53 -
.../sql/connector/iceberg/write/MergeBuilder.java | 39 -
...org.apache.spark.sql.sources.DataSourceRegister | 20 -
.../expressions/TransformExpressions.scala | 137 --
.../logical/SetWriteDistributionAndOrdering.scala | 44 -
.../plans/logical/SortOrderParserUtil.scala | 40 -
.../utils/DistributionAndOrderingUtils.scala | 220 ---
.../spark/sql/catalyst/utils/PlanUtils.scala | 68 -
.../datasources/SparkExpressionConverter.scala | 50 -
.../test/java/org/apache/iceberg/KryoHelpers.java | 51 -
.../java/org/apache/iceberg/TaskCheckHelper.java | 109 --
.../apache/iceberg/TestDataFileSerialization.java | 176 --
.../apache/iceberg/TestFileIOSerialization.java | 109 --
.../iceberg/TestManifestFileSerialization.java | 217 ---
.../apache/iceberg/TestScanTaskSerialization.java | 143 --
.../org/apache/iceberg/TestTableSerialization.java | 98 --
.../apache/iceberg/spark/SparkCatalogConfig.java | 64 -
.../apache/iceberg/spark/SparkCatalogTestBase.java | 62 -
.../org/apache/iceberg/spark/SparkTestBase.java | 260 ---
.../iceberg/spark/SparkTestBaseWithCatalog.java | 93 -
.../iceberg/spark/TestFileRewriteCoordinator.java | 273 ---
.../org/apache/iceberg/spark/TestSpark3Util.java | 132 --
.../iceberg/spark/TestSparkCatalogOperations.java | 97 --
.../org/apache/iceberg/spark/TestSparkFilters.java | 74 -
.../apache/iceberg/spark/TestSparkSchemaUtil.java | 54 -
.../apache/iceberg/spark/TestSparkTableUtil.java | 52 -
.../iceberg/spark/TestSparkValueConverter.java | 94 -
.../iceberg/spark/actions/TestCreateActions.java | 923 ----------
.../actions/TestDeleteReachableFilesAction.java | 331 ----
.../spark/actions/TestExpireSnapshotsAction.java | 1122 ------------
.../spark/actions/TestRemoveOrphanFilesAction.java | 737 --------
.../actions/TestRemoveOrphanFilesAction3.java | 199 ---
.../spark/actions/TestRewriteDataFilesAction.java | 1491 ----------------
.../spark/actions/TestRewriteManifestsAction.java | 495 ------
.../apache/iceberg/spark/data/AvroDataTest.java | 285 ---
.../apache/iceberg/spark/data/GenericsHelpers.java | 346 ----
.../org/apache/iceberg/spark/data/RandomData.java | 368 ----
.../org/apache/iceberg/spark/data/TestHelpers.java | 770 --------
.../apache/iceberg/spark/data/TestOrcWrite.java | 59 -
.../iceberg/spark/data/TestParquetAvroReader.java | 236 ---
.../iceberg/spark/data/TestParquetAvroWriter.java | 123 --
.../iceberg/spark/data/TestSparkAvroEnums.java | 96 -
.../iceberg/spark/data/TestSparkAvroReader.java | 64 -
.../iceberg/spark/data/TestSparkDateTimes.java | 74 -
.../data/TestSparkOrcReadMetadataColumns.java | 220 ---
.../iceberg/spark/data/TestSparkOrcReader.java | 110 --
.../data/TestSparkParquetReadMetadataColumns.java | 237 ---
.../iceberg/spark/data/TestSparkParquetReader.java | 206 ---
.../iceberg/spark/data/TestSparkParquetWriter.java | 119 --
.../spark/data/TestSparkRecordOrcReaderWriter.java | 153 --
...estParquetDictionaryEncodedVectorizedReads.java | 97 --
...naryFallbackToPlainEncodingVectorizedReads.java | 75 -
.../vectorized/TestParquetVectorizedReads.java | 352 ----
.../apache/iceberg/spark/source/LogMessage.java | 119 --
.../apache/iceberg/spark/source/ManualSource.java | 75 -
.../apache/iceberg/spark/source/SimpleRecord.java | 78 -
.../iceberg/spark/source/SparkTestTable.java | 59 -
.../apache/iceberg/spark/source/TestAvroScan.java | 111 --
.../iceberg/spark/source/TestDataFrameWrites.java | 422 -----
.../spark/source/TestDataSourceOptions.java | 451 -----
.../iceberg/spark/source/TestFilteredScan.java | 683 --------
.../spark/source/TestForwardCompatibility.java | 224 ---
.../iceberg/spark/source/TestIcebergSource.java | 42 -
.../source/TestIcebergSourceHadoopTables.java | 67 -
.../spark/source/TestIcebergSourceHiveTables.java | 78 -
.../spark/source/TestIcebergSourceTablesBase.java | 1824 -------------------
.../iceberg/spark/source/TestIcebergSpark.java | 205 ---
.../spark/source/TestIdentityPartitionData.java | 209 ---
.../spark/source/TestInternalRowWrapper.java | 79 -
.../TestMetadataTablesWithPartitionEvolution.java | 344 ----
.../iceberg/spark/source/TestParquetScan.java | 140 --
.../iceberg/spark/source/TestPartitionPruning.java | 467 -----
.../iceberg/spark/source/TestPartitionValues.java | 439 -----
.../iceberg/spark/source/TestPathIdentifier.java | 85 -
.../iceberg/spark/source/TestReadProjection.java | 609 -------
.../spark/source/TestSnapshotSelection.java | 229 ---
.../spark/source/TestSparkAppenderFactory.java | 69 -
.../spark/source/TestSparkBaseDataReader.java | 276 ---
.../iceberg/spark/source/TestSparkCatalog.java | 43 -
.../source/TestSparkCatalogCacheExpiration.java | 152 --
.../source/TestSparkCatalogHadoopOverrides.java | 145 --
.../iceberg/spark/source/TestSparkDataFile.java | 224 ---
.../iceberg/spark/source/TestSparkDataWrite.java | 656 -------
.../spark/source/TestSparkFileWriterFactory.java | 74 -
.../iceberg/spark/source/TestSparkFilesScan.java | 126 --
.../spark/source/TestSparkMergingMetrics.java | 73 -
.../spark/source/TestSparkMetadataColumns.java | 197 ---
.../spark/source/TestSparkPartitioningWriters.java | 74 -
.../source/TestSparkPositionDeltaWriters.java | 74 -
.../spark/source/TestSparkReadProjection.java | 258 ---
.../spark/source/TestSparkReaderDeletes.java | 245 ---
.../spark/source/TestSparkRollingFileWriters.java | 60 -
.../iceberg/spark/source/TestSparkTable.java | 60 -
.../spark/source/TestSparkWriterMetrics.java | 67 -
.../iceberg/spark/source/TestStreamingOffset.java | 56 -
.../spark/source/TestStructuredStreaming.java | 302 ----
.../spark/source/TestStructuredStreamingRead3.java | 532 ------
.../apache/iceberg/spark/source/TestTables.java | 205 ---
.../spark/source/TestTimestampWithoutZone.java | 309 ----
.../spark/source/TestWriteMetricsConfig.java | 298 ----
.../iceberg/spark/source/ThreeColumnRecord.java | 83 -
.../apache/iceberg/spark/sql/TestAlterTable.java | 331 ----
.../apache/iceberg/spark/sql/TestCreateTable.java | 344 ----
.../iceberg/spark/sql/TestCreateTableAsSelect.java | 416 -----
.../apache/iceberg/spark/sql/TestDeleteFrom.java | 154 --
.../apache/iceberg/spark/sql/TestNamespaceSQL.java | 245 ---
.../iceberg/spark/sql/TestPartitionedWrites.java | 160 --
.../apache/iceberg/spark/sql/TestRefreshTable.java | 82 -
.../org/apache/iceberg/spark/sql/TestSelect.java | 225 ---
.../spark/sql/TestTimestampWithoutZone.java | 238 ---
.../iceberg/spark/sql/TestUnpartitionedWrites.java | 172 --
343 files changed, 67105 deletions(-)
diff --git a/spark/v3.0/build.gradle b/spark/v3.0/build.gradle
deleted file mode 100644
index 938dc8da4a..0000000000
--- a/spark/v3.0/build.gradle
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-def sparkProjects = [
- project(':iceberg-spark:iceberg-spark-3.0_2.12'),
- project(":iceberg-spark:iceberg-spark-extensions-3.0_2.12"),
- project(':iceberg-spark:iceberg-spark-runtime-3.0_2.12')
-]
-
-configure(sparkProjects) {
- project.ext {
- sparkVersion = '3.0.3'
- }
-
- configurations {
- all {
- resolutionStrategy {
- force 'com.fasterxml.jackson.module:jackson-module-scala_2.12:2.11.4'
- force 'com.fasterxml.jackson.module:jackson-module-paranamer:2.11.4'
- force 'com.fasterxml.jackson.core:jackson-core:2.11.4'
- force 'com.fasterxml.jackson.core:jackson-databind:2.11.4'
- }
- }
- }
-}
-
-project(':iceberg-spark:iceberg-spark-3.0_2.12') {
- apply plugin: 'scala'
- apply plugin: 'com.github.alisiikh.scalastyle'
-
- sourceSets {
- main {
- scala.srcDirs = ['src/main/scala', 'src/main/java']
- java.srcDirs = []
- }
- }
-
- dependencies {
- implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
- api project(':iceberg-api')
- implementation project(':iceberg-common')
- implementation project(':iceberg-core')
- implementation project(':iceberg-data')
- implementation project(':iceberg-orc')
- implementation project(':iceberg-parquet')
- implementation project(':iceberg-arrow')
-
- compileOnly "com.google.errorprone:error_prone_annotations"
- compileOnly "org.apache.avro:avro"
- compileOnly("org.apache.spark:spark-hive_2.12:${sparkVersion}") {
- exclude group: 'org.apache.avro', module: 'avro'
- exclude group: 'org.apache.arrow'
- exclude group: 'org.apache.parquet'
- exclude group: 'org.roaringbitmap'
- }
-
- implementation("org.apache.parquet:parquet-column")
- implementation("org.apache.parquet:parquet-hadoop")
-
- implementation("org.apache.orc:orc-core::nohive") {
- exclude group: 'org.apache.hadoop'
- exclude group: 'commons-lang'
- // These artifacts are shaded and included in the orc-core fat jar
- exclude group: 'com.google.protobuf', module: 'protobuf-java'
- exclude group: 'org.apache.hive', module: 'hive-storage-api'
- }
-
- implementation("org.apache.arrow:arrow-vector") {
- exclude group: 'io.netty', module: 'netty-buffer'
- exclude group: 'io.netty', module: 'netty-common'
- exclude group: 'com.google.code.findbugs', module: 'jsr305'
- }
-
- testImplementation("org.apache.hadoop:hadoop-minicluster") {
- exclude group: 'org.apache.avro', module: 'avro'
- }
- testImplementation project(path: ':iceberg-hive-metastore')
- testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
- testImplementation "org.xerial:sqlite-jdbc"
- }
-
- tasks.withType(Test) {
- // For vectorized reads
- // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds
- systemProperty("arrow.enable_unsafe_memory_access", "true")
- // Disable expensive null check for every get(index) call.
- // Iceberg manages nullability checks itself instead of relying on arrow.
- systemProperty("arrow.enable_null_check_for_get", "false")
-
- // Vectorized reads need more memory
- maxHeapSize '2560m'
- }
-}
-
-project(":iceberg-spark:iceberg-spark-extensions-3.0_2.12") {
- apply plugin: 'java-library'
- apply plugin: 'scala'
- apply plugin: 'com.github.alisiikh.scalastyle'
- apply plugin: 'antlr'
-
- configurations {
- /*
- The Gradle Antlr plugin erroneously adds both antlr-build and runtime dependencies to the runtime path. This
- bug https://github.com/gradle/gradle/issues/820 exists because older versions of Antlr do not have separate
- runtime and implementation dependencies and they do not want to break backwards compatibility. So to only end up with
- the runtime dependency on the runtime classpath we remove the dependencies added by the plugin here. Then add
- the runtime dependency back to only the runtime configuration manually.
- */
- implementation {
- extendsFrom = extendsFrom.findAll { it != configurations.antlr }
- }
- }
-
- dependencies {
- compileOnly "org.scala-lang:scala-library"
- compileOnly project(path: ':iceberg-bundled-guava', configuration: 'shadow')
- compileOnly project(':iceberg-api')
- compileOnly project(':iceberg-core')
- compileOnly project(':iceberg-data')
- compileOnly project(':iceberg-orc')
- compileOnly project(':iceberg-common')
- compileOnly project(':iceberg-spark:iceberg-spark-3.0_2.12')
- compileOnly("org.apache.spark:spark-hive_2.12:${sparkVersion}") {
- exclude group: 'org.apache.avro', module: 'avro'
- exclude group: 'org.apache.arrow'
- exclude group: 'org.apache.parquet'
- exclude group: 'org.roaringbitmap'
- }
-
- testImplementation project(path: ':iceberg-hive-metastore')
- testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
-
- testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
- testImplementation project(path: ':iceberg-spark:iceberg-spark-3.0_2.12', configuration: 'testArtifacts')
-
- testImplementation "org.apache.avro:avro"
-
- // Required because we remove antlr plugin dependencies from the compile configuration, see note above
- runtimeOnly "org.antlr:antlr4-runtime:4.7.1"
- antlr "org.antlr:antlr4:4.7.1"
- }
-
- generateGrammarSource {
- maxHeapSize = "64m"
- arguments += ['-visitor', '-package', 'org.apache.spark.sql.catalyst.parser.extensions']
- }
-}
-
-project(':iceberg-spark:iceberg-spark-runtime-3.0_2.12') {
- apply plugin: 'com.github.johnrengelman.shadow'
-
- tasks.jar.dependsOn tasks.shadowJar
-
- sourceSets {
- integration {
- java.srcDir "$projectDir/src/integration/java"
- resources.srcDir "$projectDir/src/integration/resources"
- }
- }
-
- configurations {
- implementation {
- exclude group: 'org.apache.spark'
- // included in Spark
- exclude group: 'org.slf4j'
- exclude group: 'org.apache.commons'
- exclude group: 'commons-pool'
- exclude group: 'commons-codec'
- exclude group: 'org.xerial.snappy'
- exclude group: 'javax.xml.bind'
- exclude group: 'javax.annotation'
- exclude group: 'com.github.luben'
- exclude group: 'com.ibm.icu'
- exclude group: 'org.glassfish'
- exclude group: 'org.abego.treelayout'
- exclude group: 'org.antlr'
- }
- }
-
- dependencies {
- api project(':iceberg-api')
- implementation project(':iceberg-spark:iceberg-spark-3.0_2.12')
- implementation project(':iceberg-spark:iceberg-spark-extensions-3.0_2.12')
- implementation project(':iceberg-aws')
- implementation(project(':iceberg-aliyun')) {
- exclude group: 'edu.umd.cs.findbugs', module: 'findbugs'
- exclude group: 'org.apache.httpcomponents', module: 'httpclient'
- exclude group: 'commons-logging', module: 'commons-logging'
- }
- implementation project(':iceberg-hive-metastore')
- implementation(project(':iceberg-nessie')) {
- exclude group: 'com.google.code.findbugs', module: 'jsr305'
- }
-
- integrationImplementation "org.apache.spark:spark-hive_2.12:${sparkVersion}"
- integrationImplementation 'org.junit.vintage:junit-vintage-engine'
- integrationImplementation 'org.slf4j:slf4j-simple'
- integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
- integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
- integrationImplementation project(path: ':iceberg-spark:iceberg-spark-3.0_2.12', configuration: 'testArtifacts')
- integrationImplementation project(path: ':iceberg-spark:iceberg-spark-extensions-3.0_2.12', configuration: 'testArtifacts')
- // Not allowed on our classpath, only the runtime jar is allowed
- integrationCompileOnly project(':iceberg-spark:iceberg-spark-extensions-3.0_2.12')
- integrationCompileOnly project(':iceberg-spark:iceberg-spark-3.0_2.12')
- integrationCompileOnly project(':iceberg-api')
- }
-
- shadowJar {
- configurations = [project.configurations.runtimeClasspath]
-
- zip64 true
-
- // include the LICENSE and NOTICE files for the shaded Jar
- from(projectDir) {
- include 'LICENSE'
- include 'NOTICE'
- }
-
- // Relocate dependencies to avoid conflicts
- relocate 'com.google', 'org.apache.iceberg.shaded.com.google'
- relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml'
- relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes'
- relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework'
- relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro'
- relocate 'avro.shaded', 'org.apache.iceberg.shaded.org.apache.avro.shaded'
- relocate 'com.thoughtworks.paranamer', 'org.apache.iceberg.shaded.com.thoughtworks.paranamer'
- relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet'
- relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded'
- relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc'
- relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift'
- relocate 'org.apache.httpcomponents.client5', 'org.apache.iceberg.shaded.org.apache.httpcomponents.client5'
- // relocate Arrow and related deps to shade Iceberg specific version
- relocate 'io.netty.buffer', 'org.apache.iceberg.shaded.io.netty.buffer'
- relocate 'org.apache.arrow', 'org.apache.iceberg.shaded.org.apache.arrow'
- relocate 'com.carrotsearch', 'org.apache.iceberg.shaded.com.carrotsearch'
- relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra'
- relocate 'org.roaringbitmap', 'org.apache.iceberg.shaded.org.roaringbitmap'
-
- classifier null
- }
-
- task integrationTest(type: Test) {
- description = "Test Spark3 Runtime Jar against Spark 3.0"
- group = "verification"
- testClassesDirs = sourceSets.integration.output.classesDirs
- classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path)
- inputs.file(shadowJar.archiveFile.get().asFile.path)
- }
- integrationTest.dependsOn shadowJar
- check.dependsOn integrationTest
-
- jar {
- enabled = false
- }
-}
-
diff --git a/spark/v3.0/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4 b/spark/v3.0/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4
deleted file mode 100644
index d0b228df0a..0000000000
--- a/spark/v3.0/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- * This file is an adaptation of Presto's and Spark's grammar files.
- */
-
-grammar IcebergSqlExtensions;
-
-@lexer::members {
- /**
- * Verify whether current token is a valid decimal token (which contains dot).
- * Returns true if the character that follows the token is not a digit or letter or underscore.
- *
- * For example:
- * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
- * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
- * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
- * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed
- * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
- * which is not a digit or letter or underscore.
- */
- public boolean isValidDecimal() {
- int nextChar = _input.LA(1);
- if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
- nextChar == '_') {
- return false;
- } else {
- return true;
- }
- }
-
- /**
- * This method will be called when we see '/*' and try to match it as a bracketed comment.
- * If the next character is '+', it should be parsed as hint later, and we cannot match
- * it as a bracketed comment.
- *
- * Returns true if the next character is '+'.
- */
- public boolean isHint() {
- int nextChar = _input.LA(1);
- if (nextChar == '+') {
- return true;
- } else {
- return false;
- }
- }
-}
-
-singleStatement
- : statement EOF
- ;
-
-statement
- : CALL multipartIdentifier '(' (callArgument (',' callArgument)*)? ')' #call
- | ALTER TABLE multipartIdentifier ADD PARTITION FIELD transform (AS name=identifier)? #addPartitionField
- | ALTER TABLE multipartIdentifier DROP PARTITION FIELD transform #dropPartitionField
- | ALTER TABLE multipartIdentifier REPLACE PARTITION FIELD transform WITH transform (AS name=identifier)? #replacePartitionField
- | ALTER TABLE multipartIdentifier WRITE writeSpec #setWriteDistributionAndOrdering
- | ALTER TABLE multipartIdentifier SET IDENTIFIER_KW FIELDS fieldList #setIdentifierFields
- | ALTER TABLE multipartIdentifier DROP IDENTIFIER_KW FIELDS fieldList #dropIdentifierFields
- ;
-
-writeSpec
- : (writeDistributionSpec | writeOrderingSpec)*
- ;
-
-writeDistributionSpec
- : DISTRIBUTED BY PARTITION
- ;
-
-writeOrderingSpec
- : LOCALLY? ORDERED BY order
- | UNORDERED
- ;
-
-callArgument
- : expression #positionalArgument
- | identifier '=>' expression #namedArgument
- ;
-
-order
- : fields+=orderField (',' fields+=orderField)*
- | '(' fields+=orderField (',' fields+=orderField)* ')'
- ;
-
-orderField
- : transform direction=(ASC | DESC)? (NULLS nullOrder=(FIRST | LAST))?
- ;
-
-transform
- : multipartIdentifier #identityTransform
- | transformName=identifier
- '(' arguments+=transformArgument (',' arguments+=transformArgument)* ')' #applyTransform
- ;
-
-transformArgument
- : multipartIdentifier
- | constant
- ;
-
-expression
- : constant
- | stringMap
- ;
-
-constant
- : number #numericLiteral
- | booleanValue #booleanLiteral
- | STRING+ #stringLiteral
- | identifier STRING #typeConstructor
- ;
-
-stringMap
- : MAP '(' constant (',' constant)* ')'
- ;
-
-booleanValue
- : TRUE | FALSE
- ;
-
-number
- : MINUS? EXPONENT_VALUE #exponentLiteral
- | MINUS? DECIMAL_VALUE #decimalLiteral
- | MINUS? INTEGER_VALUE #integerLiteral
- | MINUS? BIGINT_LITERAL #bigIntLiteral
- | MINUS? SMALLINT_LITERAL #smallIntLiteral
- | MINUS? TINYINT_LITERAL #tinyIntLiteral
- | MINUS? DOUBLE_LITERAL #doubleLiteral
- | MINUS? FLOAT_LITERAL #floatLiteral
- | MINUS? BIGDECIMAL_LITERAL #bigDecimalLiteral
- ;
-
-multipartIdentifier
- : parts+=identifier ('.' parts+=identifier)*
- ;
-
-identifier
- : IDENTIFIER #unquotedIdentifier
- | quotedIdentifier #quotedIdentifierAlternative
- | nonReserved #unquotedIdentifier
- ;
-
-quotedIdentifier
- : BACKQUOTED_IDENTIFIER
- ;
-
-fieldList
- : fields+=multipartIdentifier (',' fields+=multipartIdentifier)*
- ;
-
-nonReserved
- : ADD | ALTER | AS | ASC | BY | CALL | DESC | DROP | FIELD | FIRST | LAST | NULLS | ORDERED | PARTITION | TABLE | WRITE
- | DISTRIBUTED | LOCALLY | UNORDERED | REPLACE | WITH | IDENTIFIER_KW | FIELDS | SET
- | TRUE | FALSE
- | MAP
- ;
-
-ADD: 'ADD';
-ALTER: 'ALTER';
-AS: 'AS';
-ASC: 'ASC';
-BY: 'BY';
-CALL: 'CALL';
-DESC: 'DESC';
-DISTRIBUTED: 'DISTRIBUTED';
-DROP: 'DROP';
-FIELD: 'FIELD';
-FIELDS: 'FIELDS';
-FIRST: 'FIRST';
-LAST: 'LAST';
-LOCALLY: 'LOCALLY';
-NULLS: 'NULLS';
-ORDERED: 'ORDERED';
-PARTITION: 'PARTITION';
-REPLACE: 'REPLACE';
-IDENTIFIER_KW: 'IDENTIFIER';
-SET: 'SET';
-TABLE: 'TABLE';
-UNORDERED: 'UNORDERED';
-WITH: 'WITH';
-WRITE: 'WRITE';
-
-TRUE: 'TRUE';
-FALSE: 'FALSE';
-
-MAP: 'MAP';
-
-PLUS: '+';
-MINUS: '-';
-
-STRING
- : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
- | '"' ( ~('"'|'\\') | ('\\' .) )* '"'
- ;
-
-BIGINT_LITERAL
- : DIGIT+ 'L'
- ;
-
-SMALLINT_LITERAL
- : DIGIT+ 'S'
- ;
-
-TINYINT_LITERAL
- : DIGIT+ 'Y'
- ;
-
-INTEGER_VALUE
- : DIGIT+
- ;
-
-EXPONENT_VALUE
- : DIGIT+ EXPONENT
- | DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
- ;
-
-DECIMAL_VALUE
- : DECIMAL_DIGITS {isValidDecimal()}?
- ;
-
-FLOAT_LITERAL
- : DIGIT+ EXPONENT? 'F'
- | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}?
- ;
-
-DOUBLE_LITERAL
- : DIGIT+ EXPONENT? 'D'
- | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
- ;
-
-BIGDECIMAL_LITERAL
- : DIGIT+ EXPONENT? 'BD'
- | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
- ;
-
-IDENTIFIER
- : (LETTER | DIGIT | '_')+
- ;
-
-BACKQUOTED_IDENTIFIER
- : '`' ( ~'`' | '``' )* '`'
- ;
-
-fragment DECIMAL_DIGITS
- : DIGIT+ '.' DIGIT*
- | '.' DIGIT+
- ;
-
-fragment EXPONENT
- : 'E' [+-]? DIGIT+
- ;
-
-fragment DIGIT
- : [0-9]
- ;
-
-fragment LETTER
- : [A-Z]
- ;
-
-SIMPLE_COMMENT
- : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN)
- ;
-
-BRACKETED_COMMENT
- : '/*' {!isHint()}? (BRACKETED_COMMENT|.)*? '*/' -> channel(HIDDEN)
- ;
-
-WS
- : [ \r\n\t]+ -> channel(HIDDEN)
- ;
-
-// Catch-all for anything we can't recognize.
-// We use this to be able to ignore and recover all the text
-// when splitting statements with DelimiterLexer
-UNRECOGNIZED
- : .
- ;
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala
deleted file mode 100644
index 30b5df5317..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.iceberg.spark.extensions
-
-import org.apache.spark.sql.SparkSessionExtensions
-import org.apache.spark.sql.catalyst.analysis.AlignRowLevelOperations
-import org.apache.spark.sql.catalyst.analysis.ProcedureArgumentCoercion
-import org.apache.spark.sql.catalyst.analysis.ResolveProcedures
-import org.apache.spark.sql.catalyst.analysis.RowLevelOperationsPredicateCheck
-import org.apache.spark.sql.catalyst.optimizer.OptimizeConditionsInRowLevelOperations
-import org.apache.spark.sql.catalyst.optimizer.PullupCorrelatedPredicatesInRowLevelOperations
-import org.apache.spark.sql.catalyst.optimizer.RewriteDelete
-import org.apache.spark.sql.catalyst.optimizer.RewriteMergeInto
-import org.apache.spark.sql.catalyst.optimizer.RewriteUpdate
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergSparkSqlExtensionsParser
-import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Strategy
-
-class IcebergSparkSessionExtensions extends (SparkSessionExtensions => Unit) {
-
- override def apply(extensions: SparkSessionExtensions): Unit = {
- // parser extensions
- extensions.injectParser { case (_, parser) => new IcebergSparkSqlExtensionsParser(parser) }
-
- // analyzer extensions
- extensions.injectResolutionRule { spark => ResolveProcedures(spark) }
- extensions.injectResolutionRule { _ => ProcedureArgumentCoercion }
- extensions.injectPostHocResolutionRule { spark => AlignRowLevelOperations }
- extensions.injectCheckRule { _ => RowLevelOperationsPredicateCheck }
-
- // optimizer extensions
- extensions.injectOptimizerRule { _ => OptimizeConditionsInRowLevelOperations }
- extensions.injectOptimizerRule { _ => PullupCorrelatedPredicatesInRowLevelOperations }
- extensions.injectOptimizerRule { spark => RewriteDelete(spark) }
- extensions.injectOptimizerRule { spark => RewriteUpdate(spark) }
- extensions.injectOptimizerRule { spark => RewriteMergeInto(spark) }
-
- // planner extensions
- extensions.injectPlannerStrategy { spark => ExtendedDataSourceV2Strategy(spark) }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlignRowLevelOperations.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlignRowLevelOperations.scala
deleted file mode 100644
index 6da3ba6323..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlignRowLevelOperations.scala
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.analysis
-
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.plans.logical.Assignment
-import org.apache.spark.sql.catalyst.plans.logical.DeleteAction
-import org.apache.spark.sql.catalyst.plans.logical.InsertAction
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable
-import org.apache.spark.sql.catalyst.plans.logical.UpdateAction
-import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-import org.apache.spark.sql.internal.SQLConf
-
-case object AlignRowLevelOperations extends Rule[LogicalPlan]
- with AssignmentAlignmentSupport with CastSupport {
-
- override def conf: SQLConf = SQLConf.get
-
- override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
- case u: UpdateTable if u.resolved && isIcebergRelation(u.table)=>
- u.copy(assignments = alignAssignments(u.table, u.assignments))
-
- case m: MergeIntoTable if m.resolved && isIcebergRelation(m.targetTable) =>
- val alignedMatchedActions = m.matchedActions.map {
- case u @ UpdateAction(_, assignments) =>
- u.copy(assignments = alignAssignments(m.targetTable, assignments))
- case d: DeleteAction =>
- d
- case _ =>
- throw new AnalysisException("Matched actions can only contain UPDATE or DELETE")
- }
-
- val alignedNotMatchedActions = m.notMatchedActions.map {
- case i @ InsertAction(_, assignments) =>
- // check no nested columns are present
- val refs = assignments.map(_.key).map(asAssignmentReference)
- refs.foreach { ref =>
- if (ref.size > 1) {
- throw new AnalysisException(
- "Nested fields are not supported inside INSERT clauses of MERGE operations: " +
- s"${ref.mkString("`", "`.`", "`")}")
- }
- }
-
- val colNames = refs.map(_.head)
-
- // check there are no duplicates
- val duplicateColNames = colNames.groupBy(identity).collect {
- case (name, matchingNames) if matchingNames.size > 1 => name
- }
-
- if (duplicateColNames.nonEmpty) {
- throw new AnalysisException(
- s"Duplicate column names inside INSERT clause: ${duplicateColNames.mkString(", ")}")
- }
-
- // reorder assignments by the target table column order
- val assignmentMap = colNames.zip(assignments).toMap
- i.copy(assignments = alignInsertActionAssignments(m.targetTable, assignmentMap))
-
- case _ =>
- throw new AnalysisException("Not matched actions can only contain INSERT")
- }
-
- m.copy(matchedActions = alignedMatchedActions, notMatchedActions = alignedNotMatchedActions)
- }
-
- private def alignInsertActionAssignments(
- targetTable: LogicalPlan,
- assignmentMap: Map[String, Assignment]): Seq[Assignment] = {
-
- val resolver = conf.resolver
-
- targetTable.output.map { targetAttr =>
- val assignment = assignmentMap
- .find { case (name, _) => resolver(name, targetAttr.name) }
- .map { case (_, assignment) => assignment }
-
- if (assignment.isEmpty) {
- throw new AnalysisException(
- s"Cannot find column '${targetAttr.name}' of the target table among " +
- s"the INSERT columns: ${assignmentMap.keys.mkString(", ")}. " +
- "INSERT clauses must provide values for all columns of the target table.")
- }
-
- val key = assignment.get.key
- val value = assignment.get.value
- Assignment(key, castIfNeeded(targetAttr, value, resolver))
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentAlignmentSupport.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentAlignmentSupport.scala
deleted file mode 100644
index c1140df1d2..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/AssignmentAlignmentSupport.scala
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.analysis
-
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.Alias
-import org.apache.spark.sql.catalyst.expressions.AnsiCast
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import org.apache.spark.sql.catalyst.expressions.Cast
-import org.apache.spark.sql.catalyst.expressions.CreateNamedStruct
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.ExtractValue
-import org.apache.spark.sql.catalyst.expressions.GetStructField
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.expressions.NamedExpression
-import org.apache.spark.sql.catalyst.plans.logical.Assignment
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.utils.RewriteRowLevelOperationHelper.createAlias
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy
-import org.apache.spark.sql.types.DataType
-import org.apache.spark.sql.types.StructField
-import org.apache.spark.sql.types.StructType
-import scala.collection.mutable
-
-trait AssignmentAlignmentSupport {
-
- def conf: SQLConf
-
- private case class ColumnUpdate(ref: Seq[String], expr: Expression)
-
- /**
- * Aligns assignments to match table columns.
- * <p>
- * This method processes and reorders given assignments so that each target column gets
- * an expression it should be set to. If a column does not have a matching assignment,
- * it will be set to its current value. For example, if one passes a table with columns c1, c2
- * and an assignment c2 = 1, this method will return c1 = c1, c2 = 1.
- * <p>
- * This method also handles updates to nested columns. If there is an assignment to a particular
- * nested field, this method will construct a new struct with one field updated
- * preserving other fields that have not been modified. For example, if one passes a table with
- * columns c1, c2 where c2 is a struct with fields n1 and n2 and an assignment c2.n2 = 1,
- * this method will return c1 = c1, c2 = struct(c2.n1, 1).
- *
- * @param table a target table
- * @param assignments assignments to align
- * @return aligned assignments that match table columns
- */
- protected def alignAssignments(
- table: LogicalPlan,
- assignments: Seq[Assignment]): Seq[Assignment] = {
-
- val columnUpdates = assignments.map(a => ColumnUpdate(a.key, a.value))
- val outputExprs = applyUpdates(table.output, columnUpdates)
- outputExprs.zip(table.output).map {
- case (expr, attr) => Assignment(attr, expr)
- }
- }
-
- private def applyUpdates(
- cols: Seq[NamedExpression],
- updates: Seq[ColumnUpdate],
- resolver: Resolver = conf.resolver,
- namePrefix: Seq[String] = Nil): Seq[Expression] = {
-
- // iterate through columns at the current level and find which column updates match
- cols.map { col =>
- // find matches for this column or any of its children
- val prefixMatchedUpdates = updates.filter(a => resolver(a.ref.head, col.name))
- prefixMatchedUpdates match {
- // if there is no exact match and no match for children, return the column as is
- case updates if updates.isEmpty =>
- col
-
- // if there is an exact match, return the assigned expression
- case Seq(update) if isExactMatch(update, col, resolver) =>
- castIfNeeded(col, update.expr, resolver)
-
- // if there are matches only for children
- case updates if !hasExactMatch(updates, col, resolver) =>
- col.dataType match {
- case StructType(fields) =>
- // build field expressions
- val fieldExprs = fields.zipWithIndex.map { case (field, ordinal) =>
- createAlias(GetStructField(col, ordinal, Some(field.name)), field.name)
- }
-
- // recursively apply this method on nested fields
- val newUpdates = updates.map(u => u.copy(ref = u.ref.tail))
- val updatedFieldExprs = applyUpdates(fieldExprs, newUpdates, resolver, namePrefix :+ col.name)
-
- // construct a new struct with updated field expressions
- toNamedStruct(fields, updatedFieldExprs)
-
- case otherType =>
- val colName = (namePrefix :+ col.name).mkString(".")
- throw new AnalysisException(
- "Updating nested fields is only supported for StructType " +
- s"but $colName is of type $otherType"
- )
- }
-
- // if there are conflicting updates, throw an exception
- // there are two illegal scenarios:
- // - multiple updates to the same column
- // - updates to a top-level struct and its nested fields (e.g., a.b and a.b.c)
- case updates if hasExactMatch(updates, col, resolver) =>
- val conflictingCols = updates.map(u => (namePrefix ++ u.ref).mkString("."))
- throw new AnalysisException(
- "Updates are in conflict for these columns: " +
- conflictingCols.distinct.mkString(", "))
- }
- }
- }
-
- private def toNamedStruct(fields: Seq[StructField], fieldExprs: Seq[Expression]): Expression = {
- val namedStructExprs = fields.zip(fieldExprs).flatMap { case (field, expr) =>
- Seq(Literal(field.name), expr)
- }
- CreateNamedStruct(namedStructExprs)
- }
-
- private def hasExactMatch(
- updates: Seq[ColumnUpdate],
- col: NamedExpression,
- resolver: Resolver): Boolean = {
-
- updates.exists(assignment => isExactMatch(assignment, col, resolver))
- }
-
- private def isExactMatch(
- update: ColumnUpdate,
- col: NamedExpression,
- resolver: Resolver): Boolean = {
-
- update.ref match {
- case Seq(namePart) if resolver(namePart, col.name) => true
- case _ => false
- }
- }
-
- protected def castIfNeeded(
- tableAttr: NamedExpression,
- expr: Expression,
- resolver: Resolver): Expression = {
-
- val storeAssignmentPolicy = conf.storeAssignmentPolicy
-
- // run the type check and catch type errors
- storeAssignmentPolicy match {
- case StoreAssignmentPolicy.STRICT | StoreAssignmentPolicy.ANSI =>
- if (expr.nullable && !tableAttr.nullable) {
- throw new AnalysisException(
- s"Cannot write nullable values to non-null column '${tableAttr.name}'")
- }
-
- // we use byName = true to catch cases when struct field names don't match
- // e.g. a struct with fields (a, b) is assigned as a struct with fields (a, c) or (b, a)
- val errors = new mutable.ArrayBuffer[String]()
- val canWrite = DataType.canWrite(
- expr.dataType, tableAttr.dataType, byName = true, resolver, tableAttr.name,
- storeAssignmentPolicy, err => errors += err)
-
- if (!canWrite) {
- throw new AnalysisException(s"Cannot write incompatible data:\n- ${errors.mkString("\n- ")}")
- }
-
- case _ => // OK
- }
-
- storeAssignmentPolicy match {
- case _ if tableAttr.dataType.sameType(expr.dataType) =>
- expr
- case StoreAssignmentPolicy.ANSI =>
- AnsiCast(expr, tableAttr.dataType, Option(conf.sessionLocalTimeZone))
- case _ =>
- Cast(expr, tableAttr.dataType, Option(conf.sessionLocalTimeZone))
- }
- }
-
- implicit protected def asAssignmentReference(expr: Expression): Seq[String] = expr match {
- case attr: AttributeReference => Seq(attr.name)
- case Alias(child, _) => asAssignmentReference(child)
- case GetStructField(child, _, Some(name)) => asAssignmentReference(child) :+ name
- case other: ExtractValue =>
- throw new AnalysisException(s"Updating nested fields is only supported for structs: $other")
- case other =>
- throw new AnalysisException(s"Cannot convert to a reference, unsupported expression: $other")
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ProcedureArgumentCoercion.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ProcedureArgumentCoercion.scala
deleted file mode 100644
index 7f0ca8fadd..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ProcedureArgumentCoercion.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.analysis
-
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.Cast
-import org.apache.spark.sql.catalyst.plans.logical.Call
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.rules.Rule
-
-object ProcedureArgumentCoercion extends Rule[LogicalPlan] {
- override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
- case c @ Call(procedure, args) if c.resolved =>
- val params = procedure.parameters
-
- val newArgs = args.zipWithIndex.map { case (arg, index) =>
- val param = params(index)
- val paramType = param.dataType
- val argType = arg.dataType
-
- if (paramType != argType && !Cast.canUpCast(argType, paramType)) {
- throw new AnalysisException(
- s"Wrong arg type for ${param.name}: cannot cast $argType to $paramType")
- }
-
- if (paramType != argType) {
- Cast(arg, paramType)
- } else {
- arg
- }
- }
-
- if (newArgs != args) {
- c.copy(args = newArgs)
- } else {
- c
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveProcedures.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveProcedures.scala
deleted file mode 100644
index b50655d5c2..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveProcedures.scala
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.analysis
-
-import java.util.Locale
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.plans.logical.Call
-import org.apache.spark.sql.catalyst.plans.logical.CallArgument
-import org.apache.spark.sql.catalyst.plans.logical.CallStatement
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.NamedArgument
-import org.apache.spark.sql.catalyst.plans.logical.PositionalArgument
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.connector.catalog.CatalogManager
-import org.apache.spark.sql.connector.catalog.CatalogPlugin
-import org.apache.spark.sql.connector.catalog.LookupCatalog
-import org.apache.spark.sql.connector.iceberg.catalog.ProcedureCatalog
-import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter
-import scala.collection.Seq
-
-case class ResolveProcedures(spark: SparkSession) extends Rule[LogicalPlan] with LookupCatalog {
-
- protected lazy val catalogManager: CatalogManager = spark.sessionState.catalogManager
-
- override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
- case CallStatement(CatalogAndIdentifier(catalog, ident), args) =>
- val procedure = catalog.asProcedureCatalog.loadProcedure(ident)
-
- val params = procedure.parameters
- val normalizedParams = normalizeParams(params)
- validateParams(normalizedParams)
-
- val normalizedArgs = normalizeArgs(args)
- Call(procedure, args = buildArgExprs(normalizedParams, normalizedArgs))
- }
-
- private def validateParams(params: Seq[ProcedureParameter]): Unit = {
- // should not be any duplicate param names
- val duplicateParamNames = params.groupBy(_.name).collect {
- case (name, matchingParams) if matchingParams.length > 1 => name
- }
-
- if (duplicateParamNames.nonEmpty) {
- throw new AnalysisException(s"Duplicate parameter names: ${duplicateParamNames.mkString("[", ",", "]")}")
- }
-
- // optional params should be at the end
- params.sliding(2).foreach {
- case Seq(previousParam, currentParam) if !previousParam.required && currentParam.required =>
- throw new AnalysisException(
- s"Optional parameters must be after required ones but $currentParam is after $previousParam")
- case _ =>
- }
- }
-
- private def buildArgExprs(
- params: Seq[ProcedureParameter],
- args: Seq[CallArgument]): Seq[Expression] = {
-
- // build a map of declared parameter names to their positions
- val nameToPositionMap = params.map(_.name).zipWithIndex.toMap
-
- // build a map of parameter names to args
- val nameToArgMap = buildNameToArgMap(params, args, nameToPositionMap)
-
- // verify all required parameters are provided
- val missingParamNames = params.filter(_.required).collect {
- case param if !nameToArgMap.contains(param.name) => param.name
- }
-
- if (missingParamNames.nonEmpty) {
- throw new AnalysisException(s"Missing required parameters: ${missingParamNames.mkString("[", ",", "]")}")
- }
-
- val argExprs = new Array[Expression](params.size)
-
- nameToArgMap.foreach { case (name, arg) =>
- val position = nameToPositionMap(name)
- argExprs(position) = arg.expr
- }
-
- // assign nulls to optional params that were not set
- params.foreach {
- case p if !p.required && !nameToArgMap.contains(p.name) =>
- val position = nameToPositionMap(p.name)
- argExprs(position) = Literal.create(null, p.dataType)
- case _ =>
- }
-
- argExprs
- }
-
- private def buildNameToArgMap(
- params: Seq[ProcedureParameter],
- args: Seq[CallArgument],
- nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = {
-
- val containsNamedArg = args.exists(_.isInstanceOf[NamedArgument])
- val containsPositionalArg = args.exists(_.isInstanceOf[PositionalArgument])
-
- if (containsNamedArg && containsPositionalArg) {
- throw new AnalysisException("Named and positional arguments cannot be mixed")
- }
-
- if (containsNamedArg) {
- buildNameToArgMapUsingNames(args, nameToPositionMap)
- } else {
- buildNameToArgMapUsingPositions(args, params)
- }
- }
-
- private def buildNameToArgMapUsingNames(
- args: Seq[CallArgument],
- nameToPositionMap: Map[String, Int]): Map[String, CallArgument] = {
-
- val namedArgs = args.asInstanceOf[Seq[NamedArgument]]
-
- val validationErrors = namedArgs.groupBy(_.name).collect {
- case (name, matchingArgs) if matchingArgs.size > 1 => s"Duplicate procedure argument: $name"
- case (name, _) if !nameToPositionMap.contains(name) => s"Unknown argument: $name"
- }
-
- if (validationErrors.nonEmpty) {
- throw new AnalysisException(s"Could not build name to arg map: ${validationErrors.mkString(", ")}")
- }
-
- namedArgs.map(arg => arg.name -> arg).toMap
- }
-
- private def buildNameToArgMapUsingPositions(
- args: Seq[CallArgument],
- params: Seq[ProcedureParameter]): Map[String, CallArgument] = {
-
- if (args.size > params.size) {
- throw new AnalysisException("Too many arguments for procedure")
- }
-
- args.zipWithIndex.map { case (arg, position) =>
- val param = params(position)
- param.name -> arg
- }.toMap
- }
-
- private def normalizeParams(params: Seq[ProcedureParameter]): Seq[ProcedureParameter] = {
- params.map {
- case param if param.required =>
- val normalizedName = param.name.toLowerCase(Locale.ROOT)
- ProcedureParameter.required(normalizedName, param.dataType)
- case param =>
- val normalizedName = param.name.toLowerCase(Locale.ROOT)
- ProcedureParameter.optional(normalizedName, param.dataType)
- }
- }
-
- private def normalizeArgs(args: Seq[CallArgument]): Seq[CallArgument] = {
- args.map {
- case a @ NamedArgument(name, _) => a.copy(name = name.toLowerCase(Locale.ROOT))
- case other => other
- }
- }
-
- implicit class CatalogHelper(plugin: CatalogPlugin) {
- def asProcedureCatalog: ProcedureCatalog = plugin match {
- case procedureCatalog: ProcedureCatalog =>
- procedureCatalog
- case _ =>
- throw new AnalysisException(s"Cannot use catalog ${plugin.name}: not a ProcedureCatalog")
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RowLevelOperationsPredicateCheck.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RowLevelOperationsPredicateCheck.scala
deleted file mode 100644
index 0beab493f2..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RowLevelOperationsPredicateCheck.scala
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.analysis
-
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.InSubquery
-import org.apache.spark.sql.catalyst.expressions.Not
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.DeleteAction
-import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
-import org.apache.spark.sql.catalyst.plans.logical.InsertAction
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable
-import org.apache.spark.sql.catalyst.plans.logical.UpdateAction
-import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-
-object RowLevelOperationsPredicateCheck extends (LogicalPlan => Unit) {
-
- override def apply(plan: LogicalPlan): Unit = {
- plan foreach {
- case DeleteFromTable(r, Some(condition)) if hasNullAwarePredicateWithinNot(condition) && isIcebergRelation(r) =>
- // this limitation is present since SPARK-25154 fix is not yet available
- // we use Not(EqualsNullSafe(cond, true)) when deciding which records to keep
- // such conditions are rewritten by Spark as an existential join and currently Spark
- // does not handle correctly NOT IN subqueries nested into other expressions
- failAnalysis("Null-aware predicate subqueries are not currently supported in DELETE")
-
- case UpdateTable(r, _, Some(condition)) if hasNullAwarePredicateWithinNot(condition) && isIcebergRelation(r) =>
- // this limitation is present since SPARK-25154 fix is not yet available
- // we use Not(EqualsNullSafe(cond, true)) when processing records that did not match
- // the update condition but were present in files we are overwriting
- // such conditions are rewritten by Spark as an existential join and currently Spark
- // does not handle correctly NOT IN subqueries nested into other expressions
- failAnalysis("Null-aware predicate subqueries are not currently supported in UPDATE")
-
- case merge: MergeIntoTable if isIcebergRelation(merge.targetTable) =>
- validateMergeIntoConditions(merge)
-
- case _ => // OK
- }
- }
-
- private def validateMergeIntoConditions(merge: MergeIntoTable): Unit = {
- checkMergeIntoCondition(merge.mergeCondition, "SEARCH")
- val actions = merge.matchedActions ++ merge.notMatchedActions
- actions.foreach {
- case DeleteAction(Some(cond)) => checkMergeIntoCondition(cond, "DELETE")
- case UpdateAction(Some(cond), _) => checkMergeIntoCondition(cond, "UPDATE")
- case InsertAction(Some(cond), _) => checkMergeIntoCondition(cond, "INSERT")
- case _ => // OK
- }
- }
-
- private def checkMergeIntoCondition(cond: Expression, condName: String): Unit = {
- // Spark already validates the conditions are deterministic and don't contain aggregates
- if (SubqueryExpression.hasSubquery(cond)) {
- throw new AnalysisException(
- s"Subqueries are not supported in conditions of MERGE operations. " +
- s"Found a subquery in the $condName condition: ${cond.sql}")
- }
- }
-
- private def hasNullAwarePredicateWithinNot(cond: Expression): Boolean = {
- cond.find {
- case Not(expr) if expr.find(_.isInstanceOf[InSubquery]).isDefined => true
- case _ => false
- }.isDefined
- }
-
- private def failAnalysis(msg: String): Unit = throw new AnalysisException(msg)
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/expressions/AccumulateFiles.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/expressions/AccumulateFiles.scala
deleted file mode 100644
index f673db3da2..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/expressions/AccumulateFiles.scala
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
-import org.apache.spark.sql.catalyst.utils.SetAccumulator
-import org.apache.spark.sql.types.DataType
-import org.apache.spark.sql.types.IntegerType
-
-case class AccumulateFiles(
- filesAccumulator: SetAccumulator[String],
- child: Expression) extends UnaryExpression with CodegenFallback {
-
- override def dataType: DataType = IntegerType
- override def nullable: Boolean = true
- override def prettyName: String = "AccumulateFiles"
- override lazy val deterministic: Boolean = false
- private val RETURN_VAL: Integer = 1
-
- override def eval(input: InternalRow) : Any = {
- val resultVal = child.eval(input)
- filesAccumulator.add(resultVal.toString)
- RETURN_VAL
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeConditionsInRowLevelOperations.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeConditionsInRowLevelOperations.scala
deleted file mode 100644
index 621d0acd4e..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeConditionsInRowLevelOperations.scala
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
-
-// we have to optimize expressions used in delete/update before we can rewrite row-level operations
-// otherwise, we will have to deal with redundant casts and will not detect noop deletes
-// it is a temp solution since we cannot inject rewrite of row-level ops after operator optimizations
-object OptimizeConditionsInRowLevelOperations extends Rule[LogicalPlan] {
- override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
- case d @ DeleteFromTable(table, cond)
- if !SubqueryExpression.hasSubquery(cond.getOrElse(Literal.TrueLiteral)) && isIcebergRelation(table) =>
- val optimizedCond = optimizeCondition(cond.getOrElse(Literal.TrueLiteral), table)
- d.copy(condition = Some(optimizedCond))
- case u @ UpdateTable(table, _, cond)
- if !SubqueryExpression.hasSubquery(cond.getOrElse(Literal.TrueLiteral)) && isIcebergRelation(table) =>
- val optimizedCond = optimizeCondition(cond.getOrElse(Literal.TrueLiteral), table)
- u.copy(condition = Some(optimizedCond))
- }
-
- private def optimizeCondition(cond: Expression, table: LogicalPlan): Expression = {
- val optimizer = SparkSession.active.sessionState.optimizer
- optimizer.execute(Filter(cond, table)) match {
- case Filter(optimizedCondition, _) => optimizedCondition
- case _: LocalRelation => Literal.FalseLiteral
- case _: DataSourceV2ScanRelation => Literal.TrueLiteral
- case _ => cond
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesInRowLevelOperations.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesInRowLevelOperations.scala
deleted file mode 100644
index f0794d79c4..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesInRowLevelOperations.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-
-// a temp solution until PullupCorrelatedPredicates handles row-level operations in Spark
-object PullupCorrelatedPredicatesInRowLevelOperations extends Rule[LogicalPlan] {
- override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
- case d @ DeleteFromTable(table, Some(cond)) if SubqueryExpression.hasSubquery(cond) && isIcebergRelation(table) =>
- val transformedCond = transformCond(table, cond)
- d.copy(condition = Some(transformedCond))
-
- case u @ UpdateTable(table, _, Some(cond)) if SubqueryExpression.hasSubquery(cond) && isIcebergRelation(table) =>
- val transformedCond = transformCond(table, cond)
- u.copy(condition = Some(transformedCond))
- }
-
- // Spark pulls up correlated predicates only for UnaryNodes
- // DeleteFromTable and UpdateTable do not extend UnaryNode so they are ignored in that rule
- // We have this workaround until it is fixed in Spark
- private def transformCond(table: LogicalPlan, cond: Expression): Expression = {
- val filter = Filter(cond, table)
- val transformedFilter = PullupCorrelatedPredicates.apply(filter)
- transformedFilter.asInstanceOf[Filter].condition
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDelete.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDelete.scala
deleted file mode 100644
index eeac38e151..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDelete.scala
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.iceberg.DistributionMode
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.Ascending
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import org.apache.spark.sql.catalyst.expressions.EqualNullSafe
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.expressions.Not
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
-import org.apache.spark.sql.catalyst.plans.logical.Sort
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.DistributionAndOrderingUtils
-import org.apache.spark.sql.catalyst.utils.PlanUtils.createRepartitionByExpression
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-import org.apache.spark.sql.catalyst.utils.RewriteRowLevelOperationHelper
-import org.apache.spark.sql.connector.catalog.Table
-import org.apache.spark.sql.connector.iceberg.catalog.ExtendedSupportsDelete
-import org.apache.spark.sql.execution.datasources.DataSourceStrategy
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
-import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Implicits
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.BooleanType
-
-case class RewriteDelete(spark: SparkSession) extends Rule[LogicalPlan] with RewriteRowLevelOperationHelper {
-
- import ExtendedDataSourceV2Implicits._
- import RewriteRowLevelOperationHelper._
- import DistributionAndOrderingUtils._
-
- override def conf: SQLConf = SQLConf.get
-
- override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
- // don't rewrite deletes that can be answered by passing filters to deleteWhere in SupportsDelete
- case d @ DeleteFromTable(r: DataSourceV2Relation, Some(cond))
- if isMetadataDelete(r, cond) && isIcebergRelation(r) =>
- d
-
- // rewrite all operations that require reading the table to delete records
- case DeleteFromTable(r: DataSourceV2Relation, Some(cond)) if isIcebergRelation(r) =>
- // TODO: do a switch based on whether we get BatchWrite or DeltaBatchWrite
- val writeInfo = newWriteInfo(r.schema)
- val mergeBuilder = r.table.asMergeable.newMergeBuilder("delete", writeInfo)
-
- val matchingRowsPlanBuilder = scanRelation => Filter(cond, scanRelation)
- val scanPlan = buildDynamicFilterScanPlan(spark, r, r.output, mergeBuilder, cond, matchingRowsPlanBuilder)
-
- val remainingRowFilter = Not(EqualNullSafe(cond, Literal(true, BooleanType)))
- val remainingRowsPlan = Filter(remainingRowFilter, scanPlan)
-
- val mergeWrite = mergeBuilder.asWriteBuilder.buildForBatch()
- val writePlan = buildWritePlan(remainingRowsPlan, r.table, r.output)
- ReplaceData(r, mergeWrite, writePlan)
- }
-
- private def buildWritePlan(
- remainingRowsPlan: LogicalPlan,
- table: Table,
- output: Seq[AttributeReference]): LogicalPlan = {
-
- val fileNameCol = findOutputAttr(remainingRowsPlan.output, FILE_NAME_COL)
- val rowPosCol = findOutputAttr(remainingRowsPlan.output, ROW_POS_COL)
-
- val icebergTable = Spark3Util.toIcebergTable(table)
- val distributionMode = Spark3Util.distributionModeFor(icebergTable)
- val planWithDistribution = distributionMode match {
- case DistributionMode.NONE =>
- remainingRowsPlan
- case _ =>
- // apply hash partitioning by file if the distribution mode is hash or range
- val numShufflePartitions = conf.numShufflePartitions
- createRepartitionByExpression(Seq(fileNameCol), remainingRowsPlan, numShufflePartitions)
- }
-
- val order = Seq(createSortOrder(fileNameCol, Ascending), createSortOrder(rowPosCol, Ascending))
- val sort = Sort(order, global = false, planWithDistribution)
- Project(output, sort)
- }
-
- private def isMetadataDelete(relation: DataSourceV2Relation, cond: Expression): Boolean = {
- relation.table match {
- case t: ExtendedSupportsDelete if !SubqueryExpression.hasSubquery(cond) =>
- val predicates = splitConjunctivePredicates(cond)
- val normalizedPredicates = DataSourceStrategy.normalizeExprs(predicates, relation.output)
- val dataSourceFilters = toDataSourceFilters(normalizedPredicates)
- val allPredicatesTranslated = normalizedPredicates.size == dataSourceFilters.length
- allPredicatesTranslated && t.canDeleteWhere(dataSourceFilters)
- case _ => false
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteMergeInto.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteMergeInto.scala
deleted file mode 100644
index a08adfbf59..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteMergeInto.scala
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED
-import org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED_DEFAULT
-import org.apache.iceberg.util.PropertyUtil
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.Alias
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.IsNotNull
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.plans.FullOuter
-import org.apache.spark.sql.catalyst.plans.Inner
-import org.apache.spark.sql.catalyst.plans.LeftAnti
-import org.apache.spark.sql.catalyst.plans.RightOuter
-import org.apache.spark.sql.catalyst.plans.logical.AppendData
-import org.apache.spark.sql.catalyst.plans.logical.DeleteAction
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.InsertAction
-import org.apache.spark.sql.catalyst.plans.logical.Join
-import org.apache.spark.sql.catalyst.plans.logical.JoinHint
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.MergeAction
-import org.apache.spark.sql.catalyst.plans.logical.MergeInto
-import org.apache.spark.sql.catalyst.plans.logical.MergeIntoParams
-import org.apache.spark.sql.catalyst.plans.logical.MergeIntoTable
-import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
-import org.apache.spark.sql.catalyst.plans.logical.UpdateAction
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-import org.apache.spark.sql.catalyst.utils.RewriteRowLevelOperationHelper
-import org.apache.spark.sql.connector.catalog.Table
-import org.apache.spark.sql.connector.iceberg.write.MergeBuilder
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
-import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Implicits
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.BooleanType
-
-case class RewriteMergeInto(spark: SparkSession) extends Rule[LogicalPlan] with RewriteRowLevelOperationHelper {
- import ExtendedDataSourceV2Implicits._
- import RewriteMergeInto._
- import RewriteRowLevelOperationHelper._
-
- override def conf: SQLConf = SQLConf.get
-
- override def apply(plan: LogicalPlan): LogicalPlan = {
- plan transform {
- case MergeIntoTable(target: DataSourceV2Relation, source, cond, matchedActions, notMatchedActions)
- if matchedActions.isEmpty && notMatchedActions.size == 1 && isIcebergRelation(target) =>
-
- val targetTableScan = buildSimpleScanPlan(target, cond)
-
- // NOT MATCHED conditions may only refer to columns in source so we can push them down
- val insertAction = notMatchedActions.head.asInstanceOf[InsertAction]
- val filteredSource = insertAction.condition match {
- case Some(insertCond) => Filter(insertCond, source)
- case None => source
- }
-
- // when there are no matched actions, use a left anti join to remove any matching rows and rewrite to use
- // append instead of replace. only unmatched source rows are passed to the merge and actions are all inserts.
- val joinPlan = Join(filteredSource, targetTableScan, LeftAnti, Some(cond), JoinHint.NONE)
-
- val outputExprs = insertAction.assignments.map(_.value)
- val outputColNames = target.output.map(_.name)
- val outputCols = outputExprs.zip(outputColNames).map { case (expr, name) => createAlias(expr, name) }
- val mergePlan = Project(outputCols, joinPlan)
-
- val writePlan = buildWritePlan(mergePlan, target.table)
-
- AppendData.byPosition(target, writePlan, Map.empty)
-
- case MergeIntoTable(target: DataSourceV2Relation, source, cond, matchedActions, notMatchedActions)
- if matchedActions.isEmpty && isIcebergRelation(target) =>
-
- val targetTableScan = buildSimpleScanPlan(target, cond)
-
- // when there are no matched actions, use a left anti join to remove any matching rows and rewrite to use
- // append instead of replace. only unmatched source rows are passed to the merge and actions are all inserts.
- val joinPlan = Join(source, targetTableScan, LeftAnti, Some(cond), JoinHint.NONE)
-
- val mergeParams = MergeIntoParams(
- isSourceRowPresent = TRUE_LITERAL,
- isTargetRowPresent = FALSE_LITERAL,
- matchedConditions = Nil,
- matchedOutputs = Nil,
- notMatchedConditions = notMatchedActions.map(getClauseCondition),
- notMatchedOutputs = notMatchedActions.map(actionOutput),
- targetOutput = Nil,
- joinedAttributes = joinPlan.output
- )
-
- val mergePlan = MergeInto(mergeParams, target.output, joinPlan)
- val writePlan = buildWritePlan(mergePlan, target.table)
-
- AppendData.byPosition(target, writePlan, Map.empty)
-
- case MergeIntoTable(target: DataSourceV2Relation, source, cond, matchedActions, notMatchedActions)
- if notMatchedActions.isEmpty && isIcebergRelation(target) =>
-
- val mergeBuilder = target.table.asMergeable.newMergeBuilder("merge", newWriteInfo(target.schema))
-
- // rewrite the matched actions to ensure there is always an action to produce the output row
- val (matchedConditions, matchedOutputs) = rewriteMatchedActions(matchedActions, target.output)
-
- // when there are no not-matched actions, use a right outer join to ignore source rows that do not match, but
- // keep all unmatched target rows that must be preserved.
- val sourceTableProj = source.output ++ Seq(createAlias(TRUE_LITERAL, ROW_FROM_SOURCE))
- val newSourceTableScan = Project(sourceTableProj, source)
- val targetTableScan = buildDynamicFilterTargetScan(mergeBuilder, target, source, cond, matchedActions)
- val joinPlan = Join(newSourceTableScan, targetTableScan, RightOuter, Some(cond), JoinHint.NONE)
-
- val mergeParams = MergeIntoParams(
- isSourceRowPresent = IsNotNull(findOutputAttr(joinPlan.output, ROW_FROM_SOURCE)),
- isTargetRowPresent = TRUE_LITERAL,
- matchedConditions = matchedConditions,
- matchedOutputs = matchedOutputs,
- notMatchedConditions = Nil,
- notMatchedOutputs = Nil,
- targetOutput = target.output,
- joinedAttributes = joinPlan.output
- )
- val mergePlan = MergeInto(mergeParams, target.output, joinPlan)
- val writePlan = buildWritePlan(mergePlan, target.table)
- val batchWrite = mergeBuilder.asWriteBuilder.buildForBatch()
-
- ReplaceData(target, batchWrite, writePlan)
-
- case MergeIntoTable(target: DataSourceV2Relation, source, cond, matchedActions, notMatchedActions)
- if isIcebergRelation(target) =>
-
- val mergeBuilder = target.table.asMergeable.newMergeBuilder("merge", newWriteInfo(target.schema))
-
- // rewrite the matched actions to ensure there is always an action to produce the output row
- val (matchedConditions, matchedOutputs) = rewriteMatchedActions(matchedActions, target.output)
-
- // use a full outer join because there are both matched and not matched actions
- val sourceTableProj = source.output ++ Seq(createAlias(TRUE_LITERAL, ROW_FROM_SOURCE))
- val newSourceTableScan = Project(sourceTableProj, source)
- val targetTableScan = buildDynamicFilterTargetScan(mergeBuilder, target, source, cond, matchedActions)
- val targetTableProj = targetTableScan.output ++ Seq(createAlias(TRUE_LITERAL, ROW_FROM_TARGET))
- val newTargetTableScan = Project(targetTableProj, targetTableScan)
- val joinPlan = Join(newSourceTableScan, newTargetTableScan, FullOuter, Some(cond), JoinHint.NONE)
-
- val mergeParams = MergeIntoParams(
- isSourceRowPresent = IsNotNull(findOutputAttr(joinPlan.output, ROW_FROM_SOURCE)),
- isTargetRowPresent = IsNotNull(findOutputAttr(joinPlan.output, ROW_FROM_TARGET)),
- matchedConditions = matchedConditions,
- matchedOutputs = matchedOutputs,
- notMatchedConditions = notMatchedActions.map(getClauseCondition),
- notMatchedOutputs = notMatchedActions.map(actionOutput),
- targetOutput = target.output,
- joinedAttributes = joinPlan.output
- )
- val mergePlan = MergeInto(mergeParams, target.output, joinPlan)
- val writePlan = buildWritePlan(mergePlan, target.table)
- val batchWrite = mergeBuilder.asWriteBuilder.buildForBatch()
-
- ReplaceData(target, batchWrite, writePlan)
- }
- }
-
- private def actionOutput(clause: MergeAction): Option[Seq[Expression]] = {
- clause match {
- case u: UpdateAction =>
- Some(u.assignments.map(_.value))
- case _: DeleteAction =>
- None
- case i: InsertAction =>
- Some(i.assignments.map(_.value))
- }
- }
-
- private def getClauseCondition(clause: MergeAction): Expression = {
- clause.condition.getOrElse(TRUE_LITERAL)
- }
-
- private def buildDynamicFilterTargetScan(
- mergeBuilder: MergeBuilder,
- target: DataSourceV2Relation,
- source: LogicalPlan,
- cond: Expression,
- matchedActions: Seq[MergeAction]): LogicalPlan = {
- // Construct the plan to prune target based on join condition between source and target.
- val table = target.table
- val output = target.output
- val matchingRowsPlanBuilder = rel => Join(source, rel, Inner, Some(cond), JoinHint.NONE)
- val runCardinalityCheck = isCardinalityCheckEnabled(table) && isCardinalityCheckNeeded(matchedActions)
- buildDynamicFilterScanPlan(spark, target, output, mergeBuilder, cond, matchingRowsPlanBuilder, runCardinalityCheck)
- }
-
- private def rewriteMatchedActions(
- matchedActions: Seq[MergeAction],
- targetOutput: Seq[Expression]): (Seq[Expression], Seq[Option[Seq[Expression]]]) = {
- val startMatchedConditions = matchedActions.map(getClauseCondition)
- val catchAllIndex = startMatchedConditions.indexWhere {
- case Literal(true, BooleanType) =>
- true
- case _ =>
- false
- }
-
- val outputs = matchedActions.map(actionOutput)
- if (catchAllIndex < 0) {
- // all of the actions have non-trivial conditions. add an action to emit the target row if no action matches
- (startMatchedConditions :+ TRUE_LITERAL, outputs :+ Some(targetOutput))
- } else {
- // one "catch all" action will always match, prune the actions after it
- (startMatchedConditions.take(catchAllIndex + 1), outputs.take(catchAllIndex + 1))
- }
- }
-
- private def isCardinalityCheckEnabled(table: Table): Boolean = {
- PropertyUtil.propertyAsBoolean(
- table.properties(),
- MERGE_CARDINALITY_CHECK_ENABLED,
- MERGE_CARDINALITY_CHECK_ENABLED_DEFAULT)
- }
-
- private def isCardinalityCheckNeeded(actions: Seq[MergeAction]): Boolean = {
- def hasUnconditionalDelete(action: Option[MergeAction]): Boolean = {
- action match {
- case Some(DeleteAction(None)) => true
- case _ => false
- }
- }
- !(actions.size == 1 && hasUnconditionalDelete(actions.headOption))
- }
-}
-
-object RewriteMergeInto {
- private final val ROW_FROM_SOURCE = "_row_from_source_"
- private final val ROW_FROM_TARGET = "_row_from_target_"
- private final val TRUE_LITERAL = Literal(true, BooleanType)
- private final val FALSE_LITERAL = Literal(false, BooleanType)
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteUpdate.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteUpdate.scala
deleted file mode 100644
index de5984cbcc..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteUpdate.scala
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.Alias
-import org.apache.spark.sql.catalyst.expressions.EqualNullSafe
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.If
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.expressions.Not
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.Assignment
-import org.apache.spark.sql.catalyst.plans.logical.DynamicFileFilter
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
-import org.apache.spark.sql.catalyst.plans.logical.Union
-import org.apache.spark.sql.catalyst.plans.logical.UpdateTable
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.utils.PlanUtils.isIcebergRelation
-import org.apache.spark.sql.catalyst.utils.RewriteRowLevelOperationHelper
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
-import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Implicits
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.BooleanType
-
-case class RewriteUpdate(spark: SparkSession) extends Rule[LogicalPlan] with RewriteRowLevelOperationHelper {
-
- import ExtendedDataSourceV2Implicits._
- import RewriteRowLevelOperationHelper._
-
- override def conf: SQLConf = SQLConf.get
-
- // TODO: can we do any better for no-op updates? when conditions evaluate to false/true?
- override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
- case UpdateTable(r: DataSourceV2Relation, assignments, Some(cond))
- if isIcebergRelation(r) && SubqueryExpression.hasSubquery(cond) =>
-
- val writeInfo = newWriteInfo(r.schema)
- val mergeBuilder = r.table.asMergeable.newMergeBuilder("update", writeInfo)
-
- // since we are processing matched and not matched rows using separate jobs
- // there will be two scans but we want to execute the dynamic file filter only once
- // so the first job uses DynamicFileFilter and the second one uses the underlying scan plan
- // both jobs share the same SparkMergeScan instance to ensure they operate on same files
- val matchingRowsPlanBuilder = scanRelation => Filter(cond, scanRelation)
- val scanPlan = buildDynamicFilterScanPlan(spark, r, r.output, mergeBuilder, cond, matchingRowsPlanBuilder)
- val underlyingScanPlan = scanPlan match {
- case DynamicFileFilter(plan, _, _) => plan.clone()
- case _ => scanPlan.clone()
- }
-
- // build a plan for records that match the cond and should be updated
- val matchedRowsPlan = Filter(cond, scanPlan)
- val updatedRowsPlan = buildUpdateProjection(r, matchedRowsPlan, assignments)
-
- // build a plan for records that did not match the cond but had to be copied over
- val remainingRowFilter = Not(EqualNullSafe(cond, Literal(true, BooleanType)))
- val remainingRowsPlan = Filter(remainingRowFilter, Project(r.output, underlyingScanPlan))
-
- // new state is a union of updated and copied over records
- val updatePlan = Union(updatedRowsPlan, remainingRowsPlan)
-
- val mergeWrite = mergeBuilder.asWriteBuilder.buildForBatch()
- val writePlan = buildWritePlan(updatePlan, r.table)
- ReplaceData(r, mergeWrite, writePlan)
-
- case UpdateTable(r: DataSourceV2Relation, assignments, Some(cond)) if isIcebergRelation(r) =>
- val writeInfo = newWriteInfo(r.schema)
- val mergeBuilder = r.table.asMergeable.newMergeBuilder("update", writeInfo)
-
- val matchingRowsPlanBuilder = scanRelation => Filter(cond, scanRelation)
- val scanPlan = buildDynamicFilterScanPlan(spark, r, r.output, mergeBuilder, cond, matchingRowsPlanBuilder)
-
- val updateProjection = buildUpdateProjection(r, scanPlan, assignments, cond)
-
- val mergeWrite = mergeBuilder.asWriteBuilder.buildForBatch()
- val writePlan = buildWritePlan(updateProjection, r.table)
- ReplaceData(r, mergeWrite, writePlan)
- }
-
- private def buildUpdateProjection(
- relation: DataSourceV2Relation,
- scanPlan: LogicalPlan,
- assignments: Seq[Assignment],
- cond: Expression = Literal.TrueLiteral): LogicalPlan = {
-
- // this method relies on the fact that the assignments have been aligned before
- require(relation.output.size == assignments.size, "assignments must be aligned")
-
- // Spark is going to execute the condition for each column but it seems we cannot avoid this
- val assignedExprs = assignments.map(_.value)
- val updatedExprs = assignedExprs.zip(relation.output).map { case (assignedExpr, attr) =>
- // use semanticEquals to avoid unnecessary if expressions as we may run after operator optimization
- if (attr.semanticEquals(assignedExpr)) {
- attr
- } else if (cond == Literal.TrueLiteral) {
- createAlias(assignedExpr, attr.name)
- } else {
- val updatedExpr = If(cond, assignedExpr, attr)
- createAlias(updatedExpr, attr.name)
- }
- }
-
- Project(updatedExprs, scanPlan)
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala
deleted file mode 100644
index 6b6066773d..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.parser.extensions
-
-import java.util.Locale
-import org.antlr.v4.runtime._
-import org.antlr.v4.runtime.atn.PredictionMode
-import org.antlr.v4.runtime.misc.Interval
-import org.antlr.v4.runtime.misc.ParseCancellationException
-import org.antlr.v4.runtime.tree.TerminalNodeImpl
-import org.apache.iceberg.common.DynConstructors
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.FunctionIdentifier
-import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.parser.ParserInterface
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser.NonReservedContext
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser.QuotedIdentifierContext
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.trees.Origin
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.VariableSubstitution
-import org.apache.spark.sql.types.DataType
-import org.apache.spark.sql.types.StructType
-
-class IcebergSparkSqlExtensionsParser(delegate: ParserInterface) extends ParserInterface {
-
- import IcebergSparkSqlExtensionsParser._
-
- private lazy val substitutor = substitutorCtor.newInstance(SQLConf.get)
- private lazy val astBuilder = new IcebergSqlExtensionsAstBuilder(delegate)
-
- /**
- * Parse a string to a DataType.
- */
- override def parseDataType(sqlText: String): DataType = {
- delegate.parseDataType(sqlText)
- }
-
- /**
- * Parse a string to a raw DataType without CHAR/VARCHAR replacement.
- */
- def parseRawDataType(sqlText: String): DataType = throw new UnsupportedOperationException()
-
- /**
- * Parse a string to an Expression.
- */
- override def parseExpression(sqlText: String): Expression = {
- delegate.parseExpression(sqlText)
- }
-
- /**
- * Parse a string to a TableIdentifier.
- */
- override def parseTableIdentifier(sqlText: String): TableIdentifier = {
- delegate.parseTableIdentifier(sqlText)
- }
-
- /**
- * Parse a string to a FunctionIdentifier.
- */
- override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = {
- delegate.parseFunctionIdentifier(sqlText)
- }
-
- /**
- * Parse a string to a multi-part identifier.
- */
- override def parseMultipartIdentifier(sqlText: String): Seq[String] = {
- delegate.parseMultipartIdentifier(sqlText)
- }
-
- /**
- * Creates StructType for a given SQL string, which is a comma separated list of field
- * definitions which will preserve the correct Hive metadata.
- */
- override def parseTableSchema(sqlText: String): StructType = {
- delegate.parseTableSchema(sqlText)
- }
-
- /**
- * Parse a string to a LogicalPlan.
- */
- override def parsePlan(sqlText: String): LogicalPlan = {
- val sqlTextAfterSubstitution = substitutor.substitute(sqlText)
- if (isIcebergCommand(sqlTextAfterSubstitution)) {
- parse(sqlTextAfterSubstitution) { parser => astBuilder.visit(parser.singleStatement()) }.asInstanceOf[LogicalPlan]
- } else {
- delegate.parsePlan(sqlText)
- }
- }
-
- private def isIcebergCommand(sqlText: String): Boolean = {
- val normalized = sqlText.toLowerCase(Locale.ROOT).trim()
- // Strip simple SQL comments that terminate a line, e.g. comments starting with `--`
- .replaceAll("--.*?\\n", " ")
- // Strip newlines.
- .replaceAll("\\s+", " ")
- // Strip comments of the form /* ... */. This must come after stripping newlines so that
- // comments that span multiple lines are caught.
- .replaceAll("/\\*.*?\\*/", " ")
- .trim()
- normalized.startsWith("call") || (
- normalized.startsWith("alter table") && (
- normalized.contains("add partition field") ||
- normalized.contains("drop partition field") ||
- normalized.contains("replace partition field") ||
- normalized.contains("write ordered by") ||
- normalized.contains("write locally ordered by") ||
- normalized.contains("write distributed by") ||
- normalized.contains("write unordered") ||
- normalized.contains("set identifier fields") ||
- normalized.contains("drop identifier fields")))
- }
-
- protected def parse[T](command: String)(toResult: IcebergSqlExtensionsParser => T): T = {
- val lexer = new IcebergSqlExtensionsLexer(new UpperCaseCharStream(CharStreams.fromString(command)))
- lexer.removeErrorListeners()
- lexer.addErrorListener(IcebergParseErrorListener)
-
- val tokenStream = new CommonTokenStream(lexer)
- val parser = new IcebergSqlExtensionsParser(tokenStream)
- parser.addParseListener(IcebergSqlExtensionsPostProcessor)
- parser.removeErrorListeners()
- parser.addErrorListener(IcebergParseErrorListener)
-
- try {
- try {
- // first, try parsing with potentially faster SLL mode
- parser.getInterpreter.setPredictionMode(PredictionMode.SLL)
- toResult(parser)
- }
- catch {
- case _: ParseCancellationException =>
- // if we fail, parse with LL mode
- tokenStream.seek(0) // rewind input stream
- parser.reset()
-
- // Try Again.
- parser.getInterpreter.setPredictionMode(PredictionMode.LL)
- toResult(parser)
- }
- }
- catch {
- case e: IcebergParseException if e.command.isDefined =>
- throw e
- case e: IcebergParseException =>
- throw e.withCommand(command)
- case e: AnalysisException =>
- val position = Origin(e.line, e.startPosition)
- throw new IcebergParseException(Option(command), e.message, position, position)
- }
- }
-}
-
-object IcebergSparkSqlExtensionsParser {
- private val substitutorCtor: DynConstructors.Ctor[VariableSubstitution] =
- DynConstructors.builder()
- .impl(classOf[VariableSubstitution])
- .impl(classOf[VariableSubstitution], classOf[SQLConf])
- .build()
-}
-
-/* Copied from Apache Spark's to avoid dependency on Spark Internals */
-class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream {
- override def consume(): Unit = wrapped.consume
- override def getSourceName(): String = wrapped.getSourceName
- override def index(): Int = wrapped.index
- override def mark(): Int = wrapped.mark
- override def release(marker: Int): Unit = wrapped.release(marker)
- override def seek(where: Int): Unit = wrapped.seek(where)
- override def size(): Int = wrapped.size
-
- override def getText(interval: Interval): String = wrapped.getText(interval)
-
- // scalastyle:off
- override def LA(i: Int): Int = {
- val la = wrapped.LA(i)
- if (la == 0 || la == IntStream.EOF) la
- else Character.toUpperCase(la)
- }
- // scalastyle:on
-}
-
-/**
- * The post-processor validates & cleans-up the parse tree during the parse process.
- */
-case object IcebergSqlExtensionsPostProcessor extends IcebergSqlExtensionsBaseListener {
-
- /** Remove the back ticks from an Identifier. */
- override def exitQuotedIdentifier(ctx: QuotedIdentifierContext): Unit = {
- replaceTokenByIdentifier(ctx, 1) { token =>
- // Remove the double back ticks in the string.
- token.setText(token.getText.replace("``", "`"))
- token
- }
- }
-
- /** Treat non-reserved keywords as Identifiers. */
- override def exitNonReserved(ctx: NonReservedContext): Unit = {
- replaceTokenByIdentifier(ctx, 0)(identity)
- }
-
- private def replaceTokenByIdentifier(
- ctx: ParserRuleContext,
- stripMargins: Int)(
- f: CommonToken => CommonToken = identity): Unit = {
- val parent = ctx.getParent
- parent.removeLastChild()
- val token = ctx.getChild(0).getPayload.asInstanceOf[Token]
- val newToken = new CommonToken(
- new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream),
- IcebergSqlExtensionsParser.IDENTIFIER,
- token.getChannel,
- token.getStartIndex + stripMargins,
- token.getStopIndex - stripMargins)
- parent.addChild(new TerminalNodeImpl(f(newToken)))
- }
-}
-
-/* Partially copied from Apache Spark's Parser to avoid dependency on Spark Internals */
-case object IcebergParseErrorListener extends BaseErrorListener {
- override def syntaxError(
- recognizer: Recognizer[_, _],
- offendingSymbol: scala.Any,
- line: Int,
- charPositionInLine: Int,
- msg: String,
- e: RecognitionException): Unit = {
- val (start, stop) = offendingSymbol match {
- case token: CommonToken =>
- val start = Origin(Some(line), Some(token.getCharPositionInLine))
- val length = token.getStopIndex - token.getStartIndex + 1
- val stop = Origin(Some(line), Some(token.getCharPositionInLine + length))
- (start, stop)
- case _ =>
- val start = Origin(Some(line), Some(charPositionInLine))
- (start, start)
- }
- throw new IcebergParseException(None, msg, start, stop)
- }
-}
-
-/**
- * Copied from Apache Spark
- * A [[ParseException]] is an [[AnalysisException]] that is thrown during the parse process. It
- * contains fields and an extended error message that make reporting and diagnosing errors easier.
- */
-class IcebergParseException(
- val command: Option[String],
- message: String,
- val start: Origin,
- val stop: Origin) extends AnalysisException(message, start.line, start.startPosition) {
-
- def this(message: String, ctx: ParserRuleContext) = {
- this(Option(IcebergParserUtils.command(ctx)),
- message,
- IcebergParserUtils.position(ctx.getStart),
- IcebergParserUtils.position(ctx.getStop))
- }
-
- override def getMessage: String = {
- val builder = new StringBuilder
- builder ++= "\n" ++= message
- start match {
- case Origin(Some(l), Some(p)) =>
- builder ++= s"(line $l, pos $p)\n"
- command.foreach { cmd =>
- val (above, below) = cmd.split("\n").splitAt(l)
- builder ++= "\n== SQL ==\n"
- above.foreach(builder ++= _ += '\n')
- builder ++= (0 until p).map(_ => "-").mkString("") ++= "^^^\n"
- below.foreach(builder ++= _ += '\n')
- }
- case _ =>
- command.foreach { cmd =>
- builder ++= "\n== SQL ==\n" ++= cmd
- }
- }
- builder.toString
- }
-
- def withCommand(cmd: String): IcebergParseException = {
- new IcebergParseException(Option(cmd), message, start, stop)
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala
deleted file mode 100644
index 678da9bfc3..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.parser.extensions
-
-import org.antlr.v4.runtime._
-import org.antlr.v4.runtime.misc.Interval
-import org.antlr.v4.runtime.tree.ParseTree
-import org.antlr.v4.runtime.tree.TerminalNode
-import org.apache.iceberg.DistributionMode
-import org.apache.iceberg.NullOrder
-import org.apache.iceberg.SortDirection
-import org.apache.iceberg.expressions.Term
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.parser.ParserInterface
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergParserUtils.withOrigin
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser._
-import org.apache.spark.sql.catalyst.plans.logical.AddPartitionField
-import org.apache.spark.sql.catalyst.plans.logical.CallArgument
-import org.apache.spark.sql.catalyst.plans.logical.CallStatement
-import org.apache.spark.sql.catalyst.plans.logical.DropIdentifierFields
-import org.apache.spark.sql.catalyst.plans.logical.DropPartitionField
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.NamedArgument
-import org.apache.spark.sql.catalyst.plans.logical.PositionalArgument
-import org.apache.spark.sql.catalyst.plans.logical.ReplacePartitionField
-import org.apache.spark.sql.catalyst.plans.logical.SetIdentifierFields
-import org.apache.spark.sql.catalyst.plans.logical.SetWriteDistributionAndOrdering
-import org.apache.spark.sql.catalyst.trees.CurrentOrigin
-import org.apache.spark.sql.catalyst.trees.Origin
-import org.apache.spark.sql.connector.expressions
-import org.apache.spark.sql.connector.expressions.ApplyTransform
-import org.apache.spark.sql.connector.expressions.FieldReference
-import org.apache.spark.sql.connector.expressions.IdentityTransform
-import org.apache.spark.sql.connector.expressions.LiteralValue
-import org.apache.spark.sql.connector.expressions.Transform
-import scala.collection.JavaConverters._
-
-class IcebergSqlExtensionsAstBuilder(delegate: ParserInterface) extends IcebergSqlExtensionsBaseVisitor[AnyRef] {
-
- /**
- * Create a [[CallStatement]] for a stored procedure call.
- */
- override def visitCall(ctx: CallContext): CallStatement = withOrigin(ctx) {
- val name = ctx.multipartIdentifier.parts.asScala.map(_.getText)
- val args = ctx.callArgument.asScala.map(typedVisit[CallArgument])
- CallStatement(name, args)
- }
-
- /**
- * Create an ADD PARTITION FIELD logical command.
- */
- override def visitAddPartitionField(ctx: AddPartitionFieldContext): AddPartitionField = withOrigin(ctx) {
- AddPartitionField(
- typedVisit[Seq[String]](ctx.multipartIdentifier),
- typedVisit[Transform](ctx.transform),
- Option(ctx.name).map(_.getText))
- }
-
- /**
- * Create a DROP PARTITION FIELD logical command.
- */
- override def visitDropPartitionField(ctx: DropPartitionFieldContext): DropPartitionField = withOrigin(ctx) {
- DropPartitionField(
- typedVisit[Seq[String]](ctx.multipartIdentifier),
- typedVisit[Transform](ctx.transform))
- }
-
-
- /**
- * Create an REPLACE PARTITION FIELD logical command.
- */
- override def visitReplacePartitionField(ctx: ReplacePartitionFieldContext): ReplacePartitionField = withOrigin(ctx) {
- ReplacePartitionField(
- typedVisit[Seq[String]](ctx.multipartIdentifier),
- typedVisit[Transform](ctx.transform(0)),
- typedVisit[Transform](ctx.transform(1)),
- Option(ctx.name).map(_.getText))
- }
-
- /**
- * Create an SET IDENTIFIER FIELDS logical command.
- */
- override def visitSetIdentifierFields(ctx: SetIdentifierFieldsContext): SetIdentifierFields = withOrigin(ctx) {
- SetIdentifierFields(
- typedVisit[Seq[String]](ctx.multipartIdentifier),
- ctx.fieldList.fields.asScala.map(_.getText))
- }
-
- /**
- * Create an DROP IDENTIFIER FIELDS logical command.
- */
- override def visitDropIdentifierFields(ctx: DropIdentifierFieldsContext): DropIdentifierFields = withOrigin(ctx) {
- DropIdentifierFields(
- typedVisit[Seq[String]](ctx.multipartIdentifier),
- ctx.fieldList.fields.asScala.map(_.getText))
- }
-
- /**
- * Create a [[SetWriteDistributionAndOrdering]] for changing the write distribution and ordering.
- */
- override def visitSetWriteDistributionAndOrdering(
- ctx: SetWriteDistributionAndOrderingContext): SetWriteDistributionAndOrdering = {
-
- val tableName = typedVisit[Seq[String]](ctx.multipartIdentifier)
-
- val (distributionSpec, orderingSpec) = toDistributionAndOrderingSpec(ctx.writeSpec)
-
- if (distributionSpec == null && orderingSpec == null) {
- throw new AnalysisException(
- "ALTER TABLE has no changes: missing both distribution and ordering clauses")
- }
-
- val distributionMode = if (distributionSpec != null) {
- DistributionMode.HASH
- } else if (orderingSpec.UNORDERED != null || orderingSpec.LOCALLY != null) {
- DistributionMode.NONE
- } else {
- DistributionMode.RANGE
- }
-
- val ordering = if (orderingSpec != null && orderingSpec.order != null) {
- orderingSpec.order.fields.asScala.map(typedVisit[(Term, SortDirection, NullOrder)])
- } else {
- Seq.empty
- }
-
- SetWriteDistributionAndOrdering(tableName, distributionMode, ordering)
- }
-
- private def toDistributionAndOrderingSpec(
- writeSpec: WriteSpecContext): (WriteDistributionSpecContext, WriteOrderingSpecContext) = {
-
- if (writeSpec.writeDistributionSpec.size > 1) {
- throw new AnalysisException("ALTER TABLE contains multiple distribution clauses")
- }
-
- if (writeSpec.writeOrderingSpec.size > 1) {
- throw new AnalysisException("ALTER TABLE contains multiple ordering clauses")
- }
-
- val distributionSpec = writeSpec.writeDistributionSpec.asScala.headOption.orNull
- val orderingSpec = writeSpec.writeOrderingSpec.asScala.headOption.orNull
-
- (distributionSpec, orderingSpec)
- }
-
- /**
- * Create an order field.
- */
- override def visitOrderField(ctx: OrderFieldContext): (Term, SortDirection, NullOrder) = {
- val term = Spark3Util.toIcebergTerm(typedVisit[Transform](ctx.transform))
- val direction = Option(ctx.ASC).map(_ => SortDirection.ASC)
- .orElse(Option(ctx.DESC).map(_ => SortDirection.DESC))
- .getOrElse(SortDirection.ASC)
- val nullOrder = Option(ctx.FIRST).map(_ => NullOrder.NULLS_FIRST)
- .orElse(Option(ctx.LAST).map(_ => NullOrder.NULLS_LAST))
- .getOrElse(if (direction == SortDirection.ASC) NullOrder.NULLS_FIRST else NullOrder.NULLS_LAST)
- (term, direction, nullOrder)
- }
-
- /**
- * Create an IdentityTransform for a column reference.
- */
- override def visitIdentityTransform(ctx: IdentityTransformContext): Transform = withOrigin(ctx) {
- IdentityTransform(FieldReference(typedVisit[Seq[String]](ctx.multipartIdentifier())))
- }
-
- /**
- * Create a named Transform from argument expressions.
- */
- override def visitApplyTransform(ctx: ApplyTransformContext): Transform = withOrigin(ctx) {
- val args = ctx.arguments.asScala.map(typedVisit[expressions.Expression])
- ApplyTransform(ctx.transformName.getText, args)
- }
-
- /**
- * Create a transform argument from a column reference or a constant.
- */
- override def visitTransformArgument(ctx: TransformArgumentContext): expressions.Expression = withOrigin(ctx) {
- val reference = Option(ctx.multipartIdentifier())
- .map(typedVisit[Seq[String]])
- .map(FieldReference(_))
- val literal = Option(ctx.constant)
- .map(visitConstant)
- .map(lit => LiteralValue(lit.value, lit.dataType))
- reference.orElse(literal)
- .getOrElse(throw new IcebergParseException(s"Invalid transform argument", ctx))
- }
-
- /**
- * Return a multi-part identifier as Seq[String].
- */
- override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = withOrigin(ctx) {
- ctx.parts.asScala.map(_.getText)
- }
-
- /**
- * Create a positional argument in a stored procedure call.
- */
- override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = withOrigin(ctx) {
- val expr = typedVisit[Expression](ctx.expression)
- PositionalArgument(expr)
- }
-
- /**
- * Create a named argument in a stored procedure call.
- */
- override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = withOrigin(ctx) {
- val name = ctx.identifier.getText
- val expr = typedVisit[Expression](ctx.expression)
- NamedArgument(name, expr)
- }
-
- override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) {
- visit(ctx.statement).asInstanceOf[LogicalPlan]
- }
-
- def visitConstant(ctx: ConstantContext): Literal = {
- delegate.parseExpression(ctx.getText).asInstanceOf[Literal]
- }
-
- override def visitExpression(ctx: ExpressionContext): Expression = {
- // reconstruct the SQL string and parse it using the main Spark parser
- // while we can avoid the logic to build Spark expressions, we still have to parse them
- // we cannot call ctx.getText directly since it will not render spaces correctly
- // that's why we need to recurse down the tree in reconstructSqlString
- val sqlString = reconstructSqlString(ctx)
- delegate.parseExpression(sqlString)
- }
-
- private def reconstructSqlString(ctx: ParserRuleContext): String = {
- ctx.children.asScala.map {
- case c: ParserRuleContext => reconstructSqlString(c)
- case t: TerminalNode => t.getText
- }.mkString(" ")
- }
-
- private def typedVisit[T](ctx: ParseTree): T = {
- ctx.accept(this).asInstanceOf[T]
- }
-}
-
-/* Partially copied from Apache Spark's Parser to avoid dependency on Spark Internals */
-object IcebergParserUtils {
-
- private[sql] def withOrigin[T](ctx: ParserRuleContext)(f: => T): T = {
- val current = CurrentOrigin.get
- CurrentOrigin.set(position(ctx.getStart))
- try {
- f
- } finally {
- CurrentOrigin.set(current)
- }
- }
-
- private[sql] def position(token: Token): Origin = {
- val opt = Option(token)
- Origin(opt.map(_.getLine), opt.map(_.getCharPositionInLine))
- }
-
- /** Get the command which created the token. */
- private[sql] def command(ctx: ParserRuleContext): String = {
- val stream = ctx.getStart.getInputStream
- stream.getText(Interval.of(0, stream.size() - 1))
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala
deleted file mode 100644
index ae5a2391aa..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class AddPartitionField(table: Seq[String], transform: Transform, name: Option[String]) extends Command {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override def simpleString(maxFields: Int): String = {
- s"AddPartitionField ${table.quoted} ${name.map(n => s"$n=").getOrElse("")}${transform.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala
deleted file mode 100644
index 033d3f5670..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.util.truncatedString
-import org.apache.spark.sql.connector.iceberg.catalog.Procedure
-import scala.collection.Seq
-
-case class Call(procedure: Procedure, args: Seq[Expression]) extends Command {
- override lazy val output: Seq[Attribute] = procedure.outputType.toAttributes
-
- override def simpleString(maxFields: Int): String = {
- s"Call${truncatedString(output, "[", ", ", "]", maxFields)} ${procedure.description}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala
deleted file mode 100644
index 115af1586d..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-
-case class DropIdentifierFields(
- table: Seq[String],
- fields: Seq[String]) extends Command {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override def simpleString(maxFields: Int): String = {
- s"DropIdentifierFields ${table.quoted} (${fields.quoted})"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala
deleted file mode 100644
index 5fe8f92b1b..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class DropPartitionField(table: Seq[String], transform: Transform) extends Command {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override def simpleString(maxFields: Int): String = {
- s"DropPartitionField ${table.quoted} ${transform.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DynamicFileFilter.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DynamicFileFilter.scala
deleted file mode 100644
index be7059698c..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DynamicFileFilter.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.AttributeSet
-import org.apache.spark.sql.catalyst.util.truncatedString
-import org.apache.spark.sql.catalyst.utils.SetAccumulator
-import org.apache.spark.sql.connector.iceberg.read.SupportsFileFilter
-
-// TODO: fix stats (ignore the fact it is a binary node and report only scanRelation stats)
-case class DynamicFileFilter(
- scanPlan: LogicalPlan,
- fileFilterPlan: LogicalPlan,
- filterable: SupportsFileFilter) extends BinaryNode {
-
- @transient
- override lazy val references: AttributeSet = AttributeSet(fileFilterPlan.output)
-
- override def left: LogicalPlan = scanPlan
- override def right: LogicalPlan = fileFilterPlan
- override def output: Seq[Attribute] = scanPlan.output
-
- override def simpleString(maxFields: Int): String = {
- s"DynamicFileFilter${truncatedString(output, "[", ", ", "]", maxFields)}"
- }
-}
-
-case class DynamicFileFilterWithCardinalityCheck(
- scanPlan: LogicalPlan,
- fileFilterPlan: LogicalPlan,
- filterable: SupportsFileFilter,
- filesAccumulator: SetAccumulator[String]) extends BinaryNode {
-
- @transient
- override lazy val references: AttributeSet = AttributeSet(fileFilterPlan.output)
-
- override def left: LogicalPlan = scanPlan
- override def right: LogicalPlan = fileFilterPlan
- override def output: Seq[Attribute] = scanPlan.output
-
- override def simpleString(maxFields: Int): String = {
- s"DynamicFileFilterWithCardinalityCheck${truncatedString(output, "[", ", ", "]", maxFields)}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/MergeInto.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/MergeInto.scala
deleted file mode 100644
index 04c3236bc1..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/MergeInto.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.Expression
-
-case class MergeInto(
- mergeIntoProcessor: MergeIntoParams,
- output: Seq[Attribute],
- child: LogicalPlan) extends UnaryNode
-
-case class MergeIntoParams(
- isSourceRowPresent: Expression,
- isTargetRowPresent: Expression,
- matchedConditions: Seq[Expression],
- matchedOutputs: Seq[Option[Seq[Expression]]],
- notMatchedConditions: Seq[Expression],
- notMatchedOutputs: Seq[Option[Seq[Expression]]],
- targetOutput: Seq[Expression],
- joinedAttributes: Seq[Attribute]) extends Serializable
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplaceData.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplaceData.scala
deleted file mode 100644
index 16fd559c05..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplaceData.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.analysis.NamedRelation
-import org.apache.spark.sql.connector.write.BatchWrite
-
-case class ReplaceData(
- table: NamedRelation,
- write: BatchWrite,
- query: LogicalPlan) extends V2WriteCommand {
-
- def isByName: Boolean = false
-
- def withNewQuery(newQuery: LogicalPlan): ReplaceData = copy(query = newQuery)
-
- def withNewTable(newTable: NamedRelation): ReplaceData = copy(table = newTable)
-
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala
deleted file mode 100644
index 3ad8c59bfc..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class ReplacePartitionField(
- table: Seq[String],
- transformFrom: Transform,
- transformTo: Transform,
- name: Option[String]) extends Command {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override def simpleString(maxFields: Int): String = {
- s"ReplacePartitionField ${table.quoted} ${transformFrom.describe} " +
- s"with ${name.map(n => s"$n=").getOrElse("")}${transformTo.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala
deleted file mode 100644
index 2e9a34b872..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class SetIdentifierFields(
- table: Seq[String],
- fields: Seq[String]) extends Command {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override def simpleString(maxFields: Int): String = {
- s"SetIdentifierFields ${table.quoted} (${fields.quoted})"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala
deleted file mode 100644
index 5842b094be..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans.logical
-
-import org.apache.spark.sql.catalyst.expressions.Expression
-
-/**
- * A CALL statement, as parsed from SQL.
- */
-case class CallStatement(name: Seq[String], args: Seq[CallArgument]) extends ParsedStatement
-
-/**
- * An argument in a CALL statement.
- */
-sealed trait CallArgument {
- def expr: Expression
-}
-
-/**
- * An argument in a CALL statement identified by name.
- */
-case class NamedArgument(name: String, expr: Expression) extends CallArgument
-
-/**
- * An argument in a CALL statement identified by position.
- */
-case class PositionalArgument(expr: Expression) extends CallArgument
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/RewriteRowLevelOperationHelper.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/RewriteRowLevelOperationHelper.scala
deleted file mode 100644
index 910e9ce8c2..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/RewriteRowLevelOperationHelper.scala
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.spark.sql.catalyst.utils
-
-import java.util.UUID
-import org.apache.iceberg.common.DynConstructors
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.iceberg.spark.Spark3VersionUtil
-import org.apache.spark.internal.Logging
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.analysis.Resolver
-import org.apache.spark.sql.catalyst.expressions.AccumulateFiles
-import org.apache.spark.sql.catalyst.expressions.Alias
-import org.apache.spark.sql.catalyst.expressions.And
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import org.apache.spark.sql.catalyst.expressions.AttributeSet
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.ExprId
-import org.apache.spark.sql.catalyst.expressions.GreaterThan
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.catalyst.expressions.NamedExpression
-import org.apache.spark.sql.catalyst.expressions.PredicateHelper
-import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
-import org.apache.spark.sql.catalyst.expressions.aggregate.Complete
-import org.apache.spark.sql.catalyst.expressions.aggregate.Sum
-import org.apache.spark.sql.catalyst.plans.logical.Aggregate
-import org.apache.spark.sql.catalyst.plans.logical.DynamicFileFilter
-import org.apache.spark.sql.catalyst.plans.logical.DynamicFileFilterWithCardinalityCheck
-import org.apache.spark.sql.catalyst.plans.logical.Filter
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.catalyst.plans.logical.Repartition
-import org.apache.spark.sql.connector.catalog.Table
-import org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution
-import org.apache.spark.sql.connector.iceberg.read.SupportsFileFilter
-import org.apache.spark.sql.connector.iceberg.write.MergeBuilder
-import org.apache.spark.sql.connector.read.Scan
-import org.apache.spark.sql.connector.read.ScanBuilder
-import org.apache.spark.sql.connector.write.LogicalWriteInfo
-import org.apache.spark.sql.connector.write.LogicalWriteInfoImpl
-import org.apache.spark.sql.execution.datasources.DataSourceStrategy
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
-import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Implicits
-import org.apache.spark.sql.execution.datasources.v2.PushDownUtils
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.sources
-import org.apache.spark.sql.types.Metadata
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.util.CaseInsensitiveStringMap
-
-trait RewriteRowLevelOperationHelper extends PredicateHelper with Logging {
-
- import DataSourceV2Implicits._
- import RewriteRowLevelOperationHelper._
- import ExtendedDataSourceV2Implicits.ScanBuilderHelper
-
- protected def spark: SparkSession
- def conf: SQLConf
- protected lazy val resolver: Resolver = conf.resolver
-
- protected def buildSimpleScanPlan(
- relation: DataSourceV2Relation,
- cond: Expression): LogicalPlan = {
-
- val scanBuilder = relation.table.asReadable.newScanBuilder(relation.options)
-
- pushFilters(scanBuilder, cond, relation.output)
-
- val scan = scanBuilder.asIceberg.withMetadataColumns(FILE_NAME_COL, ROW_POS_COL).build()
- val outputAttrs = toOutputAttrs(scan.readSchema(), relation.output)
- val predicates = extractFilters(cond, relation.output).reduceLeftOption(And)
- val scanRelation = createScanRelation(relation, scan, outputAttrs)
-
- predicates.map(Filter(_, scanRelation)).getOrElse(scanRelation)
- }
-
- protected def buildDynamicFilterScanPlan(
- spark: SparkSession,
- relation: DataSourceV2Relation,
- tableAttrs: Seq[AttributeReference],
- mergeBuilder: MergeBuilder,
- cond: Expression,
- matchingRowsPlanBuilder: DataSourceV2ScanRelation => LogicalPlan,
- runCardinalityCheck: Boolean = false): LogicalPlan = {
-
- val scanBuilder = mergeBuilder.asScanBuilder
-
- pushFilters(scanBuilder, cond, tableAttrs)
-
- val scan = scanBuilder.asIceberg.withMetadataColumns(FILE_NAME_COL, ROW_POS_COL).build()
- val outputAttrs = toOutputAttrs(scan.readSchema(), tableAttrs)
- val scanRelation = createScanRelation(relation, scan, outputAttrs)
-
- scan match {
- case filterable: SupportsFileFilter if runCardinalityCheck =>
- val affectedFilesAcc = new SetAccumulator[String]()
- spark.sparkContext.register(affectedFilesAcc, AFFECTED_FILES_ACC_NAME)
-
- val matchingRowsPlan = matchingRowsPlanBuilder(scanRelation)
- val matchingFilesPlan = buildFileFilterPlan(affectedFilesAcc, matchingRowsPlan)
-
- DynamicFileFilterWithCardinalityCheck(
- scanRelation,
- matchingFilesPlan,
- filterable,
- affectedFilesAcc)
-
- case filterable: SupportsFileFilter =>
- val matchingRowsPlan = matchingRowsPlanBuilder(scanRelation)
- val matchingFilesPlan = buildFileFilterPlan(scanRelation.output, matchingRowsPlan)
- DynamicFileFilter(scanRelation, matchingFilesPlan, filterable)
-
- case _ =>
- scanRelation
- }
- }
-
- private def extractFilters(cond: Expression, tableAttrs: Seq[AttributeReference]): Seq[Expression] = {
- val tableAttrSet = AttributeSet(tableAttrs)
- splitConjunctivePredicates(cond).filter(_.references.subsetOf(tableAttrSet))
- }
-
- private def pushFilters(
- scanBuilder: ScanBuilder,
- cond: Expression,
- tableAttrs: Seq[AttributeReference]): Unit = {
- val predicates = extractFilters(cond, tableAttrs)
- if (predicates.nonEmpty) {
- val normalizedPredicates = DataSourceStrategy.normalizeExprs(predicates, tableAttrs)
- PushDownUtils.pushFilters(scanBuilder, normalizedPredicates)
- }
- }
-
- protected def toDataSourceFilters(predicates: Seq[Expression]): Array[sources.Filter] = {
- predicates.flatMap { p =>
- val translatedFilter = DataSourceStrategy.translateFilter(p, supportNestedPredicatePushdown = true)
- if (translatedFilter.isEmpty) {
- logWarning(s"Cannot translate expression to source filter: $p")
- }
- translatedFilter
- }.toArray
- }
-
- protected def newWriteInfo(schema: StructType): LogicalWriteInfo = {
- val uuid = UUID.randomUUID()
- LogicalWriteInfoImpl(queryId = uuid.toString, schema, CaseInsensitiveStringMap.empty)
- }
-
- private def buildFileFilterPlan(tableAttrs: Seq[AttributeReference], matchingRowsPlan: LogicalPlan): LogicalPlan = {
- val fileAttr = findOutputAttr(tableAttrs, FILE_NAME_COL)
- val agg = Aggregate(Seq(fileAttr), Seq(fileAttr), matchingRowsPlan)
- Project(Seq(findOutputAttr(agg.output, FILE_NAME_COL)), agg)
- }
-
- private def buildFileFilterPlan(
- filesAccumulator: SetAccumulator[String],
- prunedTargetPlan: LogicalPlan): LogicalPlan = {
- val fileAttr = findOutputAttr(prunedTargetPlan.output, FILE_NAME_COL)
- val rowPosAttr = findOutputAttr(prunedTargetPlan.output, ROW_POS_COL)
- val accumulatorExpr = createAlias(AccumulateFiles(filesAccumulator, fileAttr), AFFECTED_FILES_ACC_ALIAS_NAME)
- val projectList = Seq(fileAttr, rowPosAttr, accumulatorExpr)
- val projectPlan = Project(projectList, prunedTargetPlan)
- val affectedFilesAttr = findOutputAttr(projectPlan.output, AFFECTED_FILES_ACC_ALIAS_NAME)
- val aggSumCol = createAlias(AggregateExpression(Sum(affectedFilesAttr), Complete, false), SUM_ROW_ID_ALIAS_NAME)
- // Group by the rows by row id while collecting the files that need to be over written via accumulator.
- val aggPlan = Aggregate(Seq(fileAttr, rowPosAttr), Seq(aggSumCol), projectPlan)
- val sumAttr = findOutputAttr(aggPlan.output, SUM_ROW_ID_ALIAS_NAME)
- val havingExpr = GreaterThan(sumAttr, Literal(1L))
- // Identifies ambiguous row in the target.
- Filter(havingExpr, aggPlan)
- }
-
- protected def findOutputAttr(attrs: Seq[Attribute], attrName: String): Attribute = {
- attrs.find(attr => resolver(attr.name, attrName)).getOrElse {
- throw new AnalysisException(s"Cannot find $attrName in $attrs")
- }
- }
-
- protected def toOutputAttrs(schema: StructType, attrs: Seq[AttributeReference]): Seq[AttributeReference] = {
- val nameToAttr = attrs.map(_.name).zip(attrs).toMap
- schema.toAttributes.map {
- a => nameToAttr.get(a.name) match {
- case Some(ref) =>
- // keep the attribute id if it was present in the relation
- a.withExprId(ref.exprId)
- case _ =>
- // if the field is new, create a new attribute
- AttributeReference(a.name, a.dataType, a.nullable, a.metadata)()
- }
- }
- }
-
- protected def buildWritePlan(childPlan: LogicalPlan, table: Table): LogicalPlan = {
- val icebergTable = Spark3Util.toIcebergTable(table)
- val distribution = Spark3Util.buildRequiredDistribution(icebergTable)
- val ordering = Spark3Util.buildRequiredOrdering(distribution, icebergTable)
- // range partitioning in Spark triggers a skew estimation job prior to shuffling
- // we insert a round-robin partitioning to avoid executing the merge join twice
- val newChildPlan = distribution match {
- case _: OrderedDistribution =>
- val numShufflePartitions = conf.numShufflePartitions
- Repartition(numShufflePartitions, shuffle = true, childPlan)
- case _ =>
- childPlan
- }
- DistributionAndOrderingUtils.prepareQuery(distribution, ordering, newChildPlan, conf)
- }
-}
-
-object RewriteRowLevelOperationHelper {
- final val FILE_NAME_COL = "_file"
- final val ROW_POS_COL = "_pos"
-
- // `internal.metrics` prefix ensures the accumulator state is not tracked by Spark UI
- private final val AFFECTED_FILES_ACC_NAME = "internal.metrics.merge.affectedFiles"
- private final val AFFECTED_FILES_ACC_ALIAS_NAME = "_affectedFiles_"
- private final val SUM_ROW_ID_ALIAS_NAME = "_sum_"
-
- private val scanRelationCtor: DynConstructors.Ctor[DataSourceV2ScanRelation] =
- DynConstructors.builder()
- .impl(classOf[DataSourceV2ScanRelation],
- classOf[DataSourceV2Relation],
- classOf[Scan],
- classOf[Seq[AttributeReference]])
- .impl(classOf[DataSourceV2ScanRelation],
- classOf[Table],
- classOf[Scan],
- classOf[Seq[AttributeReference]])
- .build()
-
- def createScanRelation(
- relation: DataSourceV2Relation,
- scan: Scan,
- outputAttrs: Seq[AttributeReference]): DataSourceV2ScanRelation = {
- if (Spark3VersionUtil.isSpark30) {
- scanRelationCtor.newInstance(relation.table, scan, outputAttrs)
- } else {
- scanRelationCtor.newInstance(relation, scan, outputAttrs)
- }
- }
-
- private val aliasCtor: DynConstructors.Ctor[Alias] =
- DynConstructors.builder()
- .impl(classOf[Alias],
- classOf[Expression],
- classOf[String],
- classOf[ExprId],
- classOf[Seq[String]],
- classOf[Option[Metadata]],
- classOf[Seq[String]])
- .impl(classOf[Alias],
- classOf[Expression],
- classOf[String],
- classOf[ExprId],
- classOf[Seq[String]],
- classOf[Option[Metadata]])
- .build()
-
- def createAlias(child: Expression, name: String): Alias = {
- aliasCtor.newInstance(child, name, NamedExpression.newExprId, Seq.empty, None, Seq.empty)
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/SetAccumulator.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/SetAccumulator.scala
deleted file mode 100644
index a694666ec4..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/utils/SetAccumulator.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.catalyst.utils
-
-import java.util.Collections
-import org.apache.spark.util.AccumulatorV2
-
-class SetAccumulator[T] extends AccumulatorV2[T, java.util.Set[T]] {
- private val _set = Collections.synchronizedSet(new java.util.HashSet[T]())
-
- override def isZero: Boolean = _set.isEmpty
-
- override def copy(): AccumulatorV2[T, java.util.Set[T]] = {
- val newAcc = new SetAccumulator[T]()
- newAcc._set.addAll(_set)
- newAcc
- }
-
- override def reset(): Unit = _set.clear()
-
- override def add(v: T): Unit = _set.add(v)
-
- override def merge(other: AccumulatorV2[T, java.util.Set[T]]): Unit = {
- _set.addAll(other.value)
- }
-
- override def value: java.util.Set[T] = _set
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala
deleted file mode 100644
index b33de89917..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class AddPartitionFieldExec(
- catalog: TableCatalog,
- ident: Identifier,
- transform: Transform,
- name: Option[String]) extends V2CommandExec {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- iceberg.table.updateSpec()
- .addField(name.orNull, Spark3Util.toIcebergTerm(transform))
- .commit()
-
- case table =>
- throw new UnsupportedOperationException(s"Cannot add partition field to non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- s"AddPartitionField ${catalog.name}.${ident.quoted} ${name.map(n => s"$n=").getOrElse("")}${transform.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CallExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CallExec.scala
deleted file mode 100644
index fcf6092163..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CallExec.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.util.truncatedString
-import org.apache.spark.sql.connector.iceberg.catalog.Procedure
-
-case class CallExec(
- output: Seq[Attribute],
- procedure: Procedure,
- input: InternalRow) extends V2CommandExec {
-
- override protected def run(): Seq[InternalRow] = {
- procedure.call(input)
- }
-
- override def simpleString(maxFields: Int): String = {
- s"CallExec${truncatedString(output, "[", ", ", "]", maxFields)} ${procedure.description}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala
deleted file mode 100644
index 525ed77437..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.relocated.com.google.common.base.Preconditions
-import org.apache.iceberg.relocated.com.google.common.collect.Sets
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-
-case class DropIdentifierFieldsExec(
- catalog: TableCatalog,
- ident: Identifier,
- fields: Seq[String]) extends V2CommandExec {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- val schema = iceberg.table.schema
- val identifierFieldNames = Sets.newHashSet(schema.identifierFieldNames)
-
- for (name <- fields) {
- Preconditions.checkArgument(schema.findField(name) != null,
- "Cannot complete drop identifier fields operation: field %s not found", name)
- Preconditions.checkArgument(identifierFieldNames.contains(name),
- "Cannot complete drop identifier fields operation: %s is not an identifier field", name)
- identifierFieldNames.remove(name)
- }
-
- iceberg.table.updateSchema()
- .setIdentifierFields(identifierFieldNames)
- .commit();
- case table =>
- throw new UnsupportedOperationException(s"Cannot drop identifier fields in non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- s"DropIdentifierFields ${catalog.name}.${ident.quoted} (${fields.quoted})";
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala
deleted file mode 100644
index 76128c5f47..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-import org.apache.spark.sql.connector.expressions.FieldReference
-import org.apache.spark.sql.connector.expressions.IdentityTransform
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class DropPartitionFieldExec(
- catalog: TableCatalog,
- ident: Identifier,
- transform: Transform) extends V2CommandExec {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- val schema = iceberg.table.schema
- transform match {
- case IdentityTransform(FieldReference(parts)) if parts.size == 1 && schema.findField(parts.head) == null =>
- // the name is not present in the Iceberg schema, so it must be a partition field name, not a column name
- iceberg.table.updateSpec()
- .removeField(parts.head)
- .commit()
-
- case _ =>
- iceberg.table.updateSpec()
- .removeField(Spark3Util.toIcebergTerm(transform))
- .commit()
- }
-
- case table =>
- throw new UnsupportedOperationException(s"Cannot drop partition field in non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- s"DropPartitionField ${catalog.name}.${ident.quoted} ${transform.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DynamicFileFilterExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DynamicFileFilterExec.scala
deleted file mode 100644
index dcb4791c85..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DynamicFileFilterExec.scala
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.spark.SparkException
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.AttributeSet
-import org.apache.spark.sql.catalyst.expressions.SortOrder
-import org.apache.spark.sql.catalyst.plans.physical
-import org.apache.spark.sql.catalyst.util.truncatedString
-import org.apache.spark.sql.catalyst.utils.SetAccumulator
-import org.apache.spark.sql.connector.iceberg.read.SupportsFileFilter
-import org.apache.spark.sql.execution.BinaryExecNode
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.SQLExecution
-import org.apache.spark.sql.execution.metric.SQLMetrics
-import org.apache.spark.sql.vectorized.ColumnarBatch
-import scala.collection.JavaConverters._
-
-abstract class DynamicFileFilterExecBase(
- scanExec: SparkPlan,
- fileFilterExec: SparkPlan) extends BinaryExecNode {
-
- @transient
- override lazy val references: AttributeSet = AttributeSet(fileFilterExec.output)
-
- override lazy val metrics = Map(
- "candidateFiles" -> SQLMetrics.createMetric(sparkContext, "candidate files"),
- "matchingFiles" -> SQLMetrics.createMetric(sparkContext, "matching files"))
-
- override def left: SparkPlan = scanExec
- override def right: SparkPlan = fileFilterExec
- override def output: Seq[Attribute] = scanExec.output
- override def outputPartitioning: physical.Partitioning = scanExec.outputPartitioning
- override def outputOrdering: Seq[SortOrder] = scanExec.outputOrdering
- override def supportsColumnar: Boolean = scanExec.supportsColumnar
-
- /*
- If both target and source have the same partitioning we can have a problem here if our filter exec actually
- changes the output partitioning of the node. Currently this can only occur in the SinglePartition distribution is
- in use which only happens if both the target and source have a single partition, but if it does we have the potential
- of eliminating the only partition in the target. If there are no partitions in the target then we will throw an
- exception because the partitioning was assumed to be the same 1 partition in source and target. We fix this by making
- sure that we always return at least 1 empty partition, in the future we may need to handle more complicated
- partitioner scenarios.
- */
-
- override protected def doExecute(): RDD[InternalRow] = {
- val result = scanExec.execute()
- if (result.partitions.length == 0) {
- sparkContext.parallelize(Array.empty[InternalRow], 1)
- } else {
- result
- }
- }
- override protected def doExecuteColumnar(): RDD[ColumnarBatch] = {
- val result = scanExec.executeColumnar()
- if (result.partitions.length == 0) {
- sparkContext.parallelize(Array.empty[ColumnarBatch], 1)
- } else {
- result
- }
- }
-
- override def simpleString(maxFields: Int): String = {
- s"DynamicFileFilterExec${truncatedString(output, "[", ", ", "]", maxFields)}"
- }
-
- def postFileFilterMetric(candidateFiles: Int, matchingFiles: Int): Unit = {
- longMetric("candidateFiles").set(candidateFiles)
- longMetric("matchingFiles").set(matchingFiles)
- val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
- SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
- }
-}
-
-case class DynamicFileFilterExec(
- scanExec: SparkPlan,
- fileFilterExec: SparkPlan,
- @transient filterable: SupportsFileFilter)
- extends DynamicFileFilterExecBase(scanExec, fileFilterExec) {
-
- override protected def doPrepare(): Unit = {
- val rows = fileFilterExec.executeCollect()
- val matchedFileLocations = rows.map(_.getString(0))
- val metric = filterable.filterFiles(matchedFileLocations.toSet.asJava)
- postFileFilterMetric(metric.candidateFiles(), metric.matchingFiles())
- }
-}
-
-case class DynamicFileFilterWithCardinalityCheckExec(
- scanExec: SparkPlan,
- fileFilterExec: SparkPlan,
- @transient filterable: SupportsFileFilter,
- filesAccumulator: SetAccumulator[String])
- extends DynamicFileFilterExecBase(scanExec, fileFilterExec) {
-
- override protected def doPrepare(): Unit = {
- val rows = fileFilterExec.executeCollect()
- if (rows.length > 0) {
- throw new SparkException(
- "The ON search condition of the MERGE statement matched a single row from " +
- "the target table with multiple rows of the source table. This could result " +
- "in the target row being operated on more than once with an update or delete operation " +
- "and is not allowed.")
- }
- val matchedFileLocations = filesAccumulator.value
- val metric = filterable.filterFiles(matchedFileLocations)
- postFileFilterMetric(metric.candidateFiles(), metric.matchingFiles())
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedBatchScanExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedBatchScanExec.scala
deleted file mode 100644
index 83c9ca5815..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedBatchScanExec.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.connector.read.InputPartition
-import org.apache.spark.sql.connector.read.PartitionReaderFactory
-import org.apache.spark.sql.connector.read.Scan
-
-// The only reason we need this class and cannot reuse BatchScanExec is because
-// BatchScanExec caches input partitions and we cannot apply file filtering before execution
-// Spark calls supportsColumnar during physical planning which, in turn, triggers split planning
-// We must ensure the result is not cached so that we can push down file filters later
-// The only difference compared to BatchScanExec is that we are using def instead of lazy val for splits
-case class ExtendedBatchScanExec(
- output: Seq[AttributeReference],
- @transient scan: Scan) extends DataSourceV2ScanExecBase {
-
- @transient private lazy val batch = scan.toBatch
-
- // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
- override def equals(other: Any): Boolean = other match {
- case other: ExtendedBatchScanExec => this.batch == other.batch
- case _ => false
- }
-
- override def hashCode(): Int = batch.hashCode()
-
- override def partitions: Seq[InputPartition] = batch.planInputPartitions()
-
- override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()
-
- override def inputRDD: RDD[InternalRow] = {
- new DataSourceRDD(sparkContext, partitions, readerFactory, supportsColumnar)
- }
-
- override def doCanonicalize(): ExtendedBatchScanExec = {
- this.copy(output = output.map(QueryPlan.normalizeExpressions(_, output)))
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Implicits.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Implicits.scala
deleted file mode 100644
index 3c1c44149b..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Implicits.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.source.SparkScanBuilder
-import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.connector.catalog.Table
-import org.apache.spark.sql.connector.iceberg.catalog.SupportsMerge
-import org.apache.spark.sql.connector.read.ScanBuilder
-
-// must be merged with DataSourceV2Implicits in Spark
-object ExtendedDataSourceV2Implicits {
- implicit class TableHelper(table: Table) {
- def asMergeable: SupportsMerge = {
- table match {
- case support: SupportsMerge =>
- support
- case _ =>
- throw new AnalysisException(s"Table does not support updates and deletes: ${table.name}")
- }
- }
- }
-
- implicit class ScanBuilderHelper(scanBuilder: ScanBuilder) {
- def asIceberg: SparkScanBuilder = {
- scanBuilder match {
- case iceberg: SparkScanBuilder =>
- iceberg
- case _ =>
- throw new AnalysisException(s"ScanBuilder is not from an Iceberg table: $scanBuilder")
- }
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala
deleted file mode 100644
index 6f0361fdb1..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.iceberg.spark.SparkCatalog
-import org.apache.iceberg.spark.SparkSessionCatalog
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.Strategy
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.NamedRelation
-import org.apache.spark.sql.catalyst.expressions.And
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
-import org.apache.spark.sql.catalyst.expressions.NamedExpression
-import org.apache.spark.sql.catalyst.planning.PhysicalOperation
-import org.apache.spark.sql.catalyst.plans.logical.AddPartitionField
-import org.apache.spark.sql.catalyst.plans.logical.Call
-import org.apache.spark.sql.catalyst.plans.logical.DropIdentifierFields
-import org.apache.spark.sql.catalyst.plans.logical.DropPartitionField
-import org.apache.spark.sql.catalyst.plans.logical.DynamicFileFilter
-import org.apache.spark.sql.catalyst.plans.logical.DynamicFileFilterWithCardinalityCheck
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.plans.logical.MergeInto
-import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
-import org.apache.spark.sql.catalyst.plans.logical.ReplacePartitionField
-import org.apache.spark.sql.catalyst.plans.logical.SetIdentifierFields
-import org.apache.spark.sql.catalyst.plans.logical.SetWriteDistributionAndOrdering
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-import org.apache.spark.sql.connector.iceberg.read.SupportsFileFilter
-import org.apache.spark.sql.execution.FilterExec
-import org.apache.spark.sql.execution.LeafExecNode
-import org.apache.spark.sql.execution.ProjectExec
-import org.apache.spark.sql.execution.SparkPlan
-import scala.collection.JavaConverters._
-
-case class ExtendedDataSourceV2Strategy(spark: SparkSession) extends Strategy {
-
- override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
- case c @ Call(procedure, args) =>
- val input = buildInternalRow(args)
- CallExec(c.output, procedure, input) :: Nil
-
- case AddPartitionField(IcebergCatalogAndIdentifier(catalog, ident), transform, name) =>
- AddPartitionFieldExec(catalog, ident, transform, name) :: Nil
-
- case DropPartitionField(IcebergCatalogAndIdentifier(catalog, ident), transform) =>
- DropPartitionFieldExec(catalog, ident, transform) :: Nil
-
- case ReplacePartitionField(IcebergCatalogAndIdentifier(catalog, ident), transformFrom, transformTo, name) =>
- ReplacePartitionFieldExec(catalog, ident, transformFrom, transformTo, name) :: Nil
-
- case SetIdentifierFields(IcebergCatalogAndIdentifier(catalog, ident), fields) =>
- SetIdentifierFieldsExec(catalog, ident, fields) :: Nil
-
- case DropIdentifierFields(IcebergCatalogAndIdentifier(catalog, ident), fields) =>
- DropIdentifierFieldsExec(catalog, ident, fields) :: Nil
-
- case SetWriteDistributionAndOrdering(
- IcebergCatalogAndIdentifier(catalog, ident), distributionMode, ordering) =>
- SetWriteDistributionAndOrderingExec(catalog, ident, distributionMode, ordering) :: Nil
-
- case DynamicFileFilter(scanPlan, fileFilterPlan, filterable) =>
- DynamicFileFilterExec(planLater(scanPlan), planLater(fileFilterPlan), filterable) :: Nil
-
- case DynamicFileFilterWithCardinalityCheck(scanPlan, fileFilterPlan, filterable, filesAccumulator) =>
- DynamicFileFilterWithCardinalityCheckExec(
- planLater(scanPlan),
- planLater(fileFilterPlan),
- filterable,
- filesAccumulator) :: Nil
-
- case PhysicalOperation(project, filters, DataSourceV2ScanRelation(_, scan: SupportsFileFilter, output)) =>
- // projection and filters were already pushed down in the optimizer.
- // this uses PhysicalOperation to get the projection and ensure that if the batch scan does
- // not support columnar, a projection is added to convert the rows to UnsafeRow.
- val batchExec = ExtendedBatchScanExec(output, scan)
- withProjectAndFilter(project, filters, batchExec, !batchExec.supportsColumnar) :: Nil
-
- case ReplaceData(relation, batchWrite, query) =>
- ReplaceDataExec(batchWrite, refreshCache(relation), planLater(query)) :: Nil
-
- case MergeInto(mergeIntoParams, output, child) =>
- MergeIntoExec(mergeIntoParams, output, planLater(child)) :: Nil
-
- case _ => Nil
- }
-
- private def buildInternalRow(exprs: Seq[Expression]): InternalRow = {
- val values = new Array[Any](exprs.size)
- for (index <- exprs.indices) {
- values(index) = exprs(index).eval()
- }
- new GenericInternalRow(values)
- }
-
- private def withProjectAndFilter(
- project: Seq[NamedExpression],
- filters: Seq[Expression],
- scan: LeafExecNode,
- needsUnsafeConversion: Boolean): SparkPlan = {
- val filterCondition = filters.reduceLeftOption(And)
- val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)
-
- if (withFilter.output != project || needsUnsafeConversion) {
- ProjectExec(project, withFilter)
- } else {
- withFilter
- }
- }
-
- private def refreshCache(r: NamedRelation)(): Unit = {
- spark.sharedState.cacheManager.recacheByPlan(spark, r)
- }
-
- private object IcebergCatalogAndIdentifier {
- def unapply(identifier: Seq[String]): Option[(TableCatalog, Identifier)] = {
- val catalogAndIdentifier = Spark3Util.catalogAndIdentifier(spark, identifier.asJava)
- catalogAndIdentifier.catalog match {
- case icebergCatalog: SparkCatalog =>
- Some((icebergCatalog, catalogAndIdentifier.identifier))
- case icebergCatalog: SparkSessionCatalog[_] =>
- Some((icebergCatalog, catalogAndIdentifier.identifier))
- case _ =>
- None
- }
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MergeIntoExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MergeIntoExec.scala
deleted file mode 100644
index a7d8ad6ea8..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MergeIntoExec.scala
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.expressions.BasePredicate
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
-import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
-import org.apache.spark.sql.catalyst.plans.logical.MergeIntoParams
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.UnaryExecNode
-
-case class MergeIntoExec(
- mergeIntoParams: MergeIntoParams,
- output: Seq[Attribute],
- override val child: SparkPlan) extends UnaryExecNode {
-
- protected override def doExecute(): RDD[InternalRow] = {
- child.execute().mapPartitions {
- processPartition(mergeIntoParams, _)
- }
- }
-
- private def generateProjection(exprs: Seq[Expression], attrs: Seq[Attribute]): UnsafeProjection = {
- UnsafeProjection.create(exprs, attrs)
- }
-
- private def generatePredicate(expr: Expression, attrs: Seq[Attribute]): BasePredicate = {
- GeneratePredicate.generate(expr, attrs)
- }
-
- private def applyProjection(
- actions: Seq[(BasePredicate, Option[UnsafeProjection])],
- inputRow: InternalRow): InternalRow = {
-
- // Find the first combination where the predicate evaluates to true.
- // In case when there are overlapping condition in the MATCHED
- // clauses, for the first one that satisfies the predicate, the
- // corresponding action is applied. For example:
- // WHEN MATCHED AND id > 1 AND id < 10 UPDATE *
- // WHEN MATCHED AND id = 5 OR id = 21 DELETE
- // In above case, when id = 5, it applies both that matched predicates. In this
- // case the first one we see is applied.
-
- val pair = actions.find {
- case (predicate, _) => predicate.eval(inputRow)
- }
-
- // Now apply the appropriate projection to produce an output row, or return null to suppress this row
- pair match {
- case Some((_, Some(projection))) =>
- projection.apply(inputRow)
- case _ =>
- null
- }
- }
-
- private def processPartition(
- params: MergeIntoParams,
- rowIterator: Iterator[InternalRow]): Iterator[InternalRow] = {
-
- val joinedAttrs = params.joinedAttributes
- val isSourceRowPresentPred = generatePredicate(params.isSourceRowPresent, joinedAttrs)
- val isTargetRowPresentPred = generatePredicate(params.isTargetRowPresent, joinedAttrs)
- val matchedPreds = params.matchedConditions.map(generatePredicate(_, joinedAttrs))
- val matchedProjs = params.matchedOutputs.map(_.map(generateProjection(_, joinedAttrs)))
- val notMatchedPreds = params.notMatchedConditions.map(generatePredicate(_, joinedAttrs))
- val notMatchedProjs = params.notMatchedOutputs.map(_.map(generateProjection(_, joinedAttrs)))
- val projectTargetCols = generateProjection(params.targetOutput, joinedAttrs)
- val nonMatchedPairs = notMatchedPreds zip notMatchedProjs
- val matchedPairs = matchedPreds zip matchedProjs
-
- /**
- * This method is responsible for processing a input row to emit the resultant row with an
- * additional column that indicates whether the row is going to be included in the final
- * output of merge or not.
- * 1. If there is a target row for which there is no corresponding source row (join condition not met)
- * - Only project the target columns with deleted flag set to false.
- * 2. If there is a source row for which there is no corresponding target row (join condition not met)
- * - Apply the not matched actions (i.e INSERT actions) if non match conditions are met.
- * 3. If there is a source row for which there is a corresponding target row (join condition met)
- * - Apply the matched actions (i.e DELETE or UPDATE actions) if match conditions are met.
- */
- def processRow(inputRow: InternalRow): InternalRow = {
- if (!isSourceRowPresentPred.eval(inputRow)) {
- projectTargetCols.apply(inputRow)
- } else if (!isTargetRowPresentPred.eval(inputRow)) {
- applyProjection(nonMatchedPairs, inputRow)
- } else {
- applyProjection(matchedPairs, inputRow)
- }
- }
-
- rowIterator
- .map(processRow)
- .filter(row => row != null)
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceDataExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceDataExec.scala
deleted file mode 100644
index f26a8c71b5..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceDataExec.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.connector.write.BatchWrite
-import org.apache.spark.sql.execution.SparkPlan
-
-case class ReplaceDataExec(
- batchWrite: BatchWrite,
- refreshCache: () => Unit,
- query: SparkPlan) extends V2TableWriteExec {
-
- override protected def run(): Seq[InternalRow] = {
- // calling prepare() ensures we execute DynamicFileFilter if present
- prepare()
- val writtenRows = writeWithV2(batchWrite)
- refreshCache()
- writtenRows
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala
deleted file mode 100644
index 11e900c77b..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.Spark3Util
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-import org.apache.spark.sql.connector.expressions.FieldReference
-import org.apache.spark.sql.connector.expressions.IdentityTransform
-import org.apache.spark.sql.connector.expressions.Transform
-
-case class ReplacePartitionFieldExec(
- catalog: TableCatalog,
- ident: Identifier,
- transformFrom: Transform,
- transformTo: Transform,
- name: Option[String]) extends V2CommandExec {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- val schema = iceberg.table.schema
- transformFrom match {
- case IdentityTransform(FieldReference(parts)) if parts.size == 1 && schema.findField(parts.head) == null =>
- // the name is not present in the Iceberg schema, so it must be a partition field name, not a column name
- iceberg.table.updateSpec()
- .removeField(parts.head)
- .addField(name.orNull, Spark3Util.toIcebergTerm(transformTo))
- .commit()
-
- case _ =>
- iceberg.table.updateSpec()
- .removeField(Spark3Util.toIcebergTerm(transformFrom))
- .addField(name.orNull, Spark3Util.toIcebergTerm(transformTo))
- .commit()
- }
-
- case table =>
- throw new UnsupportedOperationException(s"Cannot replace partition field in non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- s"ReplacePartitionField ${catalog.name}.${ident.quoted} ${transformFrom.describe} " +
- s"with ${name.map(n => s"$n=").getOrElse("")}${transformTo.describe}"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala
deleted file mode 100644
index 7fad2dc016..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-
-case class SetIdentifierFieldsExec(
- catalog: TableCatalog,
- ident: Identifier,
- fields: Seq[String]) extends V2CommandExec {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- iceberg.table.updateSchema()
- .setIdentifierFields(scala.collection.JavaConverters.seqAsJavaList(fields))
- .commit();
- case table =>
- throw new UnsupportedOperationException(s"Cannot set identifier fields in non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- s"SetIdentifierFields ${catalog.name}.${ident.quoted} (${fields.quoted})";
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala b/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala
deleted file mode 100644
index 9916b53506..0000000000
--- a/spark/v3.0/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.execution.datasources.v2
-
-import org.apache.iceberg.DistributionMode
-import org.apache.iceberg.NullOrder
-import org.apache.iceberg.SortDirection
-import org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE
-import org.apache.iceberg.expressions.Term
-import org.apache.iceberg.spark.source.SparkTable
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.connector.catalog.CatalogV2Implicits
-import org.apache.spark.sql.connector.catalog.Identifier
-import org.apache.spark.sql.connector.catalog.TableCatalog
-
-case class SetWriteDistributionAndOrderingExec(
- catalog: TableCatalog,
- ident: Identifier,
- distributionMode: DistributionMode,
- sortOrder: Seq[(Term, SortDirection, NullOrder)]) extends V2CommandExec {
-
- import CatalogV2Implicits._
-
- override lazy val output: Seq[Attribute] = Nil
-
- override protected def run(): Seq[InternalRow] = {
- catalog.loadTable(ident) match {
- case iceberg: SparkTable =>
- val txn = iceberg.table.newTransaction()
-
- val orderBuilder = txn.replaceSortOrder()
- sortOrder.foreach {
- case (term, SortDirection.ASC, nullOrder) =>
- orderBuilder.asc(term, nullOrder)
- case (term, SortDirection.DESC, nullOrder) =>
- orderBuilder.desc(term, nullOrder)
- }
- orderBuilder.commit()
-
- txn.updateProperties()
- .set(WRITE_DISTRIBUTION_MODE, distributionMode.modeName())
- .commit()
-
- txn.commitTransaction()
-
- case table =>
- throw new UnsupportedOperationException(s"Cannot set write order of non-Iceberg table: $table")
- }
-
- Nil
- }
-
- override def simpleString(maxFields: Int): String = {
- val tableIdent = s"${catalog.name}.${ident.quoted}"
- val order = sortOrder.map {
- case (term, direction, nullOrder) => s"$term $direction $nullOrder"
- }.mkString(", ")
- s"SetWriteDistributionAndOrdering $tableIdent $distributionMode $order"
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java
deleted file mode 100644
index 8918dfec65..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Objects;
-
-public class Employee {
- private Integer id;
- private String dep;
-
- public Employee() {}
-
- public Employee(Integer id, String dep) {
- this.id = id;
- this.dep = dep;
- }
-
- public Integer getId() {
- return id;
- }
-
- public void setId(Integer id) {
- this.id = id;
- }
-
- public String getDep() {
- return dep;
- }
-
- public void setDep(String dep) {
- this.dep = dep;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- } else if (other == null || getClass() != other.getClass()) {
- return false;
- }
-
- Employee employee = (Employee) other;
- return Objects.equals(id, employee.id) && Objects.equals(dep, employee.dep);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(id, dep);
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java
deleted file mode 100644
index 0a1cf75204..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS;
-
-import java.util.Map;
-import org.apache.iceberg.CatalogUtil;
-import org.apache.iceberg.hive.HiveCatalog;
-import org.apache.iceberg.hive.TestHiveMetastore;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.spark.SparkCatalogTestBase;
-import org.apache.iceberg.spark.SparkTestBase;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.internal.SQLConf;
-import org.junit.BeforeClass;
-
-public abstract class SparkExtensionsTestBase extends SparkCatalogTestBase {
-
- public SparkExtensionsTestBase(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @BeforeClass
- public static void startMetastoreAndSpark() {
- SparkTestBase.metastore = new TestHiveMetastore();
- metastore.start();
- SparkTestBase.hiveConf = metastore.hiveConf();
-
- SparkTestBase.spark =
- SparkSession.builder()
- .master("local[2]")
- .config("spark.testing", "true")
- .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
- .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
- .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname))
- .config("spark.sql.shuffle.partitions", "4")
- .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true")
- .enableHiveSupport()
- .getOrCreate();
-
- SparkTestBase.catalog =
- (HiveCatalog)
- CatalogUtil.loadCatalog(
- HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf);
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java
deleted file mode 100644
index 37f6dc37d5..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
-import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED;
-import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE;
-import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH;
-import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE;
-import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.concurrent.ThreadLocalRandom;
-import java.util.stream.Collectors;
-import org.apache.iceberg.Snapshot;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.spark.SparkCatalog;
-import org.apache.iceberg.spark.SparkSessionCatalog;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoder;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
-import org.junit.Assert;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-@RunWith(Parameterized.class)
-public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTestBase {
-
- private static final Random RANDOM = ThreadLocalRandom.current();
-
- protected final String fileFormat;
- protected final boolean vectorized;
- protected final String distributionMode;
-
- public SparkRowLevelOperationsTestBase(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config);
- this.fileFormat = fileFormat;
- this.vectorized = vectorized;
- this.distributionMode = distributionMode;
- }
-
- @Parameters(
- name =
- "catalogName = {0}, implementation = {1}, config = {2},"
- + " format = {3}, vectorized = {4}, distributionMode = {5}")
- public static Object[][] parameters() {
- return new Object[][] {
- {
- "testhive",
- SparkCatalog.class.getName(),
- ImmutableMap.of(
- "type", "hive",
- "default-namespace", "default"),
- "orc",
- true,
- WRITE_DISTRIBUTION_MODE_NONE
- },
- {
- "testhadoop",
- SparkCatalog.class.getName(),
- ImmutableMap.of("type", "hadoop"),
- "parquet",
- RANDOM.nextBoolean(),
- WRITE_DISTRIBUTION_MODE_HASH
- },
- {
- "spark_catalog",
- SparkSessionCatalog.class.getName(),
- ImmutableMap.of(
- "type", "hive",
- "default-namespace", "default",
- "clients", "1",
- "parquet-enabled", "false",
- "cache-enabled",
- "false" // Spark will delete tables using v1, leaving the cache out of sync
- ),
- "avro",
- false,
- WRITE_DISTRIBUTION_MODE_RANGE
- }
- };
- }
-
- protected abstract Map<String, String> extraTableProperties();
-
- protected void initTable() {
- sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, distributionMode);
-
- switch (fileFormat) {
- case "parquet":
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')",
- tableName, PARQUET_VECTORIZATION_ENABLED, vectorized);
- break;
- case "orc":
- Assert.assertTrue(vectorized);
- break;
- case "avro":
- Assert.assertFalse(vectorized);
- break;
- }
-
- Map<String, String> props = extraTableProperties();
- props.forEach(
- (prop, value) -> {
- sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value);
- });
- }
-
- protected void createAndInitTable(String schema) {
- createAndInitTable(schema, null);
- }
-
- protected void createAndInitTable(String schema, String jsonData) {
- sql("CREATE TABLE %s (%s) USING iceberg", tableName, schema);
- initTable();
-
- if (jsonData != null) {
- try {
- Dataset<Row> ds = toDS(schema, jsonData);
- ds.writeTo(tableName).append();
- } catch (NoSuchTableException e) {
- throw new RuntimeException("Failed to write data", e);
- }
- }
- }
-
- protected void append(String table, String jsonData) {
- append(table, null, jsonData);
- }
-
- protected void append(String table, String schema, String jsonData) {
- try {
- Dataset<Row> ds = toDS(schema, jsonData);
- ds.coalesce(1).writeTo(table).append();
- } catch (NoSuchTableException e) {
- throw new RuntimeException("Failed to write data", e);
- }
- }
-
- protected void createOrReplaceView(String name, String jsonData) {
- createOrReplaceView(name, null, jsonData);
- }
-
- protected void createOrReplaceView(String name, String schema, String jsonData) {
- Dataset<Row> ds = toDS(schema, jsonData);
- ds.createOrReplaceTempView(name);
- }
-
- protected <T> void createOrReplaceView(String name, List<T> data, Encoder<T> encoder) {
- spark.createDataset(data, encoder).createOrReplaceTempView(name);
- }
-
- private Dataset<Row> toDS(String schema, String jsonData) {
- List<String> jsonRows =
- Arrays.stream(jsonData.split("\n"))
- .filter(str -> str.trim().length() > 0)
- .collect(Collectors.toList());
- Dataset<String> jsonDS = spark.createDataset(jsonRows, Encoders.STRING());
-
- if (schema != null) {
- return spark.read().schema(schema).json(jsonDS);
- } else {
- return spark.read().json(jsonDS);
- }
- }
-
- protected void validateSnapshot(
- Snapshot snapshot,
- String operation,
- String changedPartitionCount,
- String deletedDataFiles,
- String addedDataFiles) {
- Assert.assertEquals("Operation must match", operation, snapshot.operation());
- Assert.assertEquals(
- "Changed partitions count must match",
- changedPartitionCount,
- snapshot.summary().get("changed-partition-count"));
- Assert.assertEquals(
- "Deleted data files count must match",
- deletedDataFiles,
- snapshot.summary().get("deleted-data-files"));
- Assert.assertEquals(
- "Added data files count must match",
- addedDataFiles,
- snapshot.summary().get("added-data-files"));
- }
-
- protected void sleep(long millis) {
- try {
- Thread.sleep(millis);
- } catch (InterruptedException e) {
- throw new RuntimeException(e);
- }
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java
deleted file mode 100644
index 046590cd0d..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java
+++ /dev/null
@@ -1,1048 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.types.Types.NestedField.optional;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-import org.apache.avro.Schema;
-import org.apache.avro.SchemaBuilder;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.io.DatumWriter;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.DataFile;
-import org.apache.iceberg.Files;
-import org.apache.iceberg.MetricsConfig;
-import org.apache.iceberg.data.Record;
-import org.apache.iceberg.data.orc.GenericOrcWriter;
-import org.apache.iceberg.io.FileAppender;
-import org.apache.iceberg.io.OutputFile;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Assume;
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-
-public class TestAddFilesProcedure extends SparkExtensionsTestBase {
-
- private final String sourceTableName = "source_table";
- private File fileTableDir;
-
- public TestAddFilesProcedure(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @Rule public TemporaryFolder temp = new TemporaryFolder();
-
- @Before
- public void setupTempDirs() {
- try {
- fileTableDir = temp.newFolder();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- @After
- public void dropTables() {
- sql("DROP TABLE IF EXISTS %s", sourceTableName);
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void addDataUnpartitioned() {
- createUnpartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT * FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table
- public void addDataUnpartitionedOrc() {
- createUnpartitionedFileTable("orc");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`orc`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT * FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addAvroFile() throws Exception {
- // Spark Session Catalog cannot load metadata tables
- // with "The namespace in session catalog must have exactly one name part"
- Assume.assumeFalse(catalogName.equals("spark_catalog"));
-
- // Create an Avro file
-
- Schema schema =
- SchemaBuilder.record("record")
- .fields()
- .requiredInt("id")
- .requiredString("data")
- .endRecord();
- GenericRecord record1 = new GenericData.Record(schema);
- record1.put("id", 1L);
- record1.put("data", "a");
- GenericRecord record2 = new GenericData.Record(schema);
- record2.put("id", 2L);
- record2.put("data", "b");
- File outputFile = temp.newFile("test.avro");
-
- DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter(schema);
- DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter(datumWriter);
- dataFileWriter.create(schema, outputFile);
- dataFileWriter.append(record1);
- dataFileWriter.append(record2);
- dataFileWriter.close();
-
- String createIceberg = "CREATE TABLE %s (id Long, data String) USING iceberg";
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`avro`.`%s`')",
- catalogName, tableName, outputFile.getPath());
- Assert.assertEquals(1L, result);
-
- List<Object[]> expected = Lists.newArrayList(new Object[] {1L, "a"}, new Object[] {2L, "b"});
-
- assertEquals(
- "Iceberg table contains correct data",
- expected,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- List<Object[]> actualRecordCount =
- sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName);
- List<Object[]> expectedRecordCount = Lists.newArrayList();
- expectedRecordCount.add(new Object[] {2L});
- assertEquals(
- "Iceberg file metadata should have correct metadata count",
- expectedRecordCount,
- actualRecordCount);
- }
-
- // TODO Adding spark-avro doesn't work in tests
- @Ignore
- public void addDataUnpartitionedAvro() {
- createUnpartitionedFileTable("avro");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`avro`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT * FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataUnpartitionedHive() {
- createUnpartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT * FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataUnpartitionedExtraCol() {
- createUnpartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String, foo string) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT * FROM %s ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataUnpartitionedMissingCol() {
- createUnpartitionedFileTable("parquet");
-
- String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataPartitionedMissingCol() {
- createPartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(8L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataPartitioned() {
- createPartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(8L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table
- public void addDataPartitionedOrc() {
- createPartitionedFileTable("orc");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(8L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- // TODO Adding spark-avro doesn't work in tests
- @Ignore
- public void addDataPartitionedAvro() {
- createPartitionedFileTable("avro");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`avro`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(8L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addDataPartitionedHive() {
- createPartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(8L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addPartitionToPartitioned() {
- createPartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addFilteredPartitionsToPartitioned() {
- createCompositePartitionedTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg "
- + "PARTITIONED BY (id, dept)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addFilteredPartitionsToPartitioned2() {
- createCompositePartitionedTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg "
- + "PARTITIONED BY (id, dept)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(6L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql(
- "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id",
- sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnId() {
- createCompositePartitionedTableWithNullValueInPartitionColumn("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg "
- + "PARTITIONED BY (id, dept)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnDept() {
- createCompositePartitionedTableWithNullValueInPartitionColumn("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg "
- + "PARTITIONED BY (id, dept)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))",
- catalogName, tableName, fileTableDir.getAbsolutePath());
-
- Assert.assertEquals(6L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql(
- "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id",
- sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addWeirdCaseHiveTable() {
- createWeirdCaseTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg "
- + "PARTITIONED BY (`naMe`)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))",
- catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(2L, result);
-
- /*
- While we would like to use
- SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id
- Spark does not actually handle this pushdown correctly for hive based tables and it returns 0 records
- */
- List<Object[]> expected =
- sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName).stream()
- .filter(r -> r[1].equals("John Doe"))
- .collect(Collectors.toList());
-
- // TODO when this assert breaks Spark fixed the pushdown issue
- Assert.assertEquals(
- "If this assert breaks it means that Spark has fixed the pushdown issue",
- 0,
- sql(
- "SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id",
- sourceTableName)
- .size());
-
- // Pushdown works for iceberg
- Assert.assertEquals(
- "We should be able to pushdown mixed case partition keys",
- 2,
- sql(
- "SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id",
- tableName)
- .size());
-
- assertEquals(
- "Iceberg table contains correct data",
- expected,
- sql("SELECT id, `naMe`, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addPartitionToPartitionedHive() {
- createPartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '%s', map('id', 1))",
- catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(2L, result);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void invalidDataImport() {
- createPartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- AssertHelpers.assertThrows(
- "Should forbid adding of partitioned data to unpartitioned table",
- IllegalArgumentException.class,
- "Cannot use partition filter with an unpartitioned table",
- () ->
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))",
- catalogName, tableName, fileTableDir.getAbsolutePath()));
-
- AssertHelpers.assertThrows(
- "Should forbid adding of partitioned data to unpartitioned table",
- IllegalArgumentException.class,
- "Cannot add partitioned files to an unpartitioned table",
- () ->
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath()));
- }
-
- @Test
- public void invalidDataImportPartitioned() {
- createUnpartitionedFileTable("parquet");
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- AssertHelpers.assertThrows(
- "Should forbid adding with a mismatching partition spec",
- IllegalArgumentException.class,
- "is greater than the number of partitioned columns",
- () ->
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))",
- catalogName, tableName, fileTableDir.getAbsolutePath()));
-
- AssertHelpers.assertThrows(
- "Should forbid adding with partition spec with incorrect columns",
- IllegalArgumentException.class,
- "specified partition filter refers to columns that are not partitioned",
- () ->
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))",
- catalogName, tableName, fileTableDir.getAbsolutePath()));
- }
-
- @Test
- public void addTwice() {
- createPartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result1 =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 1))",
- catalogName, tableName, sourceTableName);
- Assert.assertEquals(2L, result1);
-
- Object result2 =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 2))",
- catalogName, tableName, sourceTableName);
- Assert.assertEquals(2L, result2);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", tableName));
- assertEquals(
- "Iceberg table contains correct data",
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", tableName));
- }
-
- @Test
- public void duplicateDataPartitioned() {
- createPartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 1))",
- catalogName, tableName, sourceTableName);
-
- AssertHelpers.assertThrows(
- "Should not allow adding duplicate files",
- IllegalStateException.class,
- "Cannot complete import because data files to be imported already"
- + " exist within the target table",
- () ->
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 1))",
- catalogName, tableName, sourceTableName));
- }
-
- @Test
- public void duplicateDataPartitionedAllowed() {
- createPartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object result1 =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 1))",
- catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(2L, result1);
-
- Object result2 =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', 1),"
- + "check_duplicate_files => false)",
- catalogName, tableName, sourceTableName);
-
- Assert.assertEquals(2L, result2);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql(
- "SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL "
- + "SELECT id, name, dept, subdept FROM %s WHERE id = 1",
- sourceTableName, sourceTableName),
- sql("SELECT id, name, dept, subdept FROM %s", tableName, tableName));
- }
-
- @Test
- public void duplicateDataUnpartitioned() {
- createUnpartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName);
-
- AssertHelpers.assertThrows(
- "Should not allow adding duplicate files",
- IllegalStateException.class,
- "Cannot complete import because data files to be imported already"
- + " exist within the target table",
- () ->
- scalarSql(
- "CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName));
- }
-
- @Test
- public void duplicateDataUnpartitionedAllowed() {
- createUnpartitionedHiveTable();
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
-
- sql(createIceberg, tableName);
-
- Object result1 =
- scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName);
- Assert.assertEquals(2L, result1);
-
- Object result2 =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s',"
- + "check_duplicate_files => false)",
- catalogName, tableName, sourceTableName);
- Assert.assertEquals(2L, result2);
-
- assertEquals(
- "Iceberg table contains correct data",
- sql(
- "SELECT * FROM (SELECT * FROM %s UNION ALL " + "SELECT * from %s) ORDER BY id",
- sourceTableName, sourceTableName),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void addOrcFileWithDoubleAndFloatColumns() throws Exception {
- // Spark Session Catalog cannot load metadata tables
- // with "The namespace in session catalog must have exactly one name part"
- Assume.assumeFalse(catalogName.equals("spark_catalog"));
-
- // Create an ORC file
- File outputFile = temp.newFile("test.orc");
- final int numRows = 5;
- List<Record> expectedRecords = createOrcFile(outputFile, numRows);
- String createIceberg = "CREATE TABLE %s (x float, y double, z long) USING iceberg";
- sql(createIceberg, tableName);
-
- Object result =
- scalarSql(
- "CALL %s.system.add_files('%s', '`orc`.`%s`')",
- catalogName, tableName, outputFile.getPath());
- Assert.assertEquals(1L, result);
-
- List<Object[]> expected =
- expectedRecords.stream()
- .map(record -> new Object[] {record.get(0), record.get(1), record.get(2)})
- .collect(Collectors.toList());
-
- // x goes 2.00, 1.99, 1.98, ...
- assertEquals(
- "Iceberg table contains correct data",
- expected,
- sql("SELECT * FROM %s ORDER BY x DESC", tableName));
-
- List<Object[]> actualRecordCount =
- sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName);
- List<Object[]> expectedRecordCount = Lists.newArrayList();
- expectedRecordCount.add(new Object[] {(long) numRows});
- assertEquals(
- "Iceberg file metadata should have correct metadata count",
- expectedRecordCount,
- actualRecordCount);
- }
-
- @Test
- public void testEmptyImportDoesNotThrow() {
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg";
- sql(createIceberg, tableName);
-
- // Empty path based import
- Object pathResult =
- scalarSql(
- "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
- catalogName, tableName, fileTableDir.getAbsolutePath());
- Assert.assertEquals(0L, pathResult);
- assertEquals(
- "Iceberg table contains no added data when importing from an empty path",
- emptyQueryResult,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- // Empty table based import
- String createHive =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet";
- sql(createHive, sourceTableName);
-
- Object tableResult =
- scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName);
- Assert.assertEquals(0L, tableResult);
- assertEquals(
- "Iceberg table contains no added data when importing from an empty table",
- emptyQueryResult,
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testPartitionedImportFromEmptyPartitionDoesNotThrow() {
- createPartitionedHiveTable();
-
- final int emptyPartitionId = 999;
- // Add an empty partition to the hive table
- sql(
- "ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'",
- sourceTableName, emptyPartitionId, emptyPartitionId);
-
- String createIceberg =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)";
-
- sql(createIceberg, tableName);
-
- Object tableResult =
- scalarSql(
- "CALL %s.system.add_files("
- + "table => '%s', "
- + "source_table => '%s', "
- + "partition_filter => map('id', %d))",
- catalogName, tableName, sourceTableName, emptyPartitionId);
-
- Assert.assertEquals(0L, tableResult);
- assertEquals(
- "Iceberg table contains no added data when importing from an empty table",
- emptyQueryResult,
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- private static final List<Object[]> emptyQueryResult = Lists.newArrayList();
-
- private static final StructField[] struct = {
- new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
- new StructField("name", DataTypes.StringType, true, Metadata.empty()),
- new StructField("dept", DataTypes.StringType, true, Metadata.empty()),
- new StructField("subdept", DataTypes.StringType, true, Metadata.empty())
- };
-
- private static final Dataset<Row> unpartitionedDF =
- spark
- .createDataFrame(
- ImmutableList.of(
- RowFactory.create(1, "John Doe", "hr", "communications"),
- RowFactory.create(2, "Jane Doe", "hr", "salary"),
- RowFactory.create(3, "Matt Doe", "hr", "communications"),
- RowFactory.create(4, "Will Doe", "facilities", "all")),
- new StructType(struct))
- .repartition(1);
-
- private static final Dataset<Row> singleNullRecordDF =
- spark
- .createDataFrame(
- ImmutableList.of(RowFactory.create(null, null, null, null)), new StructType(struct))
- .repartition(1);
-
- private static final Dataset<Row> partitionedDF =
- unpartitionedDF.select("name", "dept", "subdept", "id");
-
- private static final Dataset<Row> compositePartitionedDF =
- unpartitionedDF.select("name", "subdept", "id", "dept");
-
- private static final Dataset<Row> compositePartitionedNullRecordDF =
- singleNullRecordDF.select("name", "subdept", "id", "dept");
-
- private static final Dataset<Row> weirdColumnNamesDF =
- unpartitionedDF.select(
- unpartitionedDF.col("id"),
- unpartitionedDF.col("subdept"),
- unpartitionedDF.col("dept"),
- unpartitionedDF.col("name").as("naMe"));
-
- private void createUnpartitionedFileTable(String format) {
- String createParquet =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s LOCATION '%s'";
-
- sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath());
- unpartitionedDF.write().insertInto(sourceTableName);
- unpartitionedDF.write().insertInto(sourceTableName);
- }
-
- private void createPartitionedFileTable(String format) {
- String createParquet =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) "
- + "LOCATION '%s'";
-
- sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath());
-
- partitionedDF.write().insertInto(sourceTableName);
- partitionedDF.write().insertInto(sourceTableName);
- }
-
- private void createCompositePartitionedTable(String format) {
- String createParquet =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s "
- + "PARTITIONED BY (id, dept) LOCATION '%s'";
- sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath());
-
- compositePartitionedDF.write().insertInto(sourceTableName);
- compositePartitionedDF.write().insertInto(sourceTableName);
- }
-
- private void createCompositePartitionedTableWithNullValueInPartitionColumn(String format) {
- String createParquet =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s "
- + "PARTITIONED BY (id, dept) LOCATION '%s'";
- sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath());
-
- Dataset<Row> unionedDF =
- compositePartitionedDF
- .unionAll(compositePartitionedNullRecordDF)
- .select("name", "subdept", "id", "dept")
- .repartition(1);
-
- unionedDF.write().insertInto(sourceTableName);
- unionedDF.write().insertInto(sourceTableName);
- }
-
- private void createWeirdCaseTable() {
- String createParquet =
- "CREATE TABLE %s (id Integer, subdept String, dept String) "
- + "PARTITIONED BY (`naMe` String) STORED AS parquet";
-
- sql(createParquet, sourceTableName);
-
- weirdColumnNamesDF.write().insertInto(sourceTableName);
- weirdColumnNamesDF.write().insertInto(sourceTableName);
- }
-
- private void createUnpartitionedHiveTable() {
- String createHive =
- "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet";
-
- sql(createHive, sourceTableName);
-
- unpartitionedDF.write().insertInto(sourceTableName);
- unpartitionedDF.write().insertInto(sourceTableName);
- }
-
- private void createPartitionedHiveTable() {
- String createHive =
- "CREATE TABLE %s (name String, dept String, subdept String) "
- + "PARTITIONED BY (id Integer) STORED AS parquet";
-
- sql(createHive, sourceTableName);
-
- partitionedDF.write().insertInto(sourceTableName);
- partitionedDF.write().insertInto(sourceTableName);
- }
-
- // Update this to not write a file for import using Iceberg's ID numbers
- public List<Record> createOrcFile(File orcFile, int numRows) throws IOException {
- // Needs to be deleted but depend on the rule to delete the file for us again at the end.
- if (orcFile.exists()) {
- orcFile.delete();
- }
- final org.apache.iceberg.Schema icebergSchema =
- new org.apache.iceberg.Schema(
- optional(1, "x", Types.FloatType.get()),
- optional(2, "y", Types.DoubleType.get()),
- optional(3, "z", Types.LongType.get()));
-
- List<Record> records = Lists.newArrayListWithExpectedSize(numRows);
- for (int i = 0; i < numRows; i += 1) {
- Record record = org.apache.iceberg.data.GenericRecord.create(icebergSchema);
- record.setField("x", ((float) (100 - i)) / 100F + 1.0F); // 2.0f, 1.99f, 1.98f, ...
- record.setField("y", ((double) i) / 100.0D + 2.0D); // 2.0d, 2.01d, 2.02d, ...
- record.setField("z", 5_000_000_000L + i);
- records.add(record);
- }
-
- OutputFile outFile = Files.localOutput(orcFile);
- try (FileAppender<Record> appender =
- org.apache
- .iceberg
- .orc
- .ORC
- .write(outFile)
- .schema(icebergSchema)
- .metricsConfig(
- MetricsConfig.fromProperties(
- ImmutableMap.of("write.metadata.metrics.default", "none")))
- .createWriterFunc(GenericOrcWriter::buildWriter)
- .build()) {
- appender.addAll(records);
- }
- return records;
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java
deleted file mode 100644
index 8aee7c9775..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Map;
-import org.apache.iceberg.PartitionSpec;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.spark.source.SparkTable;
-import org.apache.spark.sql.connector.catalog.CatalogManager;
-import org.apache.spark.sql.connector.catalog.Identifier;
-import org.apache.spark.sql.connector.catalog.TableCatalog;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestAlterTablePartitionFields extends SparkExtensionsTestBase {
- public TestAlterTablePartitionFields(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTable() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void testAddIdentityPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD category", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).identity("category").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddBucketPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(1)
- .bucket("id", 16, "id_bucket_16")
- .build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddTruncatePartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD truncate(data, 4)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(1)
- .truncate("data", 4, "data_trunc_4")
- .build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddYearsPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD years(ts)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).year("ts").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddMonthsPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD months(ts)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).month("ts").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddDaysPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddHoursPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD hours(ts)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).hour("ts").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testAddNamedPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).bucket("id", 16, "shard").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testDropIdentityPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals(
- "Table should start with 1 partition field", 1, table.spec().fields().size());
-
- sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(1)
- .alwaysNull("category", "category")
- .build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testDropDaysPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals(
- "Table should start with 1 partition field", 1, table.spec().fields().size());
-
- sql("ALTER TABLE %s DROP PARTITION FIELD days(ts)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).alwaysNull("ts", "ts_day").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testDropBucketPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals(
- "Table should start with 1 partition field", 1, table.spec().fields().size());
-
- sql("ALTER TABLE %s DROP PARTITION FIELD bucket(16, id)", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(1)
- .alwaysNull("id", "id_bucket")
- .build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testDropPartitionByName() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName);
-
- table.refresh();
-
- Assert.assertEquals("Table should have 1 partition field", 1, table.spec().fields().size());
-
- // Should be recognized as iceberg command even with extra white spaces
- sql("ALTER TABLE %s DROP PARTITION \n FIELD shard", tableName);
-
- table.refresh();
-
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(2).alwaysNull("id", "shard").build();
-
- Assert.assertEquals("Should have new spec field", expected, table.spec());
- }
-
- @Test
- public void testReplacePartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName);
- table.refresh();
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build();
- Assert.assertEquals("Should have new spec field", expected, table.spec());
-
- sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts)", tableName);
- table.refresh();
- expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(2)
- .alwaysNull("ts", "ts_day")
- .hour("ts")
- .build();
- Assert.assertEquals(
- "Should changed from daily to hourly partitioned field", expected, table.spec());
- }
-
- @Test
- public void testReplacePartitionAndRename() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName);
- table.refresh();
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build();
- Assert.assertEquals("Should have new spec field", expected, table.spec());
-
- sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts) AS hour_col", tableName);
- table.refresh();
- expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(2)
- .alwaysNull("ts", "ts_day")
- .hour("ts", "hour_col")
- .build();
- Assert.assertEquals(
- "Should changed from daily to hourly partitioned field", expected, table.spec());
- }
-
- @Test
- public void testReplaceNamedPartition() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName);
- table.refresh();
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build();
- Assert.assertEquals("Should have new spec field", expected, table.spec());
-
- sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts)", tableName);
- table.refresh();
- expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(2)
- .alwaysNull("ts", "day_col")
- .hour("ts")
- .build();
- Assert.assertEquals(
- "Should changed from daily to hourly partitioned field", expected, table.spec());
- }
-
- @Test
- public void testReplaceNamedPartitionAndRenameDifferently() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned());
-
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName);
- table.refresh();
- PartitionSpec expected =
- PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build();
- Assert.assertEquals("Should have new spec field", expected, table.spec());
-
- sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts) AS hour_col", tableName);
- table.refresh();
- expected =
- PartitionSpec.builderFor(table.schema())
- .withSpecId(2)
- .alwaysNull("ts", "day_col")
- .hour("ts", "hour_col")
- .build();
- Assert.assertEquals(
- "Should changed from daily to hourly partitioned field", expected, table.spec());
- }
-
- @Test
- public void testSparkTableAddDropPartitions() throws Exception {
- sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg", tableName);
- Assert.assertEquals(
- "spark table partition should be empty", 0, sparkTable().partitioning().length);
-
- sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName);
- assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)");
-
- sql("ALTER TABLE %s ADD PARTITION FIELD truncate(data, 4)", tableName);
- assertPartitioningEquals(sparkTable(), 2, "truncate(data, 4)");
-
- sql("ALTER TABLE %s ADD PARTITION FIELD years(ts)", tableName);
- assertPartitioningEquals(sparkTable(), 3, "years(ts)");
-
- sql("ALTER TABLE %s DROP PARTITION FIELD years(ts)", tableName);
- assertPartitioningEquals(sparkTable(), 2, "truncate(data, 4)");
-
- sql("ALTER TABLE %s DROP PARTITION FIELD truncate(data, 4)", tableName);
- assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)");
-
- sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName);
- sql("DESCRIBE %s", tableName);
- Assert.assertEquals(
- "spark table partition should be empty", 0, sparkTable().partitioning().length);
- }
-
- private void assertPartitioningEquals(SparkTable table, int len, String transform) {
- Assert.assertEquals("spark table partition should be " + len, len, table.partitioning().length);
- Assert.assertEquals(
- "latest spark table partition transform should match",
- transform,
- table.partitioning()[len - 1].toString());
- }
-
- private SparkTable sparkTable() throws Exception {
- validationCatalog.loadTable(tableIdent).refresh();
- CatalogManager catalogManager = spark.sessionState().catalogManager();
- TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName);
- Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name());
- return (SparkTable) catalog.loadTable(identifier);
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java
deleted file mode 100644
index c993c213dc..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Map;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestAlterTableSchema extends SparkExtensionsTestBase {
- public TestAlterTableSchema(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTable() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void testSetIdentifierFields() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, "
- + "location struct<lon:bigint NOT NULL,lat:bigint NOT NULL> NOT NULL) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue(
- "Table should start without identifier", table.schema().identifierFieldIds().isEmpty());
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have new identifier field",
- Sets.newHashSet(table.schema().findField("id").fieldId()),
- table.schema().identifierFieldIds());
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have new identifier field",
- Sets.newHashSet(
- table.schema().findField("id").fieldId(),
- table.schema().findField("location.lon").fieldId()),
- table.schema().identifierFieldIds());
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS location.lon", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have new identifier field",
- Sets.newHashSet(table.schema().findField("location.lon").fieldId()),
- table.schema().identifierFieldIds());
- }
-
- @Test
- public void testSetInvalidIdentifierFields() {
- sql("CREATE TABLE %s (id bigint NOT NULL, id2 bigint) USING iceberg", tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue(
- "Table should start without identifier", table.schema().identifierFieldIds().isEmpty());
- AssertHelpers.assertThrows(
- "should not allow setting unknown fields",
- IllegalArgumentException.class,
- "not found in current schema or added columns",
- () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS unknown", tableName));
-
- AssertHelpers.assertThrows(
- "should not allow setting optional fields",
- IllegalArgumentException.class,
- "not a required field",
- () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS id2", tableName));
- }
-
- @Test
- public void testDropIdentifierFields() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, "
- + "location struct<lon:bigint NOT NULL,lat:bigint NOT NULL> NOT NULL) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue(
- "Table should start without identifier", table.schema().identifierFieldIds().isEmpty());
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have new identifier fields",
- Sets.newHashSet(
- table.schema().findField("id").fieldId(),
- table.schema().findField("location.lon").fieldId()),
- table.schema().identifierFieldIds());
-
- sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should removed identifier field",
- Sets.newHashSet(table.schema().findField("location.lon").fieldId()),
- table.schema().identifierFieldIds());
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have new identifier fields",
- Sets.newHashSet(
- table.schema().findField("id").fieldId(),
- table.schema().findField("location.lon").fieldId()),
- table.schema().identifierFieldIds());
-
- sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id, location.lon", tableName);
- table.refresh();
- Assert.assertEquals(
- "Should have no identifier field", Sets.newHashSet(), table.schema().identifierFieldIds());
- }
-
- @Test
- public void testDropInvalidIdentifierFields() {
- sql(
- "CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, "
- + "location struct<lon:bigint NOT NULL,lat:bigint NOT NULL> NOT NULL) USING iceberg",
- tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertTrue(
- "Table should start without identifier", table.schema().identifierFieldIds().isEmpty());
- AssertHelpers.assertThrows(
- "should not allow dropping unknown fields",
- IllegalArgumentException.class,
- "field unknown not found",
- () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS unknown", tableName));
-
- sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName);
- AssertHelpers.assertThrows(
- "should not allow dropping a field that is not an identifier",
- IllegalArgumentException.class,
- "data is not an identifier field",
- () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS data", tableName));
-
- AssertHelpers.assertThrows(
- "should not allow dropping a nested field that is not an identifier",
- IllegalArgumentException.class,
- "location.lon is not an identifier field",
- () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS location.lon", tableName));
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java
deleted file mode 100644
index d676101b10..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.List;
-import java.util.Map;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.spark.sql.AnalysisException;
-import org.junit.After;
-import org.junit.Test;
-
-public class TestAncestorsOfProcedure extends SparkExtensionsTestBase {
-
- public TestAncestorsOfProcedure(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void testAncestorOfUsingEmptyArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Long currentSnapshotId = table.currentSnapshot().snapshotId();
- Long currentTimestamp = table.currentSnapshot().timestampMillis();
- Long preSnapshotId = table.currentSnapshot().parentId();
- Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis();
-
- List<Object[]> output = sql("CALL %s.system.ancestors_of('%s')", catalogName, tableIdent);
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(
- row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)),
- output);
- }
-
- @Test
- public void testAncestorOfUsingSnapshotId() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Long currentSnapshotId = table.currentSnapshot().snapshotId();
- Long currentTimestamp = table.currentSnapshot().timestampMillis();
- Long preSnapshotId = table.currentSnapshot().parentId();
- Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis();
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(
- row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)),
- sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, currentSnapshotId));
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(row(preSnapshotId, preTimeStamp)),
- sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, preSnapshotId));
- }
-
- @Test
- public void testAncestorOfWithRollBack() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- Table table = validationCatalog.loadTable(tableIdent);
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- table.refresh();
- Long firstSnapshotId = table.currentSnapshot().snapshotId();
- Long firstTimestamp = table.currentSnapshot().timestampMillis();
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
- table.refresh();
- Long secondSnapshotId = table.currentSnapshot().snapshotId();
- Long secondTimestamp = table.currentSnapshot().timestampMillis();
- sql("INSERT INTO TABLE %s VALUES (3, 'c')", tableName);
- table.refresh();
- Long thirdSnapshotId = table.currentSnapshot().snapshotId();
- Long thirdTimestamp = table.currentSnapshot().timestampMillis();
-
- // roll back
- sql(
- "CALL %s.system.rollback_to_snapshot('%s', %dL)",
- catalogName, tableIdent, secondSnapshotId);
-
- sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName);
- table.refresh();
- Long fourthSnapshotId = table.currentSnapshot().snapshotId();
- Long fourthTimestamp = table.currentSnapshot().timestampMillis();
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(
- row(fourthSnapshotId, fourthTimestamp),
- row(secondSnapshotId, secondTimestamp),
- row(firstSnapshotId, firstTimestamp)),
- sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, fourthSnapshotId));
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(
- row(thirdSnapshotId, thirdTimestamp),
- row(secondSnapshotId, secondTimestamp),
- row(firstSnapshotId, firstTimestamp)),
- sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, thirdSnapshotId));
- }
-
- @Test
- public void testAncestorOfUsingNamedArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Long firstSnapshotId = table.currentSnapshot().snapshotId();
- Long firstTimestamp = table.currentSnapshot().timestampMillis();
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(row(firstSnapshotId, firstTimestamp)),
- sql(
- "CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')",
- catalogName, firstSnapshotId, tableIdent));
- }
-
- @Test
- public void testInvalidAncestorOfCases() {
- AssertHelpers.assertThrows(
- "Should reject calls without all required args",
- AnalysisException.class,
- "Missing required parameters",
- () -> sql("CALL %s.system.ancestors_of()", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with empty table identifier",
- IllegalArgumentException.class,
- "Cannot handle an empty identifier for argument table",
- () -> sql("CALL %s.system.ancestors_of('')", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with invalid arg types",
- AnalysisException.class,
- "Wrong arg type for snapshot_id: cannot cast",
- () -> sql("CALL %s.system.ancestors_of('%s', 1.1)", catalogName, tableIdent));
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java
deleted file mode 100644
index 7bcc088456..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.math.BigDecimal;
-import java.sql.Timestamp;
-import java.time.Instant;
-import java.util.List;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.catalyst.expressions.Expression;
-import org.apache.spark.sql.catalyst.expressions.Literal;
-import org.apache.spark.sql.catalyst.expressions.Literal$;
-import org.apache.spark.sql.catalyst.parser.ParseException;
-import org.apache.spark.sql.catalyst.parser.ParserInterface;
-import org.apache.spark.sql.catalyst.parser.extensions.IcebergParseException;
-import org.apache.spark.sql.catalyst.plans.logical.CallArgument;
-import org.apache.spark.sql.catalyst.plans.logical.CallStatement;
-import org.apache.spark.sql.catalyst.plans.logical.NamedArgument;
-import org.apache.spark.sql.catalyst.plans.logical.PositionalArgument;
-import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.types.DataTypes;
-import org.junit.AfterClass;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import scala.collection.JavaConverters;
-
-public class TestCallStatementParser {
-
- @Rule public TemporaryFolder temp = new TemporaryFolder();
-
- private static SparkSession spark = null;
- private static ParserInterface parser = null;
-
- @BeforeClass
- public static void startSpark() {
- TestCallStatementParser.spark =
- SparkSession.builder()
- .master("local[2]")
- .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
- .config("spark.extra.prop", "value")
- .getOrCreate();
- TestCallStatementParser.parser = spark.sessionState().sqlParser();
- }
-
- @AfterClass
- public static void stopSpark() {
- SparkSession currentSpark = TestCallStatementParser.spark;
- TestCallStatementParser.spark = null;
- TestCallStatementParser.parser = null;
- currentSpark.stop();
- }
-
- @Test
- public void testCallWithPositionalArgs() throws ParseException {
- CallStatement call =
- (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)");
- Assert.assertEquals(
- ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(7, call.args().size());
-
- checkArg(call, 0, 1, DataTypes.IntegerType);
- checkArg(call, 1, "2", DataTypes.StringType);
- checkArg(call, 2, 3L, DataTypes.LongType);
- checkArg(call, 3, true, DataTypes.BooleanType);
- checkArg(call, 4, 1.0D, DataTypes.DoubleType);
- checkArg(call, 5, 9.0e1, DataTypes.DoubleType);
- checkArg(call, 6, new BigDecimal("900e-1"), DataTypes.createDecimalType(3, 1));
- }
-
- @Test
- public void testCallWithNamedArgs() throws ParseException {
- CallStatement call =
- (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)");
- Assert.assertEquals(
- ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(3, call.args().size());
-
- checkArg(call, 0, "c1", 1, DataTypes.IntegerType);
- checkArg(call, 1, "c2", "2", DataTypes.StringType);
- checkArg(call, 2, "c3", true, DataTypes.BooleanType);
- }
-
- @Test
- public void testCallWithMixedArgs() throws ParseException {
- CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, '2')");
- Assert.assertEquals(
- ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(2, call.args().size());
-
- checkArg(call, 0, "c1", 1, DataTypes.IntegerType);
- checkArg(call, 1, "2", DataTypes.StringType);
- }
-
- @Test
- public void testCallWithTimestampArg() throws ParseException {
- CallStatement call =
- (CallStatement)
- parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')");
- Assert.assertEquals(
- ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(1, call.args().size());
-
- checkArg(
- call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType);
- }
-
- @Test
- public void testCallWithVarSubstitution() throws ParseException {
- CallStatement call =
- (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')");
- Assert.assertEquals(
- ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(1, call.args().size());
-
- checkArg(call, 0, "value", DataTypes.StringType);
- }
-
- @Test
- public void testCallStripsComments() throws ParseException {
- List<String> callStatementsWithComments =
- Lists.newArrayList(
- "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')",
- "/**/ CALL cat.system.func('${spark.extra.prop}')",
- "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')",
- "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')",
- "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')",
- "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", "
- + "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')",
- "/* Some multi-line comment \n"
- + "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment",
- "CALL -- a line ending comment\n" + "cat.system.func('${spark.extra.prop}')");
- for (String sqlText : callStatementsWithComments) {
- CallStatement call = (CallStatement) parser.parsePlan(sqlText);
- Assert.assertEquals(
- ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name()));
-
- Assert.assertEquals(1, call.args().size());
-
- checkArg(call, 0, "value", DataTypes.StringType);
- }
- }
-
- @Test
- public void testCallParseError() {
- AssertHelpers.assertThrows(
- "Should fail with a sensible parse error",
- IcebergParseException.class,
- "missing '(' at 'radish'",
- () -> parser.parsePlan("CALL cat.system radish kebab"));
- }
-
- private void checkArg(
- CallStatement call, int index, Object expectedValue, DataType expectedType) {
- checkArg(call, index, null, expectedValue, expectedType);
- }
-
- private void checkArg(
- CallStatement call,
- int index,
- String expectedName,
- Object expectedValue,
- DataType expectedType) {
-
- if (expectedName != null) {
- NamedArgument arg = checkCast(call.args().apply(index), NamedArgument.class);
- Assert.assertEquals(expectedName, arg.name());
- } else {
- CallArgument arg = call.args().apply(index);
- checkCast(arg, PositionalArgument.class);
- }
-
- Expression expectedExpr = toSparkLiteral(expectedValue, expectedType);
- Expression actualExpr = call.args().apply(index).expr();
- Assert.assertEquals("Arg types must match", expectedExpr.dataType(), actualExpr.dataType());
- Assert.assertEquals("Arg must match", expectedExpr, actualExpr);
- }
-
- private Literal toSparkLiteral(Object value, DataType dataType) {
- return Literal$.MODULE$.create(value, dataType);
- }
-
- private <T> T checkCast(Object value, Class<T> expectedClass) {
- Assert.assertTrue(
- "Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value));
- return expectedClass.cast(value);
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java
deleted file mode 100644
index 7309a176b9..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED;
-
-import java.util.List;
-import java.util.Map;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.Snapshot;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.exceptions.ValidationException;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
-import org.apache.spark.sql.AnalysisException;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException;
-import org.junit.After;
-import org.junit.Test;
-
-public class TestCherrypickSnapshotProcedure extends SparkExtensionsTestBase {
-
- public TestCherrypickSnapshotProcedure(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void testCherrypickSnapshotUsingPositionalArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'true')", tableName, WRITE_AUDIT_PUBLISH_ENABLED);
-
- spark.conf().set("spark.wap.id", "1");
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
-
- assertEquals(
- "Should not see rows from staged snapshot",
- ImmutableList.of(),
- sql("SELECT * FROM %s", tableName));
-
- Table table = validationCatalog.loadTable(tableIdent);
- Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots());
-
- List<Object[]> output =
- sql(
- "CALL %s.system.cherrypick_snapshot('%s', %dL)",
- catalogName, tableIdent, wapSnapshot.snapshotId());
-
- table.refresh();
-
- Snapshot currentSnapshot = table.currentSnapshot();
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())),
- output);
-
- assertEquals(
- "Cherrypick must be successful",
- ImmutableList.of(row(1L, "a")),
- sql("SELECT * FROM %s", tableName));
- }
-
- @Test
- public void testCherrypickSnapshotUsingNamedArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'true')", tableName, WRITE_AUDIT_PUBLISH_ENABLED);
-
- spark.conf().set("spark.wap.id", "1");
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
-
- assertEquals(
- "Should not see rows from staged snapshot",
- ImmutableList.of(),
- sql("SELECT * FROM %s", tableName));
-
- Table table = validationCatalog.loadTable(tableIdent);
- Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots());
-
- List<Object[]> output =
- sql(
- "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')",
- catalogName, wapSnapshot.snapshotId(), tableIdent);
-
- table.refresh();
-
- Snapshot currentSnapshot = table.currentSnapshot();
-
- assertEquals(
- "Procedure output must match",
- ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())),
- output);
-
- assertEquals(
- "Cherrypick must be successful",
- ImmutableList.of(row(1L, "a")),
- sql("SELECT * FROM %s", tableName));
- }
-
- @Test
- public void testCherrypickSnapshotRefreshesRelationCache() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'true')", tableName, WRITE_AUDIT_PUBLISH_ENABLED);
-
- Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
- query.createOrReplaceTempView("tmp");
-
- spark.sql("CACHE TABLE tmp");
-
- assertEquals("View should not produce rows", ImmutableList.of(), sql("SELECT * FROM tmp"));
-
- spark.conf().set("spark.wap.id", "1");
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
-
- assertEquals(
- "Should not see rows from staged snapshot",
- ImmutableList.of(),
- sql("SELECT * FROM %s", tableName));
-
- Table table = validationCatalog.loadTable(tableIdent);
- Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots());
-
- sql(
- "CALL %s.system.cherrypick_snapshot('%s', %dL)",
- catalogName, tableIdent, wapSnapshot.snapshotId());
-
- assertEquals(
- "Cherrypick snapshot should be visible",
- ImmutableList.of(row(1L, "a")),
- sql("SELECT * FROM tmp"));
-
- sql("UNCACHE TABLE tmp");
- }
-
- @Test
- public void testCherrypickInvalidSnapshot() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- AssertHelpers.assertThrows(
- "Should reject invalid snapshot id",
- ValidationException.class,
- "Cannot cherry-pick unknown snapshot ID",
- () -> sql("CALL %s.system.cherrypick_snapshot('%s', -1L)", catalogName, tableIdent));
- }
-
- @Test
- public void testInvalidCherrypickSnapshotCases() {
- AssertHelpers.assertThrows(
- "Should not allow mixed args",
- AnalysisException.class,
- "Named and positional arguments cannot be mixed",
- () -> sql("CALL %s.system.cherrypick_snapshot('n', table => 't', 1L)", catalogName));
-
- AssertHelpers.assertThrows(
- "Should not resolve procedures in arbitrary namespaces",
- NoSuchProcedureException.class,
- "not found",
- () -> sql("CALL %s.custom.cherrypick_snapshot('n', 't', 1L)", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls without all required args",
- AnalysisException.class,
- "Missing required parameters",
- () -> sql("CALL %s.system.cherrypick_snapshot('t')", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with empty table identifier",
- IllegalArgumentException.class,
- "Cannot handle an empty identifier",
- () -> sql("CALL %s.system.cherrypick_snapshot('', 1L)", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with invalid arg types",
- AnalysisException.class,
- "Wrong arg type for snapshot_id: cannot cast",
- () -> sql("CALL %s.system.cherrypick_snapshot('t', 2.2)", catalogName));
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java
deleted file mode 100644
index ec9c559851..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Map;
-import org.apache.iceberg.RowLevelOperationMode;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-
-public class TestCopyOnWriteDelete extends TestDelete {
-
- public TestCopyOnWriteDelete(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- Boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
- }
-
- @Override
- protected Map<String, String> extraTableProperties() {
- return ImmutableMap.of(
- TableProperties.DELETE_MODE, RowLevelOperationMode.COPY_ON_WRITE.modeName());
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java
deleted file mode 100644
index 5608e1eeab..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Map;
-import org.apache.iceberg.RowLevelOperationMode;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-
-public class TestCopyOnWriteMerge extends TestMerge {
-
- public TestCopyOnWriteMerge(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
- }
-
- @Override
- protected Map<String, String> extraTableProperties() {
- return ImmutableMap.of(
- TableProperties.MERGE_MODE, RowLevelOperationMode.COPY_ON_WRITE.modeName());
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java
deleted file mode 100644
index 5a81b0bbd5..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.util.Map;
-import org.apache.iceberg.RowLevelOperationMode;
-import org.apache.iceberg.TableProperties;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-
-public class TestCopyOnWriteUpdate extends TestUpdate {
-
- public TestCopyOnWriteUpdate(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
- }
-
- @Override
- protected Map<String, String> extraTableProperties() {
- return ImmutableMap.of(
- TableProperties.UPDATE_MODE, RowLevelOperationMode.COPY_ON_WRITE.modeName());
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java
deleted file mode 100644
index 0c0fe39e87..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java
+++ /dev/null
@@ -1,853 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL;
-import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
-import static org.apache.iceberg.TableProperties.SPLIT_SIZE;
-import static org.apache.spark.sql.functions.lit;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.Snapshot;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.exceptions.ValidationException;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
-import org.apache.spark.SparkException;
-import org.apache.spark.sql.AnalysisException;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
-import org.assertj.core.api.Assertions;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Assume;
-import org.junit.BeforeClass;
-import org.junit.Ignore;
-import org.junit.Test;
-
-public abstract class TestDelete extends SparkRowLevelOperationsTestBase {
-
- public TestDelete(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- Boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
- }
-
- @BeforeClass
- public static void setupSparkConf() {
- spark.conf().set("spark.sql.shuffle.partitions", "4");
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- sql("DROP TABLE IF EXISTS deleted_id");
- sql("DROP TABLE IF EXISTS deleted_dep");
- }
-
- @Test
- public void testDeleteFromEmptyTable() {
- createAndInitUnpartitionedTable();
-
- sql("DELETE FROM %s WHERE id IN (1)", tableName);
- sql("DELETE FROM %s WHERE dep = 'hr'", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testExplain() {
- createAndInitUnpartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
-
- sql("EXPLAIN DELETE FROM %s WHERE id <=> 1", tableName);
-
- sql("EXPLAIN DELETE FROM %s WHERE true", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots()));
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithAlias() {
- createAndInitUnpartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
-
- sql("DELETE FROM %s AS t WHERE t.id IS NULL", tableName);
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testDeleteWithDynamicFileFiltering() throws NoSuchTableException {
- createAndInitPartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(3, "hr"));
- append(new Employee(1, "hardware"), new Employee(2, "hardware"));
-
- sql("DELETE FROM %s WHERE id = 2", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));
-
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1");
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")),
- sql("SELECT * FROM %s ORDER BY id, dep", tableName));
- }
-
- @Test
- public void testDeleteNonExistingRecords() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
-
- sql("DELETE FROM %s AS t WHERE t.id > 10", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "overwrite", "0", null, null);
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithoutCondition() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'hardware')", tableName);
- sql("INSERT INTO TABLE %s VALUES (null, 'hr')", tableName);
-
- sql("DELETE FROM %s", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 4 snapshots", 4, Iterables.size(table.snapshots()));
-
- // should be a delete instead of an overwrite as it is done through a metadata operation
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "delete", "2", "3", null);
-
- assertEquals(
- "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName));
- }
-
- @Test
- public void testDeleteUsingMetadataWithComplexCondition() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'dep1')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'dep2')", tableName);
- sql("INSERT INTO TABLE %s VALUES (null, 'dep3')", tableName);
-
- sql("DELETE FROM %s WHERE dep > 'dep2' OR dep = CAST(4 AS STRING) OR dep = 'dep2'", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 4 snapshots", 4, Iterables.size(table.snapshots()));
-
- // should be a delete instead of an overwrite as it is done through a metadata operation
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "delete", "2", "2", null);
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "dep1")),
- sql("SELECT * FROM %s", tableName));
- }
-
- @Test
- public void testDeleteWithArbitraryPartitionPredicates() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'hardware')", tableName);
- sql("INSERT INTO TABLE %s VALUES (null, 'hr')", tableName);
-
- // %% is an escaped version of %
- sql("DELETE FROM %s WHERE id = 10 OR dep LIKE '%%ware'", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 4 snapshots", 4, Iterables.size(table.snapshots()));
-
- // should be an overwrite since cannot be executed using a metadata operation
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "overwrite", "1", "1", null);
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithNonDeterministicCondition() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName);
-
- AssertHelpers.assertThrows(
- "Should complain about non-deterministic expressions",
- AnalysisException.class,
- "nondeterministic expressions are only allowed",
- () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName));
- }
-
- @Test
- public void testDeleteWithFoldableConditions() {
- createAndInitPartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName);
-
- // should keep all rows and don't trigger execution
- sql("DELETE FROM %s WHERE false", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- // should keep all rows and don't trigger execution
- sql("DELETE FROM %s WHERE 50 <> 50", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- // should keep all rows and don't trigger execution
- sql("DELETE FROM %s WHERE 1 > null", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- // should remove all rows
- sql("DELETE FROM %s WHERE 21 = 21", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots()));
- }
-
- @Test
- public void testDeleteWithNullConditions() {
- createAndInitPartitionedTable();
-
- sql(
- "INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')",
- tableName);
-
- // should keep all rows as null is never equal to null
- sql("DELETE FROM %s WHERE dep = null", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- // null = 'software' -> null
- // should delete using metadata operation only
- sql("DELETE FROM %s WHERE dep = 'software'", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- // should delete using metadata operation only
- sql("DELETE FROM %s WHERE dep <=> NULL", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));
-
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "delete", "1", "1", null);
- }
-
- @Test
- public void testDeleteWithInAndNotInConditions() {
- createAndInitUnpartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
-
- sql("DELETE FROM %s WHERE id IN (1, null)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql("DELETE FROM %s WHERE id NOT IN (null, 1)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql("DELETE FROM %s WHERE id NOT IN (1, 10)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithMultipleRowGroupsParquet() throws NoSuchTableException {
- Assume.assumeTrue(fileFormat.equalsIgnoreCase("parquet"));
-
- createAndInitPartitionedTable();
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')",
- tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100);
- sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100);
-
- List<Integer> ids = Lists.newArrayList();
- for (int id = 1; id <= 200; id++) {
- ids.add(id);
- }
- Dataset<Row> df =
- spark
- .createDataset(ids, Encoders.INT())
- .withColumnRenamed("value", "id")
- .withColumn("dep", lit("hr"));
- df.coalesce(1).writeTo(tableName).append();
-
- Assert.assertEquals(200, spark.table(tableName).count());
-
- // delete a record from one of two row groups and copy over the second one
- sql("DELETE FROM %s WHERE id IN (200, 201)", tableName);
-
- Assert.assertEquals(199, spark.table(tableName).count());
- }
-
- @Test
- public void testDeleteWithConditionOnNestedColumn() {
- createAndInitNestedColumnsTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", 3, \"c2\", \"v1\"))", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", tableName);
-
- sql("DELETE FROM %s WHERE complex.c1 = id + 2", tableName);
- assertEquals(
- "Should have expected rows", ImmutableList.of(row(2)), sql("SELECT id FROM %s", tableName));
-
- sql("DELETE FROM %s t WHERE t.complex.c1 = id", tableName);
- assertEquals(
- "Should have expected rows", ImmutableList.of(), sql("SELECT id FROM %s", tableName));
- }
-
- @Test
- public void testDeleteWithInSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
-
- createOrReplaceView("deleted_id", Arrays.asList(0, 1, null), Encoders.INT());
- createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING());
-
- sql(
- "DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- append(new Employee(1, "hr"), new Employee(-1, "hr"));
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql("DELETE FROM %s WHERE id IS NULL OR id IN (SELECT value + 2 FROM deleted_id)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(-1, "hr"), row(1, "hr")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- append(new Employee(null, "hr"), new Employee(2, "hr"));
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hr"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql("DELETE FROM %s WHERE id IN (SELECT value + 2 FROM deleted_id) AND dep = 'hr'", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithMultiColumnInSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr"));
-
- List<Employee> deletedEmployees =
- Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr"));
- createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class));
-
- sql("DELETE FROM %s WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Ignore // TODO: not supported since SPARK-25154 fix is not yet available
- public void testDeleteWithNotInSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr"));
-
- createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT());
- createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING());
-
- // the file filter subquery (nested loop lef-anti join) returns 0 records
- sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s t WHERE "
- + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND "
- + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s t WHERE "
- + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR "
- + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithNotInSubqueryNotSupported() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"));
-
- createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT());
-
- AssertHelpers.assertThrows(
- "Should complain about NOT IN subquery",
- AnalysisException.class,
- "Null-aware predicate subqueries are not currently supported",
- () -> sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName));
- }
-
- @Test
- public void testDeleteOnNonIcebergTableNotSupported() throws NoSuchTableException {
- createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }");
-
- AssertHelpers.assertThrows(
- "Delete is not supported for non iceberg table",
- AnalysisException.class,
- "DELETE is only supported with v2 tables.",
- () -> sql("DELETE FROM %s WHERE c1 = -100", "testtable"));
- }
-
- @Test
- public void testDeleteWithExistSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr"));
-
- createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT());
- createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING());
-
- sql(
- "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware")),
- sql("SELECT * FROM %s", tableName));
-
- sql(
- "DELETE FROM %s t WHERE "
- + "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND "
- + "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware")),
- sql("SELECT * FROM %s", tableName));
- }
-
- @Test
- public void testDeleteWithNotExistsSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr"));
-
- createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT());
- createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING());
-
- sql(
- "DELETE FROM %s t WHERE "
- + "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND "
- + "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- sql(
- "DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)",
- tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
-
- String subquery = "SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2";
- sql("DELETE FROM %s t WHERE NOT EXISTS (%s) OR t.id = 1", tableName, subquery);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteWithScalarSubquery() throws NoSuchTableException {
- createAndInitUnpartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr"));
-
- createOrReplaceView("deleted_id", Arrays.asList(1, 100, null), Encoders.INT());
-
- sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(null, "hr")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testDeleteThatRequiresGroupingBeforeWrite() throws NoSuchTableException {
- createAndInitPartitionedTable();
-
- append(new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr"));
- append(new Employee(0, "ops"), new Employee(1, "ops"), new Employee(2, "ops"));
- append(new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr"));
- append(new Employee(0, "ops"), new Employee(1, "ops"), new Employee(2, "ops"));
-
- createOrReplaceView("deleted_id", Arrays.asList(1, 100), Encoders.INT());
-
- String originalNumOfShufflePartitions = spark.conf().get("spark.sql.shuffle.partitions");
- try {
- // set the num of shuffle partitions to 1 to ensure we have only 1 writing task
- spark.conf().set("spark.sql.shuffle.partitions", "1");
-
- sql("DELETE FROM %s t WHERE id IN (SELECT * FROM deleted_id)", tableName);
- Assert.assertEquals("Should have expected num of rows", 8L, spark.table(tableName).count());
- } finally {
- spark.conf().set("spark.sql.shuffle.partitions", originalNumOfShufflePartitions);
- }
- }
-
- @Test
- public synchronized void testDeleteWithSerializableIsolation() throws InterruptedException {
- // cannot run tests with concurrency for Hadoop tables without atomic renames
- Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop"));
-
- createAndInitUnpartitionedTable();
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, DELETE_ISOLATION_LEVEL, "serializable");
-
- // Pre-populate the table to force it to use the Spark Writers instead of Metadata-Only Delete
- // for more consistent exception stack
- List<Integer> ids = ImmutableList.of(1, 2);
- Dataset<Row> inputDF =
- spark
- .createDataset(ids, Encoders.INT())
- .withColumnRenamed("value", "id")
- .withColumn("dep", lit("hr"));
- try {
- inputDF.coalesce(1).writeTo(tableName).append();
- } catch (NoSuchTableException e) {
- throw new RuntimeException(e);
- }
-
- ExecutorService executorService =
- MoreExecutors.getExitingExecutorService(
- (ThreadPoolExecutor) Executors.newFixedThreadPool(2));
-
- AtomicInteger barrier = new AtomicInteger(0);
-
- // delete thread
- Future<?> deleteFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql("DELETE FROM %s WHERE id = 1", tableName);
- barrier.incrementAndGet();
- }
- });
-
- // append thread
- Future<?> appendFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
-
- try {
- inputDF.coalesce(1).writeTo(tableName).append();
- } catch (NoSuchTableException e) {
- throw new RuntimeException(e);
- }
-
- barrier.incrementAndGet();
- }
- });
-
- try {
- Assertions.assertThatThrownBy(deleteFuture::get)
- .isInstanceOf(ExecutionException.class)
- .cause()
- .isInstanceOf(SparkException.class)
- .cause()
- .isInstanceOf(ValidationException.class)
- .hasMessageContaining("Found conflicting files that can contain");
- } finally {
- appendFuture.cancel(true);
- }
-
- executorService.shutdown();
- Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
- }
-
- @Test
- public synchronized void testDeleteWithSnapshotIsolation()
- throws InterruptedException, ExecutionException {
- // cannot run tests with concurrency for Hadoop tables without atomic renames
- Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop"));
-
- createAndInitUnpartitionedTable();
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, DELETE_ISOLATION_LEVEL, "snapshot");
-
- ExecutorService executorService =
- MoreExecutors.getExitingExecutorService(
- (ThreadPoolExecutor) Executors.newFixedThreadPool(2));
-
- AtomicInteger barrier = new AtomicInteger(0);
-
- // delete thread
- Future<?> deleteFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < 20; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql("DELETE FROM %s WHERE id = 1", tableName);
- barrier.incrementAndGet();
- }
- });
-
- // append thread
- Future<?> appendFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < 20; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName);
- barrier.incrementAndGet();
- }
- });
-
- try {
- deleteFuture.get();
- } finally {
- appendFuture.cancel(true);
- }
-
- executorService.shutdown();
- Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
- }
-
- @Test
- public void testDeleteRefreshesRelationCache() throws NoSuchTableException {
- createAndInitPartitionedTable();
-
- append(new Employee(1, "hr"), new Employee(3, "hr"));
- append(new Employee(1, "hardware"), new Employee(2, "hardware"));
-
- Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
- query.createOrReplaceTempView("tmp");
-
- spark.sql("CACHE TABLE tmp");
-
- assertEquals(
- "View should have correct data",
- ImmutableList.of(row(1, "hardware"), row(1, "hr")),
- sql("SELECT * FROM tmp ORDER BY id, dep"));
-
- sql("DELETE FROM %s WHERE id = 1", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));
-
- Snapshot currentSnapshot = table.currentSnapshot();
- validateSnapshot(currentSnapshot, "overwrite", "2", "2", "2");
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(2, "hardware"), row(3, "hr")),
- sql("SELECT * FROM %s ORDER BY id, dep", tableName));
-
- assertEquals(
- "Should refresh the relation cache",
- ImmutableList.of(),
- sql("SELECT * FROM tmp ORDER BY id, dep"));
-
- spark.sql("UNCACHE TABLE tmp");
- }
-
- // TODO: multiple stripes for ORC
-
- protected void createAndInitPartitionedTable() {
- sql("CREATE TABLE %s (id INT, dep STRING) USING iceberg PARTITIONED BY (dep)", tableName);
- initTable();
- }
-
- protected void createAndInitUnpartitionedTable() {
- sql("CREATE TABLE %s (id INT, dep STRING) USING iceberg", tableName);
- initTable();
- }
-
- protected void createAndInitNestedColumnsTable() {
- sql("CREATE TABLE %s (id INT, complex STRUCT<c1:INT,c2:STRING>) USING iceberg", tableName);
- initTable();
- }
-
- protected void append(Employee... employees) throws NoSuchTableException {
- List<Employee> input = Arrays.asList(employees);
- Dataset<Row> inputDF = spark.createDataFrame(input, Employee.class);
- inputDF.coalesce(1).writeTo(tableName).append();
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java
deleted file mode 100644
index 7db64acf4a..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.TableProperties.GC_ENABLED;
-
-import java.io.IOException;
-import java.sql.Timestamp;
-import java.time.Instant;
-import java.util.List;
-import java.util.Map;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.Snapshot;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.exceptions.ValidationException;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
-import org.apache.iceberg.spark.SparkCatalog;
-import org.apache.spark.sql.AnalysisException;
-import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestExpireSnapshotsProcedure extends SparkExtensionsTestBase {
-
- public TestExpireSnapshotsProcedure(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- }
-
- @Test
- public void testExpireSnapshotsInEmptyTable() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- List<Object[]> output = sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent);
- assertEquals("Should not delete any files", ImmutableList.of(row(0L, 0L, 0L)), output);
- }
-
- @Test
- public void testExpireSnapshotsUsingPositionalArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
- Snapshot firstSnapshot = table.currentSnapshot();
-
- waitUntilAfter(firstSnapshot.timestampMillis());
-
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- table.refresh();
-
- Snapshot secondSnapshot = table.currentSnapshot();
- Timestamp secondSnapshotTimestamp =
- Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis()));
-
- Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- // expire without retainLast param
- List<Object[]> output1 =
- sql(
- "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')",
- catalogName, tableIdent, secondSnapshotTimestamp);
- assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output1);
-
- table.refresh();
-
- Assert.assertEquals("Should expire one snapshot", 1, Iterables.size(table.snapshots()));
-
- sql("INSERT OVERWRITE %s VALUES (3, 'c')", tableName);
- sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName);
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(3L, "c"), row(4L, "d")),
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- table.refresh();
-
- waitUntilAfter(table.currentSnapshot().timestampMillis());
-
- Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
-
- Assert.assertEquals("Should be 3 snapshots", 3, Iterables.size(table.snapshots()));
-
- // expire with retainLast param
- List<Object[]> output =
- sql(
- "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)",
- catalogName, tableIdent, currentTimestamp);
- assertEquals("Procedure output must match", ImmutableList.of(row(2L, 2L, 1L)), output);
- }
-
- @Test
- public void testExpireSnapshotUsingNamedArgs() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- waitUntilAfter(table.currentSnapshot().timestampMillis());
-
- Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
-
- List<Object[]> output =
- sql(
- "CALL %s.system.expire_snapshots("
- + "older_than => TIMESTAMP '%s',"
- + "table => '%s',"
- + "retain_last => 1)",
- catalogName, currentTimestamp, tableIdent);
- assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output);
- }
-
- @Test
- public void testExpireSnapshotsGCDisabled() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED);
-
- AssertHelpers.assertThrows(
- "Should reject call",
- ValidationException.class,
- "Cannot expire snapshots: GC is disabled",
- () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent));
- }
-
- @Test
- public void testInvalidExpireSnapshotsCases() {
- AssertHelpers.assertThrows(
- "Should not allow mixed args",
- AnalysisException.class,
- "Named and positional arguments cannot be mixed",
- () -> sql("CALL %s.system.expire_snapshots('n', table => 't')", catalogName));
-
- AssertHelpers.assertThrows(
- "Should not resolve procedures in arbitrary namespaces",
- NoSuchProcedureException.class,
- "not found",
- () -> sql("CALL %s.custom.expire_snapshots('n', 't')", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls without all required args",
- AnalysisException.class,
- "Missing required parameters",
- () -> sql("CALL %s.system.expire_snapshots()", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with invalid arg types",
- AnalysisException.class,
- "Wrong arg type",
- () -> sql("CALL %s.system.expire_snapshots('n', 2.2)", catalogName));
-
- AssertHelpers.assertThrows(
- "Should reject calls with empty table identifier",
- IllegalArgumentException.class,
- "Cannot handle an empty identifier",
- () -> sql("CALL %s.system.expire_snapshots('')", catalogName));
- }
-
- @Test
- public void testResolvingTableInAnotherCatalog() throws IOException {
- String anotherCatalog = "another_" + catalogName;
- spark.conf().set("spark.sql.catalog." + anotherCatalog, SparkCatalog.class.getName());
- spark.conf().set("spark.sql.catalog." + anotherCatalog + ".type", "hadoop");
- spark
- .conf()
- .set(
- "spark.sql.catalog." + anotherCatalog + ".warehouse",
- "file:" + temp.newFolder().toString());
-
- sql(
- "CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg",
- anotherCatalog, tableIdent);
-
- AssertHelpers.assertThrows(
- "Should reject calls for a table in another catalog",
- IllegalArgumentException.class,
- "Cannot run procedure in catalog",
- () ->
- sql(
- "CALL %s.system.expire_snapshots('%s')",
- catalogName, anotherCatalog + "." + tableName));
- }
-
- @Test
- public void testConcurrentExpireSnapshots() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
- sql("INSERT INTO TABLE %s VALUES (3, 'c')", tableName);
- sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName);
-
- Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
- List<Object[]> output =
- sql(
- "CALL %s.system.expire_snapshots("
- + "older_than => TIMESTAMP '%s',"
- + "table => '%s',"
- + "max_concurrent_deletes => %s,"
- + "retain_last => 1)",
- catalogName, currentTimestamp, tableIdent, 4);
- assertEquals(
- "Expiring snapshots concurrently should succeed",
- ImmutableList.of(row(0L, 0L, 3L)),
- output);
- }
-
- @Test
- public void testConcurrentExpireSnapshotsWithInvalidInput() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- AssertHelpers.assertThrows(
- "Should throw an error when max_concurrent_deletes = 0",
- IllegalArgumentException.class,
- "max_concurrent_deletes should have value > 0",
- () ->
- sql(
- "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)",
- catalogName, tableIdent, 0));
-
- AssertHelpers.assertThrows(
- "Should throw an error when max_concurrent_deletes < 0 ",
- IllegalArgumentException.class,
- "max_concurrent_deletes should have value > 0",
- () ->
- sql(
- "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)",
- catalogName, tableIdent, -1));
- }
-
- @Test
- public void testExpireSnapshotWithStreamResultsEnabled() {
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- waitUntilAfter(table.currentSnapshot().timestampMillis());
-
- Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
-
- List<Object[]> output =
- sql(
- "CALL %s.system.expire_snapshots("
- + "older_than => TIMESTAMP '%s',"
- + "table => '%s',"
- + "retain_last => 1, "
- + "stream_results => true)",
- catalogName, currentTimestamp, tableIdent);
- assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output);
- }
-
- @Test
- public void testExpireSnapshotsProcedureWorksWithSqlComments() {
- // Ensure that systems such as dbt, that inject comments into the generated SQL files, will
- // work with Iceberg-specific DDL
- sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
-
- sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
- sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName);
-
- Table table = validationCatalog.loadTable(tableIdent);
-
- Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots()));
-
- waitUntilAfter(table.currentSnapshot().timestampMillis());
-
- Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
-
- String callStatement =
- "/* CALL statement is used to expire snapshots */\n"
- + "-- And we have single line comments as well \n"
- + "/* And comments that span *multiple* \n"
- + " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots("
- + " older_than => TIMESTAMP '%s',"
- + " table => '%s',"
- + " retain_last => 1)";
- List<Object[]> output = sql(callStatement, catalogName, currentTimestamp, tableIdent);
- assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output);
-
- table.refresh();
-
- Assert.assertEquals("Should be 1 snapshot remaining", 1, Iterables.size(table.snapshots()));
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java
deleted file mode 100644
index 8d2e10ea17..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import java.math.BigDecimal;
-import java.util.Map;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.spark.sql.Column;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.expressions.IcebergTruncateTransform;
-import org.junit.After;
-import org.junit.Test;
-
-public class TestIcebergExpressions extends SparkExtensionsTestBase {
-
- public TestIcebergExpressions(
- String catalogName, String implementation, Map<String, String> config) {
- super(catalogName, implementation, config);
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- sql("DROP VIEW IF EXISTS emp");
- sql("DROP VIEW IF EXISTS v");
- }
-
- @Test
- public void testTruncateExpressions() {
- sql(
- "CREATE TABLE %s ( "
- + " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY "
- + ") USING iceberg",
- tableName);
-
- sql(
- "CREATE TEMPORARY VIEW emp "
- + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) "
- + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)");
-
- sql("INSERT INTO %s SELECT * FROM emp", tableName);
-
- Dataset<Row> df = spark.sql("SELECT * FROM " + tableName);
- df.select(
- new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"),
- new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"),
- new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"),
- new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"),
- new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c"))
- .createOrReplaceTempView("v");
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")),
- sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v"));
- }
-}
diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java
deleted file mode 100644
index 7658b1e3eb..0000000000
--- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java
+++ /dev/null
@@ -1,1837 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.extensions;
-
-import static org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED;
-import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL;
-import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
-import static org.apache.iceberg.TableProperties.SPLIT_SIZE;
-import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE;
-import static org.apache.spark.sql.functions.lit;
-
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.iceberg.AssertHelpers;
-import org.apache.iceberg.DistributionMode;
-import org.apache.iceberg.exceptions.ValidationException;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
-import org.apache.spark.SparkException;
-import org.apache.spark.sql.AnalysisException;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
-import org.assertj.core.api.Assertions;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Assume;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public abstract class TestMerge extends SparkRowLevelOperationsTestBase {
-
- public TestMerge(
- String catalogName,
- String implementation,
- Map<String, String> config,
- String fileFormat,
- boolean vectorized,
- String distributionMode) {
- super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
- }
-
- @BeforeClass
- public static void setupSparkConf() {
- spark.conf().set("spark.sql.shuffle.partitions", "4");
- }
-
- @After
- public void removeTables() {
- sql("DROP TABLE IF EXISTS %s", tableName);
- sql("DROP TABLE IF EXISTS source");
- }
-
- // TODO: add tests for multiple NOT MATCHED clauses when we move to Spark 3.1
-
- @Test
- public void testMergeIntoEmptyTargetInsertAllNonMatchingRows() {
- createAndInitTable("id INT, dep STRING");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 3, \"dep\": \"emp-id-3\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN NOT MATCHED THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // new
- row(2, "emp-id-2"), // new
- row(3, "emp-id-3") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeIntoEmptyTargetInsertOnlyMatchingRows() {
- createAndInitTable("id INT, dep STRING");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 3, \"dep\": \"emp-id-3\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN NOT MATCHED AND (s.id >=2) THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(2, "emp-id-2"), // new
- row(3, "emp-id-3") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithOnlyUpdateClause() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-six\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(6, "emp-id-six") // kept
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithOnlyDeleteClause() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-one") // kept
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithAllCauses() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithAllCausesWithExplicitColumnSpecification() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET t.id = s.id, t.dep = s.dep "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT (t.id, t.dep) VALUES (s.id, s.dep)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithSourceCTE() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 5, \"dep\": \"emp-id-6\" }");
-
- sql(
- "WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) "
- + "MERGE INTO %s AS t USING cte1 AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 2 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 3 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(2, "emp-id-2"), // updated
- row(3, "emp-id-3") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithSourceFromSetOps() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- String derivedSource =
- "SELECT * FROM source WHERE id = 2 "
- + "UNION ALL "
- + "SELECT * FROM source WHERE id = 1 OR id = 6";
-
- sql(
- "MERGE INTO %s AS t USING (%s) AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName, derivedSource);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithMultipleUpdatesForTargetRow() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- String errorMsg = "a single row from the target table with multiple rows of the source table";
- AssertHelpers.assertThrows(
- "Should complain non iceberg target table",
- SparkException.class,
- errorMsg,
- () -> {
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
- });
-
- assertEquals(
- "Target should be unchanged",
- ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testMergeWithDisabledCardinalityCheck() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- try {
- // disable the cardinality check
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')",
- tableName, MERGE_CARDINALITY_CHECK_ENABLED, false);
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
- } finally {
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')",
- tableName, MERGE_CARDINALITY_CHECK_ENABLED, true);
- }
-
- assertEquals(
- "Should have expected rows",
- ImmutableList.of(row(1, "emp-id-1"), row(1, "emp-id-1"), row(2, "emp-id-2")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testMergeWithUnconditionalDelete() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithSingleConditionalDelete() {
- createAndInitTable(
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- String errorMsg = "a single row from the target table with multiple rows of the source table";
- AssertHelpers.assertThrows(
- "Should complain non iceberg target table",
- SparkException.class,
- errorMsg,
- () -> {
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
- });
-
- assertEquals(
- "Target should be unchanged",
- ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")),
- sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName));
- }
-
- @Test
- public void testMergeWithIdentityTransform() {
- for (DistributionMode mode : DistributionMode.values()) {
- createAndInitTable("id INT, dep STRING");
- sql("ALTER TABLE %s ADD PARTITION FIELD identity(dep)", tableName);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, mode.modeName());
-
- append(
- tableName,
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows",
- expectedRows,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- removeTables();
- }
- }
-
- @Test
- public void testMergeWithDaysTransform() {
- for (DistributionMode mode : DistributionMode.values()) {
- createAndInitTable("id INT, ts TIMESTAMP");
- sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, mode.modeName());
-
- append(
- tableName,
- "id INT, ts TIMESTAMP",
- "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n"
- + "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }");
-
- createOrReplaceView(
- "source",
- "id INT, ts TIMESTAMP",
- "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n"
- + "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n"
- + "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "2001-01-01 00:00:00"), // updated
- row(2, "2001-01-02 00:00:00") // new
- );
- assertEquals(
- "Should have expected rows",
- expectedRows,
- sql("SELECT id, CAST(ts AS STRING) FROM %s ORDER BY id", tableName));
-
- removeTables();
- }
- }
-
- @Test
- public void testMergeWithBucketTransform() {
- for (DistributionMode mode : DistributionMode.values()) {
- createAndInitTable("id INT, dep STRING");
- sql("ALTER TABLE %s ADD PARTITION FIELD bucket(2, dep)", tableName);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, mode.modeName());
-
- append(
- tableName,
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows",
- expectedRows,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- removeTables();
- }
- }
-
- @Test
- public void testMergeWithTruncateTransform() {
- for (DistributionMode mode : DistributionMode.values()) {
- createAndInitTable("id INT, dep STRING");
- sql("ALTER TABLE %s ADD PARTITION FIELD truncate(dep, 2)", tableName);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, mode.modeName());
-
- append(
- tableName,
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows",
- expectedRows,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- removeTables();
- }
- }
-
- @Test
- public void testMergeIntoPartitionedAndOrderedTable() {
- for (DistributionMode mode : DistributionMode.values()) {
- createAndInitTable("id INT, dep STRING");
- sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName);
- sql("ALTER TABLE %s WRITE ORDERED BY (id)", tableName);
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, WRITE_DISTRIBUTION_MODE, mode.modeName());
-
- append(
- tableName,
- "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- createOrReplaceView(
- "source",
- "id INT, dep STRING",
- "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n"
- + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n"
- + "{ \"id\": 6, \"dep\": \"emp-id-6\" }");
-
- sql(
- "MERGE INTO %s AS t USING source AS s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET * "
- + "WHEN MATCHED AND t.id = 6 THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND s.id = 2 THEN "
- + " INSERT *",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "emp-id-1"), // updated
- row(2, "emp-id-2") // new
- );
- assertEquals(
- "Should have expected rows",
- expectedRows,
- sql("SELECT * FROM %s ORDER BY id", tableName));
-
- removeTables();
- }
- }
-
- @Test
- public void testSelfMerge() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- sql(
- "MERGE INTO %s t USING %s s "
- + "ON t.id == s.id "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET v = 'x' "
- + "WHEN NOT MATCHED THEN "
- + " INSERT *",
- tableName, tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "x"), // updated
- row(2, "v2") // kept
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithSourceAsSelfSubquery() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView("source", Arrays.asList(1, null), Encoders.INT());
-
- sql(
- "MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s "
- + "ON t.id == s.value "
- + "WHEN MATCHED AND t.id = 1 THEN "
- + " UPDATE SET v = 'x' "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES ('invalid', -1) ",
- tableName, tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "x"), // updated
- row(2, "v2") // kept
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public synchronized void testMergeWithSerializableIsolation() throws InterruptedException {
- // cannot run tests with concurrency for Hadoop tables without atomic renames
- Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop"));
-
- createAndInitTable("id INT, dep STRING");
- createOrReplaceView("source", Collections.singletonList(1), Encoders.INT());
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, MERGE_ISOLATION_LEVEL, "serializable");
-
- ExecutorService executorService =
- MoreExecutors.getExitingExecutorService(
- (ThreadPoolExecutor) Executors.newFixedThreadPool(2));
-
- AtomicInteger barrier = new AtomicInteger(0);
-
- // merge thread
- Future<?> mergeFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql(
- "MERGE INTO %s t USING source s "
- + "ON t.id == s.value "
- + "WHEN MATCHED THEN "
- + " UPDATE SET dep = 'x'",
- tableName);
- barrier.incrementAndGet();
- }
- });
-
- // append thread
- Future<?> appendFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName);
- barrier.incrementAndGet();
- }
- });
-
- try {
- Assertions.assertThatThrownBy(mergeFuture::get)
- .isInstanceOf(ExecutionException.class)
- .cause()
- .isInstanceOf(SparkException.class)
- .cause()
- .isInstanceOf(ValidationException.class)
- .hasMessageContaining("Found conflicting files that can contain");
- } finally {
- appendFuture.cancel(true);
- }
-
- executorService.shutdown();
- Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
- }
-
- @Test
- public synchronized void testMergeWithSnapshotIsolation()
- throws InterruptedException, ExecutionException {
- // cannot run tests with concurrency for Hadoop tables without atomic renames
- Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop"));
-
- createAndInitTable("id INT, dep STRING");
- createOrReplaceView("source", Collections.singletonList(1), Encoders.INT());
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')",
- tableName, MERGE_ISOLATION_LEVEL, "snapshot");
-
- ExecutorService executorService =
- MoreExecutors.getExitingExecutorService(
- (ThreadPoolExecutor) Executors.newFixedThreadPool(2));
-
- AtomicInteger barrier = new AtomicInteger(0);
-
- // merge thread
- Future<?> mergeFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < 20; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql(
- "MERGE INTO %s t USING source s "
- + "ON t.id == s.value "
- + "WHEN MATCHED THEN "
- + " UPDATE SET dep = 'x'",
- tableName);
- barrier.incrementAndGet();
- }
- });
-
- // append thread
- Future<?> appendFuture =
- executorService.submit(
- () -> {
- for (int numOperations = 0; numOperations < 20; numOperations++) {
- while (barrier.get() < numOperations * 2) {
- sleep(10);
- }
- sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName);
- barrier.incrementAndGet();
- }
- });
-
- try {
- mergeFuture.get();
- } finally {
- appendFuture.cancel(true);
- }
-
- executorService.shutdown();
- Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
- }
-
- @Test
- public void testMergeWithExtraColumnsInSource() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
- createOrReplaceView(
- "source",
- "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n"
- + "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n"
- + "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }");
-
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN MATCHED THEN "
- + " UPDATE SET v = source.v "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "v1_1"), // new
- row(2, "v2"), // kept
- row(3, "v3"), // new
- row(4, "v4") // new
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName));
- }
-
- @Test
- public void testMergeWithNullsInTargetAndSource() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView(
- "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }");
-
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN MATCHED THEN "
- + " UPDATE SET v = source.v "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(null, "v1"), // kept
- row(null, "v1_1"), // new
- row(2, "v2"), // kept
- row(4, "v4") // new
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName));
- }
-
- @Test
- public void testMergeWithNullSafeEquals() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView(
- "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }");
-
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id <=> source.id "
- + "WHEN MATCHED THEN "
- + " UPDATE SET v = source.v "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(null, "v1_1"), // updated
- row(2, "v2"), // kept
- row(4, "v4") // new
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName));
- }
-
- @Test
- public void testMergeWithNullCondition() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView(
- "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }");
-
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id AND NULL "
- + "WHEN MATCHED THEN "
- + " UPDATE SET v = source.v "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(null, "v1"), // kept
- row(null, "v1_1"), // new
- row(2, "v2"), // kept
- row(2, "v2_2") // new
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName));
- }
-
- @Test
- public void testMergeWithNullActionConditions() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView(
- "source",
- "{ \"id\": 1, \"v\": \"v1_1\" }\n"
- + "{ \"id\": 2, \"v\": \"v2_2\" }\n"
- + "{ \"id\": 3, \"v\": \"v3_3\" }");
-
- // all conditions are NULL and will never match any rows
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN MATCHED AND source.id = 1 AND NULL THEN "
- + " UPDATE SET v = source.v "
- + "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows1 =
- ImmutableList.of(
- row(1, "v1"), // kept
- row(2, "v2") // kept
- );
- assertEquals(
- "Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName));
-
- // only the update and insert conditions are NULL
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN MATCHED AND source.id = 1 AND NULL THEN "
- + " UPDATE SET v = source.v "
- + "WHEN MATCHED AND source.v = 'v1_1' THEN "
- + " DELETE "
- + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows2 =
- ImmutableList.of(
- row(2, "v2") // kept
- );
- assertEquals(
- "Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName));
- }
-
- @Test
- public void testMergeWithMultipleMatchingActions() {
- createAndInitTable(
- "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }");
-
- createOrReplaceView(
- "source", "{ \"id\": 1, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }");
-
- // the order of match actions is important in this case
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN MATCHED AND source.id = 1 THEN "
- + " UPDATE SET v = source.v "
- + "WHEN MATCHED AND source.v = 'v1_1' THEN "
- + " DELETE "
- + "WHEN NOT MATCHED THEN "
- + " INSERT (v, id) VALUES (source.v, source.id)",
- tableName);
-
- ImmutableList<Object[]> expectedRows =
- ImmutableList.of(
- row(1, "v1_1"), // updated (also matches the delete cond but update is first)
- row(2, "v2") // kept (matches neither the update nor the delete cond)
- );
- assertEquals(
- "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName));
- }
-
- @Test
- public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException {
- Assume.assumeTrue(fileFormat.equalsIgnoreCase("parquet"));
-
- createAndInitTable("id INT, dep STRING");
- sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName);
-
- sql(
- "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')",
- tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100);
- sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100);
-
- createOrReplaceView("source", Collections.singletonList(1), Encoders.INT());
-
- List<Integer> ids = Lists.newArrayList();
- for (int id = 1; id <= 200; id++) {
- ids.add(id);
- }
- Dataset<Row> df =
- spark
- .createDataset(ids, Encoders.INT())
- .withColumnRenamed("value", "id")
- .withColumn("dep", lit("hr"));
- df.coalesce(1).writeTo(tableName).append();
-
- Assert.assertEquals(200, spark.table(tableName).count());
-
- // update a record from one of two row groups and copy over the second one
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.value "
- + "WHEN MATCHED THEN "
- + " UPDATE SET dep = 'x'",
- tableName);
-
- Assert.assertEquals(200, spark.table(tableName).count());
- }
-
- @Test
- public void testMergeInsertOnly() {
- createAndInitTable(
- "id STRING, v STRING",
- "{ \"id\": \"a\", \"v\": \"v1\" }\n" + "{ \"id\": \"b\", \"v\": \"v2\" }");
- createOrReplaceView(
- "source",
- "{ \"id\": \"a\", \"v\": \"v1_1\" }\n"
- + "{ \"id\": \"a\", \"v\": \"v1_2\" }\n"
- + "{ \"id\": \"c\", \"v\": \"v3\" }\n"
- + "{ \"id\": \"d\", \"v\": \"v4_1\" }\n"
- + "{ \"id\": \"d\", \"v\": \"v4_2\" }");
-
- sql(
- "MERGE INTO %s t USING source "
- + "ON t.id == source.id "
- + "WHEN NOT MATCHED THEN "
... 59518 lines suppressed ...