You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2019/05/03 01:53:46 UTC

[GitHub] [incubator-iceberg] mccheah commented on a change in pull request #153: [Baseline] Apply baseline linting to iceberg-core

mccheah commented on a change in pull request #153: [Baseline] Apply baseline linting to iceberg-core
URL: https://github.com/apache/incubator-iceberg/pull/153#discussion_r280648571
 
 

 ##########
 File path: core/src/main/java/org/apache/iceberg/MergingSnapshotUpdate.java
 ##########
 @@ -355,80 +351,110 @@ private ManifestFile filterManifest(Expression deleteExpression,
       // this assumes that the manifest doesn't have files to remove and streams through the
       // manifest without copying data. if a manifest does have a file to remove, this will break
       // out of the loop and move on to filtering the manifest.
-      boolean hasDeletedFiles = false;
-      for (ManifestEntry entry : reader.entries()) {
-        DataFile file = entry.file();
-        boolean fileDelete = (deletePaths.contains(pathWrapper.set(file.path())) ||
-            dropPartitions.contains(partitionWrapper.set(file.partition())));
-        if (fileDelete || inclusive.eval(file.partition())) {
-          ValidationException.check(
-              fileDelete || strict.eval(file.partition()) || metricsEvaluator.eval(file),
-              "Cannot delete file where some, but not all, rows match filter %s: %s",
-              this.deleteExpression, file.path());
-
-          hasDeletedFiles = true;
-          if (failAnyDelete) {
-            throw new DeleteException(writeSpec().partitionToPath(file.partition()));
-          }
-          break; // as soon as a deleted file is detected, stop scanning
-        }
-      }
+      boolean hasDeletedFiles =
+          manifestHasDeletedFiles(metricsEvaluator, reader, inclusive, strict, pathWrapper, partitionWrapper);
 
       if (!hasDeletedFiles) {
         filteredManifests.put(manifest, manifest);
         return manifest;
       }
 
-      // when this point is reached, there is at least one file that will be deleted in the
-      // manifest. produce a copy of the manifest with all deleted files removed.
-      List<DataFile> deletedFiles = Lists.newArrayList();
-      Set<CharSequenceWrapper> deletedPaths = Sets.newHashSet();
-      OutputFile filteredCopy = manifestPath(manifestCount.getAndIncrement());
-      ManifestWriter writer = new ManifestWriter(reader.spec(), filteredCopy, snapshotId());
-      try {
-        for (ManifestEntry entry : reader.entries()) {
-          DataFile file = entry.file();
-          boolean fileDelete = (deletePaths.contains(pathWrapper.set(file.path())) ||
-              dropPartitions.contains(partitionWrapper.set(file.partition())));
-          if (entry.status() != Status.DELETED) {
-            if (fileDelete || inclusive.eval(file.partition())) {
-              ValidationException.check(
-                  fileDelete || strict.eval(file.partition()) || metricsEvaluator.eval(file),
-                  "Cannot delete file where some, but not all, rows match filter %s: %s",
-                  this.deleteExpression, file.path());
-
-              writer.delete(entry);
-
-              CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path());
-              if (deletedPaths.contains(wrapper)) {
-                LOG.warn("Deleting a duplicate path from manifest {}: {}",
-                    manifest.path(), wrapper.get());
-                summaryBuilder.incrementDuplicateDeletes();
-              } else {
-                // only add the file to deletes if it is a new delete
-                // this keeps the snapshot summary accurate for non-duplicate data
-                deletedFiles.add(entry.file().copy());
-              }
-              deletedPaths.add(wrapper);
+      return filterManifestsWithDeletedFiles(
+          metricsEvaluator,
+          manifest,
+          reader,
+          inclusive,
+          strict,
+          pathWrapper,
+          partitionWrapper);
+    }
+  }
 
+  private boolean manifestHasDeletedFiles(
+      StrictMetricsEvaluator metricsEvaluator,
+      ManifestReader reader,
+      Evaluator inclusive,
+      Evaluator strict,
+      CharSequenceWrapper pathWrapper,
+      StructLikeWrapper partitionWrapper) {
+    boolean hasDeletedFiles = false;
+    for (ManifestEntry entry : reader.entries()) {
+      DataFile file = entry.file();
+      boolean fileDelete = deletePaths.contains(pathWrapper.set(file.path())) ||
+          dropPartitions.contains(partitionWrapper.set(file.partition()));
+      if (fileDelete || inclusive.eval(file.partition())) {
+        ValidationException.check(
+            fileDelete || strict.eval(file.partition()) || metricsEvaluator.eval(file),
+            "Cannot delete file where some, but not all, rows match filter %s: %s",
+            this.deleteExpression, file.path());
+
+        hasDeletedFiles = true;
+        if (failAnyDelete) {
+          throw new DeleteException(writeSpec().partitionToPath(file.partition()));
+        }
+        break; // as soon as a deleted file is detected, stop scanning
+      }
+    }
+    return hasDeletedFiles;
+  }
+
+  private ManifestFile filterManifestsWithDeletedFiles(
+      StrictMetricsEvaluator metricsEvaluator,
+      ManifestFile manifest,
+      ManifestReader reader,
+      Evaluator inclusive,
 
 Review comment:
   These evaluators are used both here and in the other helper method, `manifestHasDeletedFiles`. Hence why we evaluate them at the level above and then pass through method args.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org