You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2020/08/14 16:29:26 UTC
[GitHub] [iceberg] rdblue commented on a change in pull request #1338: Add file stats range optimizations for DeleteFileIndex

rdblue commented on a change in pull request #1338:
URL: https://github.com/apache/iceberg/pull/1338#discussion_r470728905



##########
File path: core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
##########
@@ -96,21 +100,101 @@ private StructLikeWrapper newWrapper(int specId) {
     Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), file.partition());
     Pair<long[], DeleteFile[]> partitionDeletes = sortedDeletesByPartition.get(partition);
 
+    Stream<DeleteFile> matchingDeletes;
     if (partitionDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
     } else if (globalDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second());
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second());
     } else {
-      return Stream.concat(
-          Stream.of(limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes)),
-          Stream.of(limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()))
-      ).toArray(DeleteFile[]::new);
+      matchingDeletes = Stream.concat(
+          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
+          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()));
+    }
+
+    return matchingDeletes
+        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, schema))
+        .toArray(DeleteFile[]::new);
+  }
+
+  private static boolean canContainDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) {
+    switch (deleteFile.content()) {
+      case POSITION_DELETES:
+        // check that the delete file can contain the data file's file_path
+        Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
+        if (lowers == null || uppers == null) {
+          return true;
+        }
+
+        Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
+        int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
+        ByteBuffer lower = lowers.get(pathId);
+        if (lower != null &&
+            Comparators.charSequences().compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) {
+          return false;
+        }
+
+        ByteBuffer upper = uppers.get(pathId);
+        if (upper != null &&
+            Comparators.charSequences().compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) {
+          return false;
+        }
+
+        break;
+
+      case EQUALITY_DELETES:
+        if (dataFile.lowerBounds() == null || dataFile.upperBounds() == null ||
+            deleteFile.lowerBounds() == null || deleteFile.upperBounds() == null) {
+          return true;
+        }
+
+        Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
+        Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
+        Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();
+
+        for (int id : deleteFile.equalityFieldIds()) {
+          Type type = schema.findType(id);
+          if (!type.isPrimitiveType()) {
+            return true;
+          }
+
+          if (!rangesOverlap(type.asPrimitiveType(),
+              dataLowers.get(id), dataUppers.get(id), deleteLowers.get(id), deleteUppers.get(id))) {

Review comment:
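       Yes, the stats for a field may be missing. It looks like I missed that case here in the equality delete check, where the per-field bounds are passed along without a null check, but it is handled in the check for position deletes.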
   
   I'll update this. Good catch!




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org