You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2013/11/28 18:55:32 UTC

git commit: Make flush single-pass when partition tombstones are present patch by jbellis; reviewed by slebresne for CASSANDRA-6417

Updated Branches:
  refs/heads/trunk 122bb6b20 -> 58bcdd493


Make flush single-pass when partition tombstones are present
patch by jbellis; reviewed by slebresne for CASSANDRA-6417


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/58bcdd49
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/58bcdd49
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/58bcdd49

Branch: refs/heads/trunk
Commit: 58bcdd493280252d69151361676e60ce8974736d
Parents: 122bb6b
Author: Jonathan Ellis <jb...@apache.org>
Authored: Thu Nov 28 11:55:04 2013 -0600
Committer: Jonathan Ellis <jb...@apache.org>
Committed: Thu Nov 28 11:55:04 2013 -0600

----------------------------------------------------------------------
 .../org/apache/cassandra/db/ColumnFamilyStore.java  | 16 ++--------------
 src/java/org/apache/cassandra/db/ColumnIndex.java   | 16 +++++++++++-----
 src/java/org/apache/cassandra/db/DeletionTime.java  |  4 ++--
 src/java/org/apache/cassandra/db/Memtable.java      | 10 ----------
 4 files changed, 15 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/58bcdd49/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index ccc15ab..86bb7b4 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -873,7 +873,7 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
     }
 
     /*
-     This is complicated because we need to preserve deleted columns, supercolumns, and columnfamilies
+     This is complicated because we need to preserve deleted columns and columnfamilies
      until they have been deleted for at least GC_GRACE_IN_SECONDS.  But, we do not need to preserve
      their contents; just the object itself as a "tombstone" that can be used to repair other
      replicas that do not know about the deletion.
@@ -885,16 +885,9 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
             return null;
         }
 
-        removeDeletedColumnsOnly(cf, gcBefore, indexer);
-        return removeDeletedCF(cf, gcBefore);
-    }
-
-    private static long removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
-    {
         Iterator<Column> iter = cf.iterator();
         DeletionInfo.InOrderTester tester = cf.inOrderDeletionTester();
         boolean hasDroppedColumns = !cf.metadata.getDroppedColumns().isEmpty();
-        long removedBytes = 0;
         while (iter.hasNext())
         {
             Column c = iter.next();
@@ -906,15 +899,10 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
             {
                 iter.remove();
                 indexer.remove(c);
-                removedBytes += c.dataSize();
             }
         }
-        return removedBytes;
-    }
 
-    public static long removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore)
-    {
-        return removeDeletedColumnsOnly(cf, gcBefore, SecondaryIndexManager.nullUpdater);
+        return removeDeletedCF(cf, gcBefore);
     }
 
     // returns true if

http://git-wip-us.apache.org/repos/asf/cassandra/blob/58bcdd49/src/java/org/apache/cassandra/db/ColumnIndex.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/db/ColumnIndex.java
index 6dd2028..8729238 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/db/ColumnIndex.java
@@ -119,18 +119,25 @@ public class ColumnIndex
         public ColumnIndex build(ColumnFamily cf) throws IOException
         {
             // cf has disentangled the columns and range tombstones, we need to re-interleave them in comparator order
+            Comparator<ByteBuffer> comparator = cf.getComparator();
+            DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester();
             Iterator<RangeTombstone> rangeIter = cf.deletionInfo().rangeIterator();
             RangeTombstone tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
-            Comparator<ByteBuffer> comparator = cf.getComparator();
 
             for (Column c : cf)
             {
                 while (tombstone != null && comparator.compare(c.name(), tombstone.min) >= 0)
                 {
-                    add(tombstone);
+                    // skip range tombstones that are shadowed by partition tombstones
+                    if (!cf.deletionInfo().getTopLevelDeletion().isDeleted(tombstone))
+                        add(tombstone);
                     tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
                 }
-                add(c);
+
+                // We can skip any cell if it's shadowed by a tombstone already.  This is a more
+                // general case than was handled by CASSANDRA-2589.
+                if (!tester.isDeleted(c))
+                    add(c);
             }
 
             while (tombstone != null)
@@ -158,9 +165,8 @@ public class ColumnIndex
                 OnDiskAtom c =  columns.next();
                 add(c);
             }
-            ColumnIndex index = build();
 
-            return index;
+            return build();
         }
 
         public void add(OnDiskAtom column) throws IOException

http://git-wip-us.apache.org/repos/asf/cassandra/blob/58bcdd49/src/java/org/apache/cassandra/db/DeletionTime.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/DeletionTime.java b/src/java/org/apache/cassandra/db/DeletionTime.java
index 3d6fad4..79244a6 100644
--- a/src/java/org/apache/cassandra/db/DeletionTime.java
+++ b/src/java/org/apache/cassandra/db/DeletionTime.java
@@ -83,9 +83,9 @@ public class DeletionTime implements Comparable<DeletionTime>
         return localDeletionTime < gcBefore;
     }
 
-    public boolean isDeleted(Column column)
+    public boolean isDeleted(OnDiskAtom atom)
     {
-        return column.timestamp() <= markedForDeleteAt;
+        return atom.maxTimestamp() <= markedForDeleteAt;
     }
 
     public long memorySize()

http://git-wip-us.apache.org/repos/asf/cassandra/blob/58bcdd49/src/java/org/apache/cassandra/db/Memtable.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
index 6819cda..064851a 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -351,16 +351,6 @@ public class Memtable
                         // See CASSANDRA-4667.
                         if (cfs.name.equals(SystemKeyspace.BATCHLOG_CF) && cfs.keyspace.getName().equals(Keyspace.SYSTEM_KS) && !(cf.getColumnCount() == 0))
                             continue;
-
-                        // Pedantically, you could purge column level tombstones that are past GcGRace when writing to the SSTable.
-                        // But it can result in unexpected behaviour where deletes never make it to disk,
-                        // as they are lost and so cannot override existing column values. So we only remove deleted columns if there
-                        // is a CF level tombstone to ensure the delete makes it into an SSTable.
-                        // We also shouldn't be dropping any columns obsoleted by partition and/or range tombstones in case
-                        // the table has secondary indexes, or else the stale entries wouldn't be cleaned up during compaction,
-                        // and will only be dropped during 2i query read-repair, if at all.
-                        if (!cfs.indexManager.hasIndexes())
-                            currentSize.addAndGet(-ColumnFamilyStore.removeDeletedColumnsOnly(cf, Integer.MIN_VALUE));
                     }
 
                     if (cf.getColumnCount() > 0 || cf.isMarkedForDelete())