You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by sl...@apache.org on 2012/11/13 08:57:40 UTC

[2/2] git commit: Optimize mostRecentTomstone check in CC.collectAllData

Optimize mostRecentTomstone check in CC.collectAllData

patch by slebresne; reviewed by jbellis for CASSANDRA-4883


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/53943180
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/53943180
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/53943180

Branch: refs/heads/cassandra-1.2.0
Commit: 53943180a4a41ecc5e145d1e08b0ea4a9d849c8c
Parents: 9118764
Author: Sylvain Lebresne <sy...@datastax.com>
Authored: Tue Nov 13 08:54:37 2012 +0100
Committer: Sylvain Lebresne <sy...@datastax.com>
Committed: Tue Nov 13 08:54:37 2012 +0100

----------------------------------------------------------------------
 CHANGES.txt                                        |    1 +
 .../org/apache/cassandra/config/CFMetaData.java    |    3 +-
 .../apache/cassandra/db/CollationController.java   |   33 +++++++--------
 src/java/org/apache/cassandra/db/DataTracker.java  |   31 ++++++--------
 .../cassandra/db/CollationControllerTest.java      |   20 +++++----
 5 files changed, 43 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 43fc53a..77cc8a8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,7 @@
  * Separate tracing from Log4J (CASSANDRA-4861)
  * Exclude gcable tombstones from merkle-tree computation (CASSANDRA-4905)
  * Better printing of AbstractBounds for tracing (CASSANDRA-4931)
+ * Optimize mostRecentTomstone check in CC.collectAllData (CASSANDRA-4883)
 
 
 1.2-beta2

http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/config/CFMetaData.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/config/CFMetaData.java b/src/java/org/apache/cassandra/config/CFMetaData.java
index 921242a..c3a00b1 100644
--- a/src/java/org/apache/cassandra/config/CFMetaData.java
+++ b/src/java/org/apache/cassandra/config/CFMetaData.java
@@ -181,7 +181,8 @@ public final class CFMetaData
                                                          + "thrift_version text,"
                                                          + "cql_version text,"
                                                          + "data_center text,"
-                                                         + "rack text"
+                                                         + "rack text,"
+                                                         + "partitioner text,"
                                                          + ") WITH COMMENT='information about the local node'");
 
     public static final CFMetaData TraceSessionsCf = compile(14, "CREATE TABLE " + Tracing.SESSIONS_CF + " ("

http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/CollationController.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java
index 80d0a2c..73c675f 100644
--- a/src/java/org/apache/cassandra/db/CollationController.java
+++ b/src/java/org/apache/cassandra/db/CollationController.java
@@ -241,13 +241,27 @@ public class CollationController
                 }
             }
 
+            /*
+             * We can't eliminate full sstables based on the timestamp of what we've already read like
+             * in collectTimeOrderedData, but we still want to eliminate sstable whose maxTimestamp < mostRecentTombstone
+             * we've read. We still rely on the sstable ordering by maxTimestamp since if
+             *   maxTimestamp_s1 > maxTimestamp_s0,
+             * we're guaranteed that s1 cannot have a row tombstone such that
+             *   timestamp(tombstone) > maxTimestamp_s0
+             * since we necessarily have
+             *   timestamp(tombstone) <= maxTimestamp_s1
+             * In othere words, iterating in maxTimestamp order allow to do our mostRecentTombstone elimination
+             * in one pass, and minimize the number of sstables for which we read a rowTombstone.
+             */
+            Collections.sort(view.sstables, SSTable.maxTimestampComparator);
+
             long mostRecentRowTombstone = Long.MIN_VALUE;
             for (SSTableReader sstable : view.sstables)
             {
                 // if we've already seen a row tombstone with a timestamp greater
                 // than the most recent update to this sstable, we can skip it
                 if (sstable.getMaxTimestamp() < mostRecentRowTombstone)
-                    continue;
+                    break;
 
                 OnDiskAtomIterator iter = filter.getSSTableColumnIterator(sstable);
                 iterators.add(iter);
@@ -262,23 +276,6 @@ public class CollationController
                 }
             }
 
-            // If we saw a row tombstone, do a second pass through the iterators we
-            // obtained from the sstables and drop any whose maxTimestamp < that of the
-            // row tombstone
-            {
-                Iterator<OnDiskAtomIterator> it = iterators.iterator();
-                while (it.hasNext())
-                {
-                    OnDiskAtomIterator iter = it.next();
-                    if ((iter instanceof ISSTableColumnIterator)
-                        && ((ISSTableColumnIterator) iter).getSStable().getMaxTimestamp() < mostRecentRowTombstone)
-                    {
-                        FileUtils.closeQuietly(iter);
-                        it.remove();
-                    }
-                }
-            }
-
             // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently)
             // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower)
             if (iterators.isEmpty())

http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/src/java/org/apache/cassandra/db/DataTracker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/DataTracker.java b/src/java/org/apache/cassandra/db/DataTracker.java
index 85da449..7e8887f 100644
--- a/src/java/org/apache/cassandra/db/DataTracker.java
+++ b/src/java/org/apache/cassandra/db/DataTracker.java
@@ -64,7 +64,7 @@ public class DataTracker
         return view.get().memtablesPendingFlush;
     }
 
-    public List<SSTableReader> getSSTables()
+    public Set<SSTableReader> getSSTables()
     {
         return view.get().sstables;
     }
@@ -300,7 +300,7 @@ public class DataTracker
     {
         view.set(new View(new Memtable(cfstore),
                           Collections.<Memtable>emptySet(),
-                          Collections.<SSTableReader>emptyList(),
+                          Collections.<SSTableReader>emptySet(),
                           Collections.<SSTableReader>emptySet(),
                           SSTableIntervalTree.empty()));
     }
@@ -456,15 +456,10 @@ public class DataTracker
         public final Memtable memtable;
         public final Set<Memtable> memtablesPendingFlush;
         public final Set<SSTableReader> compacting;
-        // We can't use a SortedSet here because "the ordering maintained by a sorted set (whether or not an
-        // explicit comparator is provided) must be <i>consistent with equals</i>."  In particular,
-        // ImmutableSortedSet will ignore any objects that compare equally with an existing Set member.
-        // Obviously, dropping sstables whose max column timestamp happens to be equal to another's
-        // is not acceptable for us.  So, we use a List instead.
-        public final List<SSTableReader> sstables;
+        public final Set<SSTableReader> sstables;
         public final SSTableIntervalTree intervalTree;
 
-        View(Memtable memtable, Set<Memtable> pendingFlush, List<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree)
+        View(Memtable memtable, Set<Memtable> pendingFlush, Set<SSTableReader> sstables, Set<SSTableReader> compacting, SSTableIntervalTree intervalTree)
         {
             this.memtable = memtable;
             this.memtablesPendingFlush = pendingFlush;
@@ -492,18 +487,18 @@ public class DataTracker
         public View replaceFlushed(Memtable flushedMemtable, SSTableReader newSSTable)
         {
             Set<Memtable> newPending = ImmutableSet.copyOf(Sets.difference(memtablesPendingFlush, Collections.singleton(flushedMemtable)));
-            List<SSTableReader> newSSTables = newSSTable == null
-                                            ? Collections.<SSTableReader>emptyList()
+            Set<SSTableReader> newSSTables = newSSTable == null
+                                            ? Collections.<SSTableReader>emptySet()
                                             : newSSTables(newSSTable);
             SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables);
-            return new View(memtable, newPending, Collections.unmodifiableList(newSSTables), compacting, intervalTree);
+            return new View(memtable, newPending, newSSTables, compacting, intervalTree);
         }
 
         public View replace(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
         {
-            List<SSTableReader> newSSTables = newSSTables(oldSSTables, replacements);
+            Set<SSTableReader> newSSTables = newSSTables(oldSSTables, replacements);
             SSTableIntervalTree intervalTree = buildIntervalTree(newSSTables);
-            return new View(memtable, memtablesPendingFlush, Collections.unmodifiableList(newSSTables), compacting, intervalTree);
+            return new View(memtable, memtablesPendingFlush, newSSTables, compacting, intervalTree);
         }
 
         public View markCompacting(Collection<SSTableReader> tomark)
@@ -518,19 +513,19 @@ public class DataTracker
             return new View(memtable, memtablesPendingFlush, sstables, compactingNew, intervalTree);
         }
 
-        private List<SSTableReader> newSSTables(SSTableReader newSSTable)
+        private Set<SSTableReader> newSSTables(SSTableReader newSSTable)
         {
             assert newSSTable != null;
             // not performance-sensitive, don't obsess over doing a selection merge here
             return newSSTables(Collections.<SSTableReader>emptyList(), Collections.singletonList(newSSTable));
         }
 
-        private List<SSTableReader> newSSTables(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
+        private Set<SSTableReader> newSSTables(Collection<SSTableReader> oldSSTables, Iterable<SSTableReader> replacements)
         {
             ImmutableSet<SSTableReader> oldSet = ImmutableSet.copyOf(oldSSTables);
             int newSSTablesSize = sstables.size() - oldSSTables.size() + Iterables.size(replacements);
             assert newSSTablesSize >= Iterables.size(replacements) : String.format("Incoherent new size %d replacing %s by %s in %s", newSSTablesSize, oldSSTables, replacements, this);
-            List<SSTableReader> newSSTables = new ArrayList<SSTableReader>(newSSTablesSize);
+            Set<SSTableReader> newSSTables = new HashSet<SSTableReader>(newSSTablesSize);
             for (SSTableReader sstable : sstables)
             {
                 if (!oldSet.contains(sstable))
@@ -538,7 +533,7 @@ public class DataTracker
             }
             Iterables.addAll(newSSTables, replacements);
             assert newSSTables.size() == newSSTablesSize : String.format("Expecting new size of %d, got %d while replacing %s by %s in %s", newSSTablesSize, newSSTables.size(), oldSSTables, replacements, this);
-            return newSSTables;
+            return ImmutableSet.copyOf(newSSTables);
         }
 
         @Override

http://git-wip-us.apache.org/repos/asf/cassandra/blob/53943180/test/unit/org/apache/cassandra/db/CollationControllerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/db/CollationControllerTest.java b/test/unit/org/apache/cassandra/db/CollationControllerTest.java
index f469639..4a8e426 100644
--- a/test/unit/org/apache/cassandra/db/CollationControllerTest.java
+++ b/test/unit/org/apache/cassandra/db/CollationControllerTest.java
@@ -30,6 +30,8 @@ import org.apache.cassandra.db.filter.QueryPath;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.junit.Test;
 
+import org.apache.cassandra.io.sstable.SSTableReader;
+
 public class CollationControllerTest extends SchemaLoader
 {
     @Test
@@ -61,20 +63,22 @@ public class CollationControllerTest extends SchemaLoader
         
         store.forceBlockingFlush();
 
+        // add yet one more mutation
+        rm = new RowMutation("Keyspace1", dk.key);
+        rm.add(path, ByteBufferUtil.bytes("foobar"), 30);
+        rm.apply();
+        store.forceBlockingFlush();
+
         // A NamesQueryFilter goes down one code path (through collectTimeOrderedData())
+        // It should only iterate the last flushed sstable, since it probably contains the most recent value for Column1
         QueryFilter filter = QueryFilter.getNamesFilter(dk, path, ByteBufferUtil.bytes("Column1"));
         CollationController controller = new CollationController(store, false, filter, Integer.MIN_VALUE);
         controller.getTopLevelColumns();
         assertEquals(1, controller.getSstablesIterated());
-        
+
         // SliceQueryFilter goes down another path (through collectAllData())
-        // Add another mutation, with a lower timestamp then force another flush 
-        // so we can assert that we're not reading every sstable 
-        rm = new RowMutation("Keyspace1", dk.key);
-        rm.add(path, ByteBufferUtil.bytes("asdf"), 5);
-        rm.apply();
-        store.forceBlockingFlush();
-        
+        // We will read "only" the last sstable in that case, but because the 2nd sstable has a tombstone that is more
+        // recent than the maxTimestamp of the very first sstable we flushed, we should only read the 2 first sstables.
         filter = QueryFilter.getIdentityFilter(dk, path);
         controller = new CollationController(store, false, filter, Integer.MIN_VALUE);
         controller.getTopLevelColumns();