You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2012/06/15 00:40:40 UTC

[2/3] git commit: avoid promoting tiny sstables patch by jbellis; reviewed by yukim for CASSANDRA-4341

avoid promoting tiny sstables
patch by jbellis; reviewed by yukim for CASSANDRA-4341


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/a4fab90a
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/a4fab90a
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/a4fab90a

Branch: refs/heads/trunk
Commit: a4fab90af0063110e1118fb69b9c5923a415b9d3
Parents: 33f1bac
Author: Jonathan Ellis <jb...@apache.org>
Authored: Thu Jun 14 14:30:36 2012 -0500
Committer: Jonathan Ellis <jb...@apache.org>
Committed: Thu Jun 14 17:40:15 2012 -0500

----------------------------------------------------------------------
 CHANGES.txt                                        |    1 +
 .../cassandra/db/compaction/LeveledManifest.java   |   71 +++++++++++----
 2 files changed, 55 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/a4fab90a/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index fcfff37..b0e667d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 1.1.2
+ * LCS no longer promotes tiny sstables out of L0 (CASSANDRA-4341)
  * skip tombstones during hint replay (CASSANDRA-4320)
  * fix NPE in compactionstats (CASSANDRA-4318)
  * enforce 1m min keycache for auto (CASSANDRA-4306)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/a4fab90a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index b2ba857..4ed5fac 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
@@ -63,12 +63,12 @@ public class LeveledManifest
     private final List<SSTableReader>[] generations;
     private final Map<SSTableReader, Integer> sstableGenerations;
     private final RowPosition[] lastCompactedKeys;
-    private final int maxSSTableSizeInMB;
+    private final int maxSSTableSizeInBytes;
 
     private LeveledManifest(ColumnFamilyStore cfs, int maxSSTableSizeInMB)
     {
         this.cfs = cfs;
-        this.maxSSTableSizeInMB = maxSSTableSizeInMB;
+        this.maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024 * 1024;
 
         // allocate enough generations for a PB of data
         int n = (int) Math.log10(1000 * 1000 * 1000 / maxSSTableSizeInMB);
@@ -178,9 +178,18 @@ public class LeveledManifest
         if (!added.iterator().hasNext())
             return;
 
-        int newLevel = minimumLevel == maximumLevel ? maximumLevel + 1 : maximumLevel;
-        newLevel = skipLevels(newLevel, added);
-        assert newLevel > 0;
+        int newLevel;
+        if (minimumLevel == 0 && maximumLevel == 0 && SSTable.getTotalBytes(removed) < maxSSTableSizeInBytes)
+        {
+            // special case for tiny L0 sstables; see CASSANDRA-4341
+            newLevel = 0;
+        }
+        else
+        {
+            newLevel = minimumLevel == maximumLevel ? maximumLevel + 1 : maximumLevel;
+            newLevel = skipLevels(newLevel, added);
+            assert newLevel > 0;
+        }
         if (logger.isDebugEnabled())
             logger.debug("Adding [{}] at L{}", toString(added), newLevel);
 
@@ -225,8 +234,8 @@ public class LeveledManifest
     private long maxBytesForLevel(int level)
     {
         if (level == 0)
-            return 4L * maxSSTableSizeInMB * 1024 * 1024;
-        double bytes = Math.pow(10, level) * maxSSTableSizeInMB * 1024 * 1024;
+            return 4L * maxSSTableSizeInBytes;
+        double bytes = Math.pow(10, level) * maxSSTableSizeInBytes;
         if (bytes > Long.MAX_VALUE)
             throw new RuntimeException("At most " + Long.MAX_VALUE + " bytes may be in a compaction level; your maxSSTableSize must be absurdly high to compute " + bytes);
         return (long) bytes;
@@ -265,7 +274,7 @@ public class LeveledManifest
 
             // L0 gets a special case that if we don't have anything more important to do,
             // we'll go ahead and compact even just one sstable
-            if (score > 1.001 || i == 0)
+            if (score > 1.001 || (i == 0 && sstables.size() > 1))
             {
                 Collection<SSTableReader> candidates = getCandidatesFor(i);
 
@@ -368,17 +377,45 @@ public class LeveledManifest
 
         if (level == 0)
         {
-            // because L0 files may overlap each other, we treat compactions there specially:
-            // a L0 compaction also checks other L0 files for overlap.
+            // L0 is the dumping ground for new sstables which thus may overlap each other.
+            //
+            // We treat L0 compactions specially:
+            // 1a. add sstables to the candidate set until we have at least maxSSTableSizeInMB
+            // 1b. prefer choosing older sstables as candidates, to newer ones
+            // 1c. any L0 sstables that overlap a candidate, will also become candidates
+            // 2. At most MAX_COMPACTING_L0 sstables will be compacted at once
+            // 3. If total candidate size is less than maxSSTableSizeInMB, we won't bother compacting with L1,
+            //    and the result of the compaction will stay in L0 instead of being promoted (see promote())
             Set<SSTableReader> candidates = new HashSet<SSTableReader>();
-            // pick the oldest sstable from L0, and any that overlap with it
+            Set<SSTableReader> remaining = new HashSet<SSTableReader>(generations[0]);
             List<SSTableReader> ageSortedSSTables = new ArrayList<SSTableReader>(generations[0]);
             Collections.sort(ageSortedSSTables, SSTable.maxTimestampComparator);
-            List<SSTableReader> L0 = overlapping(ageSortedSSTables.get(0), generations[0]);
-            L0 = L0.size() > MAX_COMPACTING_L0 ? L0.subList(0, MAX_COMPACTING_L0) : L0;
-            // add the overlapping ones from L1
-            for (SSTableReader sstable : L0)
-                candidates.addAll(overlapping(sstable, generations[1]));
+            for (SSTableReader sstable : ageSortedSSTables)
+            {
+                if (candidates.contains(sstable))
+                    continue;
+
+                List<SSTableReader> newCandidates = overlapping(sstable, remaining);
+                candidates.addAll(newCandidates);
+                remaining.removeAll(newCandidates);
+
+                if (candidates.size() > MAX_COMPACTING_L0)
+                {
+                    // limit to only the MAX_COMPACTING_L0 oldest candidates
+                    List<SSTableReader> ageSortedCandidates = new ArrayList<SSTableReader>(candidates);
+                    Collections.sort(ageSortedCandidates, SSTable.maxTimestampComparator);
+                    return ageSortedCandidates.subList(0, MAX_COMPACTING_L0);
+                }
+
+                if (SSTable.getTotalBytes(candidates) > maxSSTableSizeInBytes)
+                {
+                    // add sstables from L1 that overlap candidates
+                    for (SSTableReader candidate : new ArrayList<SSTableReader>(candidates))
+                        candidates.addAll(overlapping(candidate, generations[1]));
+                    break;
+                }
+            }
+
             return candidates;
         }
 
@@ -470,7 +507,7 @@ public class LeveledManifest
         for (int i = generations.length - 1; i >= 0; i--)
         {
             List<SSTableReader> sstables = generations[i];
-            estimated[i] = Math.max(0L, SSTableReader.getTotalBytes(sstables) - maxBytesForLevel(i)) / (maxSSTableSizeInMB * 1024L * 1024);
+            estimated[i] = Math.max(0L, SSTableReader.getTotalBytes(sstables) - maxBytesForLevel(i)) / maxSSTableSizeInBytes;
             tasks += estimated[i];
         }