You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2013/04/19 20:27:51 UTC

[2/2] git commit: Always record row-level tombstones in index component; this time from the correct feature branch. Patch by jbellis; reviewed by jasobrown for CASSANDRA-5487

Always record row-level tombstones in index component; this time from the correct feature branch
patch by jbellis; reviewed by jasobrown for CASSANDRA-5487


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/d7a09825
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/d7a09825
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/d7a09825

Branch: refs/heads/cassandra-1.2
Commit: d7a09825025374b6a49a250467039dc15f36d053
Parents: 17034c0
Author: Jonathan Ellis <jb...@apache.org>
Authored: Fri Apr 19 13:27:22 2013 -0500
Committer: Jonathan Ellis <jb...@apache.org>
Committed: Fri Apr 19 13:27:46 2013 -0500

----------------------------------------------------------------------
 build.xml                                          |    2 +-
 src/java/org/apache/cassandra/db/ColumnIndex.java  |    5 +-
 src/java/org/apache/cassandra/db/DeletionInfo.java |    5 +
 .../org/apache/cassandra/db/RowIndexEntry.java     |   99 +++++------
 .../db/columniterator/IndexedSliceReader.java      |   91 +++++-----
 .../db/columniterator/SSTableNamesIterator.java    |  131 +++++----------
 .../db/columniterator/SimpleSliceReader.java       |    9 +-
 .../apache/cassandra/io/sstable/Descriptor.java    |    7 +-
 .../apache/cassandra/io/sstable/IndexHelper.java   |    5 -
 .../apache/cassandra/io/sstable/SSTableReader.java |    2 +-
 .../apache/cassandra/io/sstable/SSTableWriter.java |   16 +-
 .../cassandra/utils/AlwaysPresentFilter.java       |    4 +
 .../org/apache/cassandra/utils/FilterFactory.java  |    2 +-
 .../apache/cassandra/db/RangeTombstoneTest.java    |    4 -
 14 files changed, 170 insertions(+), 212 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 3491431..2ddd43d 100644
--- a/build.xml
+++ b/build.xml
@@ -519,7 +519,7 @@
       </artifact:pom>
     </target>
 
-    <target name="maven-ant-tasks-retrieve-build" depends="maven-declare-dependencies">
+    <target name="maven-ant-tasks-retrieve-build" depends="maven-declare-dependencies" unless="without.maven">
       <artifact:dependencies pomRefId="build-deps-pom"
                              filesetId="build-dependency-jars" 
                              sourcesFilesetId="build-dependency-sources" 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/ColumnIndex.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/db/ColumnIndex.java
index bcd0eef..e2ac3e4 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/db/ColumnIndex.java
@@ -33,7 +33,7 @@ public class ColumnIndex
     public final List<IndexHelper.IndexInfo> columnsIndex;
     public final IFilter bloomFilter;
 
-    private static final ColumnIndex EMPTY = new ColumnIndex(Collections.<IndexHelper.IndexInfo>emptyList(), new AlwaysPresentFilter());
+    private static final ColumnIndex EMPTY = new ColumnIndex(Collections.<IndexHelper.IndexInfo>emptyList(), AlwaysPresentFilter.instance);
 
     private ColumnIndex(int estimatedColumnCount)
     {
@@ -42,6 +42,9 @@ public class ColumnIndex
 
     private ColumnIndex(List<IndexHelper.IndexInfo> columnsIndex, IFilter bloomFilter)
     {
+        assert columnsIndex != null;
+        assert bloomFilter != null;
+
         this.columnsIndex = columnsIndex;
         this.bloomFilter = bloomFilter;
     }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/DeletionInfo.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/DeletionInfo.java b/src/java/org/apache/cassandra/db/DeletionInfo.java
index 405645f..095a91d 100644
--- a/src/java/org/apache/cassandra/db/DeletionInfo.java
+++ b/src/java/org/apache/cassandra/db/DeletionInfo.java
@@ -73,6 +73,11 @@ public class DeletionInfo
         this.ranges = ranges;
     }
 
+    public DeletionInfo(DeletionTime deletion)
+    {
+        this(deletion, IntervalTree.<ByteBuffer, DeletionTime, RangeTombstone>emptyTree());
+    }
+
     public static Serializer serializer()
     {
         return serializer;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/RowIndexEntry.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/db/RowIndexEntry.java
index a831498..c6caa3a 100644
--- a/src/java/org/apache/cassandra/db/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/db/RowIndexEntry.java
@@ -28,6 +28,7 @@ import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.AlwaysPresentFilter;
 import org.apache.cassandra.utils.IFilter;
 import org.apache.cassandra.utils.FilterFactory;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -45,35 +46,41 @@ public class RowIndexEntry implements IMeasurableMemory
 
     public int serializedSize()
     {
-        return TypeSizes.NATIVE.sizeof(position);
+        return TypeSizes.NATIVE.sizeof(position) + promotedSize();
     }
 
-    public static RowIndexEntry create(long position, DeletionInfo deletionInfo, ColumnIndex index)
+    public int promotedSize()
     {
-        if (index != null && index.columnsIndex != null && index.columnsIndex.size() > 1)
-            return new IndexedEntry(position, deletionInfo, index.columnsIndex, index.bloomFilter);
-        else
-            return new RowIndexEntry(position);
+        return 0;
     }
 
-    public boolean isIndexed()
+    public static RowIndexEntry create(long position, DeletionTime deletion, ColumnIndex index)
     {
-        return !columnsIndex().isEmpty();
+        assert deletion != null;
+        assert index != null;
+
+        if (index.columnsIndex.size() > 1 || deletion != DeletionTime.LIVE)
+            return new IndexedEntry(position,
+                                    deletion,
+                                    index.columnsIndex.isEmpty() ? Collections.<IndexHelper.IndexInfo>emptyList() : index.columnsIndex,
+                                    AlwaysPresentFilter.instance);
+        else
+            return new RowIndexEntry(position);
     }
 
-    public DeletionInfo deletionInfo()
+    public DeletionTime deletionTime()
     {
-        throw new UnsupportedOperationException();
+        return DeletionTime.LIVE;
     }
 
     public List<IndexHelper.IndexInfo> columnsIndex()
     {
-        return Collections.<IndexHelper.IndexInfo>emptyList();
+        return Collections.emptyList();
     }
 
     public IFilter bloomFilter()
     {
-        throw new UnsupportedOperationException();
+        return AlwaysPresentFilter.instance;
     }
 
     public long memorySize()
@@ -87,14 +94,13 @@ public class RowIndexEntry implements IMeasurableMemory
         public void serialize(RowIndexEntry rie, DataOutput dos) throws IOException
         {
             dos.writeLong(rie.position);
-            if (rie.isIndexed())
+            if (!rie.columnsIndex().isEmpty() || rie.deletionTime() != DeletionTime.LIVE)
             {
-                dos.writeInt(((IndexedEntry)rie).serializedSize());
-                DeletionInfo.serializer().serializeForSSTable(rie.deletionInfo(), dos);
+                dos.writeInt(rie.promotedSize());
+                DeletionTime.serializer.serialize(rie.deletionTime(), dos);
                 dos.writeInt(rie.columnsIndex().size());
                 for (IndexHelper.IndexInfo info : rie.columnsIndex())
                     info.serialize(dos);
-                FilterFactory.serialize(rie.bloomFilter(), dos);
             }
             else
             {
@@ -102,38 +108,23 @@ public class RowIndexEntry implements IMeasurableMemory
             }
         }
 
-        public RowIndexEntry deserializePositionOnly(DataInput dis, Descriptor.Version version) throws IOException
-        {
-            long position = dis.readLong();
-            if (version.hasPromotedIndexes)
-            {
-                int size = dis.readInt();
-                if (size > 0)
-                    FileUtils.skipBytesFully(dis, size);
-            }
-            return new RowIndexEntry(position);
-        }
-
         public RowIndexEntry deserialize(DataInput dis, Descriptor.Version version) throws IOException
         {
             long position = dis.readLong();
-            if (version.hasPromotedIndexes)
+            if (!version.hasPromotedIndexes)
+                return new RowIndexEntry(position);
+
+            int size = dis.readInt();
+            if (size > 0)
             {
-                int size = dis.readInt();
-                if (size > 0)
-                {
-                    DeletionInfo delInfo = DeletionInfo.serializer().deserializeFromSSTable(dis, version);
-                    int entries = dis.readInt();
-                    List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<IndexHelper.IndexInfo>(entries);
-                    for (int i = 0; i < entries; i++)
-                        columnsIndex.add(IndexHelper.IndexInfo.deserialize(dis));
-                    IFilter bf = FilterFactory.deserialize(dis, version.filterType, false);
-                    return new IndexedEntry(position, delInfo, columnsIndex, bf);
-                }
-                else
-                {
-                    return new RowIndexEntry(position);
-                }
+                DeletionTime deletion = DeletionTime.serializer.deserialize(dis);
+                int entries = dis.readInt();
+                List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<IndexHelper.IndexInfo>(entries);
+                for (int i = 0; i < entries; i++)
+                    columnsIndex.add(IndexHelper.IndexInfo.deserialize(dis));
+                if (!version.hasPromotedRowTombstones && entries > 0)
+                    FilterFactory.deserialize(dis, version.filterType, false);
+                return new IndexedEntry(position, deletion, columnsIndex, AlwaysPresentFilter.instance);
             }
             else
             {
@@ -163,24 +154,24 @@ public class RowIndexEntry implements IMeasurableMemory
      */
     private static class IndexedEntry extends RowIndexEntry
     {
-        private final DeletionInfo deletionInfo;
+        private final DeletionTime deletion;
         private final List<IndexHelper.IndexInfo> columnsIndex;
         private final IFilter bloomFilter;
 
-        private IndexedEntry(long position, DeletionInfo deletionInfo, List<IndexHelper.IndexInfo> columnsIndex, IFilter bloomFilter)
+        private IndexedEntry(long position, DeletionTime deletion, List<IndexHelper.IndexInfo> columnsIndex, IFilter bloomFilter)
         {
             super(position);
-            assert deletionInfo != null;
-            assert columnsIndex != null && columnsIndex.size() > 1;
-            this.deletionInfo = deletionInfo;
+            assert deletion != null;
+            assert columnsIndex != null;
+            this.deletion = deletion;
             this.columnsIndex = columnsIndex;
             this.bloomFilter = bloomFilter;
         }
 
         @Override
-        public DeletionInfo deletionInfo()
+        public DeletionTime deletionTime()
         {
-            return deletionInfo;
+            return deletion;
         }
 
         @Override
@@ -196,15 +187,15 @@ public class RowIndexEntry implements IMeasurableMemory
         }
 
         @Override
-        public int serializedSize()
+        public int promotedSize()
         {
             TypeSizes typeSizes = TypeSizes.NATIVE;
-            long size = DeletionTime.serializer.serializedSize(deletionInfo.getTopLevelDeletion(), typeSizes);
+            long size = DeletionTime.serializer.serializedSize(deletion, typeSizes);
             size += typeSizes.sizeof(columnsIndex.size()); // number of entries
             for (IndexHelper.IndexInfo info : columnsIndex)
                 size += info.serializedSize(typeSizes);
 
-            size += FilterFactory.serializedSize(bloomFilter);
+            size += bloomFilter instanceof AlwaysPresentFilter ? 0 : FilterFactory.serializedSize(bloomFilter);
             assert size <= Integer.MAX_VALUE;
             return (int)size;
         }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java b/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
index 7289ab0..01740ca 100644
--- a/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
+++ b/src/java/org/apache/cassandra/db/columniterator/IndexedSliceReader.java
@@ -65,7 +65,7 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
      * finish (reverse start) elements. i.e. forward: [a,b],[d,e],[g,h] reverse: [h,g],[e,d],[b,a]. This reader also
      * assumes that validation has been performed in terms of intervals (no overlapping intervals).
      */
-    public IndexedSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, ColumnSlice[] slices, boolean reversed)
+    public IndexedSliceReader(SSTableReader sstable, RowIndexEntry rowEntry, FileDataInput input, ColumnSlice[] slices, boolean reversed)
     {
         this.sstable = sstable;
         this.originalInput = input;
@@ -76,34 +76,53 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
         try
         {
             Descriptor.Version version = sstable.descriptor.version;
+            emptyColumnFamily = ColumnFamily.create(sstable.metadata);
+
+            if (version.hasPromotedRowTombstones && !rowEntry.columnsIndex().isEmpty())
+            {
+                // skip the row header entirely
+                indexes = rowEntry.columnsIndex();
+                emptyColumnFamily.delete(new DeletionInfo(rowEntry.deletionTime()));
+                fetcher = new IndexedBlockFetcher(rowEntry.position);
+                return;
+            }
+
+            // skip up to bloom filter where things get a bit more interesting
+            if (input == null)
+            {
+                file = sstable.getFileDataInput(rowEntry.position);
+            }
+            else
+            {
+                file = input;
+                file.seek(rowEntry.position);
+            }
+            this.sstable.decodeKey(ByteBufferUtil.readWithShortLength(file));
+            SSTableReader.readRowSize(file, this.sstable.descriptor);
+
+            // read the row header up to and including the row-level tombstones
             if (version.hasPromotedIndexes)
             {
-                this.indexes = indexEntry.columnsIndex();
-                if (indexes.isEmpty())
-                {
-                    setToRowStart(sstable, indexEntry, input);
-                    this.emptyColumnFamily = ColumnFamily.create(sstable.metadata);
-                    emptyColumnFamily.delete(DeletionInfo.serializer().deserializeFromSSTable(file, version));
-                    fetcher = new SimpleBlockFetcher();
-                }
-                else
-                {
-                    this.emptyColumnFamily = ColumnFamily.create(sstable.metadata);
-                    emptyColumnFamily.delete(indexEntry.deletionInfo());
-                    fetcher = new IndexedBlockFetcher(indexEntry.position);
-                }
+                indexes = rowEntry.columnsIndex();
+                // we'll get row deletion time from the row header below
             }
             else
             {
-                setToRowStart(sstable, indexEntry, input);
                 IndexHelper.skipSSTableBloomFilter(file, version);
-                this.indexes = IndexHelper.deserializeIndex(file);
-                this.emptyColumnFamily = ColumnFamily.create(sstable.metadata);
-                emptyColumnFamily.delete(DeletionInfo.serializer().deserializeFromSSTable(file, version));
-                fetcher = indexes.isEmpty()
-                        ? new SimpleBlockFetcher()
-                        : new IndexedBlockFetcher(file.getFilePointer() + 4); // We still have the column count to
-                                                                              // skip to get the basePosition
+                indexes = IndexHelper.deserializeIndex(file);
+            }
+            emptyColumnFamily.delete(DeletionInfo.serializer().deserializeFromSSTable(file, version));
+
+            if (indexes.isEmpty())
+            {
+                fetcher = new SimpleBlockFetcher();
+            }
+            else
+            {
+                // index offsets changed to be based against the row key start in 1.2
+                fetcher = version.hasPromotedIndexes
+                        ? new IndexedBlockFetcher(rowEntry.position)
+                        : new IndexedBlockFetcher(file.getFilePointer() + 4); // +4 to skip the int column count
             }
         }
         catch (IOException e)
@@ -113,24 +132,6 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
         }
     }
 
-    /**
-     * Sets the seek position to the start of the row for column scanning.
-     */
-    private void setToRowStart(SSTableReader reader, RowIndexEntry indexEntry, FileDataInput input) throws IOException
-    {
-        if (input == null)
-        {
-            this.file = sstable.getFileDataInput(indexEntry.position);
-        }
-        else
-        {
-            this.file = input;
-            input.seek(indexEntry.position);
-        }
-        sstable.decodeKey(ByteBufferUtil.readWithShortLength(file));
-        SSTableReader.readRowSize(file, sstable.descriptor);
-    }
-
     public ColumnFamily getColumnFamily()
     {
         return emptyColumnFamily;
@@ -197,8 +198,6 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
             return reversed ? slices[currentSliceIdx].start : slices[currentSliceIdx].finish;
         }
 
-        protected abstract boolean setNextSlice();
-
         protected abstract boolean fetchMoreData();
 
         protected boolean isColumnBeforeSliceStart(OnDiskAtom column)
@@ -248,7 +247,7 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
             setNextSlice();
         }
 
-        protected boolean setNextSlice()
+        private boolean setNextSlice()
         {
             while (++currentSliceIdx < slices.length)
             {
@@ -350,7 +349,7 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
             /* seek to the correct offset to the data, and calculate the data size */
             long positionToSeek = basePosition + currentIndex.offset;
 
-            // With new promoted indexes, our first seek in the data file will happen at that point.
+            // With 1.2 promoted indexes, our first seek in the data file will happen at this point
             if (file == null)
                 file = originalInput == null ? sstable.getFileDataInput(positionToSeek) : originalInput;
 
@@ -464,7 +463,7 @@ class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskA
             }
         }
 
-        protected boolean setNextSlice()
+        private boolean setNextSlice()
         {
             if (reversed)
             {

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
index da4631d..2561ad6 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableNamesIterator.java
@@ -19,25 +19,22 @@ package org.apache.cassandra.db.columniterator;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.SortedSet;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilySerializer;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.IColumn;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileMark;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.IFilter;
 
 public class SSTableNamesIterator extends SimpleAbstractColumnIterator implements ISSTableColumnIterator
 {
@@ -55,13 +52,13 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
         this.columns = columns;
         this.key = key;
 
-        RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
-        if (indexEntry == null)
+        RowIndexEntry rowEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
+        if (rowEntry == null)
             return;
 
         try
         {
-            read(sstable, null, indexEntry);
+            read(sstable, null, rowEntry);
         }
         catch (IOException e)
         {
@@ -75,7 +72,7 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
         }
     }
 
-    public SSTableNamesIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, SortedSet<ByteBuffer> columns, RowIndexEntry indexEntry)
+    public SSTableNamesIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, SortedSet<ByteBuffer> columns, RowIndexEntry rowEntry)
     {
         assert columns != null;
         this.sstable = sstable;
@@ -84,7 +81,7 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
 
         try
         {
-            read(sstable, file, indexEntry);
+            read(sstable, file, rowEntry);
         }
         catch (IOException e)
         {
@@ -104,101 +101,67 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
         return sstable;
     }
 
-    private void read(SSTableReader sstable, FileDataInput file, RowIndexEntry indexEntry)
-    throws IOException
+    private void read(SSTableReader sstable, FileDataInput file, RowIndexEntry rowEntry)
+            throws IOException
     {
-        IFilter bf;
         List<IndexHelper.IndexInfo> indexList;
 
-        // If the entry is not indexed or the index is not promoted, read from the row start
-        if (!indexEntry.isIndexed())
+        Descriptor.Version version = sstable.descriptor.version;
+        cf = ColumnFamily.create(sstable.metadata);
+        List<OnDiskAtom> result = new ArrayList<OnDiskAtom>(columns.size());
+
+        if (version.hasPromotedRowTombstones && !rowEntry.columnsIndex().isEmpty())
         {
-            if (file == null)
-                file = createFileDataInput(indexEntry.position);
-            else
-                file.seek(indexEntry.position);
+            // skip the row header entirely
+            cf.delete(new DeletionInfo(rowEntry.deletionTime()));
 
-            DecoratedKey keyInDisk = SSTableReader.decodeKey(sstable.partitioner,
-                                                             sstable.descriptor,
-                                                             ByteBufferUtil.readWithShortLength(file));
-            assert keyInDisk.equals(key) : String.format("%s != %s in %s", keyInDisk, key, file.getPath());
-            SSTableReader.readRowSize(file, sstable.descriptor);
+            readIndexedColumns(sstable.metadata, file, columns, rowEntry.columnsIndex(), rowEntry.position, result);
+            iter = result.iterator();
+            return;
         }
 
-        if (sstable.descriptor.version.hasPromotedIndexes)
-        {
-            bf = indexEntry.isIndexed() ? indexEntry.bloomFilter() : null;
-            indexList = indexEntry.columnsIndex();
-        }
+        if (file == null)
+            file = createFileDataInput(rowEntry.position);
         else
-        {
-            assert file != null;
-            bf = IndexHelper.defreezeBloomFilter(file, sstable.descriptor.version.filterType);
-            indexList = IndexHelper.deserializeIndex(file);
-        }
+            file.seek(rowEntry.position);
+
+        DecoratedKey keyInDisk = SSTableReader.decodeKey(sstable.partitioner,
+                                                         sstable.descriptor,
+                                                         ByteBufferUtil.readWithShortLength(file));
+        assert keyInDisk.equals(key) : String.format("%s != %s in %s", keyInDisk, key, file.getPath());
+        SSTableReader.readRowSize(file, sstable.descriptor);
 
-        if (!indexEntry.isIndexed())
+        if (sstable.descriptor.version.hasPromotedIndexes)
         {
-            // we can stop early if bloom filter says none of the columns actually exist -- but,
-            // we can't stop before initializing the cf above, in case there's a relevant tombstone
-            ColumnFamilySerializer serializer = ColumnFamily.serializer;
-            try
-            {
-                cf = ColumnFamily.create(sstable.metadata);
-                cf.delete(DeletionInfo.serializer().deserializeFromSSTable(file, sstable.descriptor.version));
-            }
-            catch (Exception e)
-            {
-                throw new IOException(serializer + " failed to deserialize " + sstable.getColumnFamilyName() + " with " + sstable.metadata + " from " + file, e);
-            }
+            indexList = rowEntry.columnsIndex();
+            // we'll get row deletion time from the row header below
         }
         else
         {
-            cf = ColumnFamily.create(sstable.metadata);
-            cf.delete(indexEntry.deletionInfo());
+            IndexHelper.skipSSTableBloomFilter(file, version);
+            indexList = IndexHelper.deserializeIndex(file);
         }
 
-        List<OnDiskAtom> result = new ArrayList<OnDiskAtom>();
-        List<ByteBuffer> filteredColumnNames = new ArrayList<ByteBuffer>(columns.size());
-        for (ByteBuffer name : columns)
-        {
-            if (bf == null || bf.isPresent(name))
-            {
-                filteredColumnNames.add(name);
-            }
-        }
-        if (filteredColumnNames.isEmpty())
-            return;
+        cf.delete(DeletionInfo.serializer().deserializeFromSSTable(file, sstable.descriptor.version));
 
         if (indexList.isEmpty())
         {
-            readSimpleColumns(file, columns, filteredColumnNames, result);
+            readSimpleColumns(file, columns, result);
         }
         else
         {
-            long basePosition;
-            if (sstable.descriptor.version.hasPromotedIndexes)
-            {
-                basePosition = indexEntry.position;
-            }
-            else
-            {
-                assert file != null;
-                file.readInt(); // column count
-                basePosition = file.getFilePointer();
-            }
-            readIndexedColumns(sstable.metadata, file, columns, filteredColumnNames, indexList, basePosition, result);
+            long basePosition = version.hasPromotedIndexes ? rowEntry.position : file.getFilePointer() + 4;
+            readIndexedColumns(sstable.metadata, file, columns, indexList, basePosition, result);
         }
 
         // create an iterator view of the columns we read
         iter = result.iterator();
     }
 
-    private void readSimpleColumns(FileDataInput file, SortedSet<ByteBuffer> columnNames, List<ByteBuffer> filteredColumnNames, List<OnDiskAtom> result) throws IOException
+    private void readSimpleColumns(FileDataInput file, SortedSet<ByteBuffer> columnNames, List<OnDiskAtom> result) throws IOException
     {
         OnDiskAtom.Serializer atomSerializer = cf.getOnDiskSerializer();
         int columns = file.readInt();
-        int n = 0;
         for (int i = 0; i < columns; i++)
         {
             OnDiskAtom column = atomSerializer.deserializeFromSSTable(file, sstable.descriptor.version);
@@ -207,7 +170,7 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
                 if (columnNames.contains(column.name()))
                 {
                     result.add(column);
-                    if (n++ > filteredColumnNames.size())
+                    if (result.size() >= columnNames.size())
                         break;
                 }
             }
@@ -221,17 +184,16 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
     private void readIndexedColumns(CFMetaData metadata,
                                     FileDataInput file,
                                     SortedSet<ByteBuffer> columnNames,
-                                    List<ByteBuffer> filteredColumnNames,
                                     List<IndexHelper.IndexInfo> indexList,
                                     long basePosition,
                                     List<OnDiskAtom> result)
-    throws IOException
+            throws IOException
     {
         /* get the various column ranges we have to read */
         AbstractType<?> comparator = metadata.comparator;
         List<IndexHelper.IndexInfo> ranges = new ArrayList<IndexHelper.IndexInfo>();
         int lastIndexIdx = -1;
-        for (ByteBuffer name : filteredColumnNames)
+        for (ByteBuffer name : columnNames)
         {
             int index = IndexHelper.indexFor(name, indexList, comparator, false, lastIndexIdx);
             if (index < 0 || index == indexList.size())
@@ -251,7 +213,7 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
         {
             long positionToSeek = basePosition + indexInfo.offset;
 
-            // With new promoted indexes, our first seek in the data file will happen at that point.
+            // With 1.2 promoted indexes, our first seek in the data file will happen at this point
             if (file == null)
                 file = createFileDataInput(positionToSeek);
 
@@ -262,7 +224,6 @@ public class SSTableNamesIterator extends SimpleAbstractColumnIterator implement
             while (file.bytesPastMark(mark) < indexInfo.width)
             {
                 OnDiskAtom column = atomSerializer.deserializeFromSSTable(file, sstable.descriptor.version);
-                // we check vs the original Set, not the filtered List, for efficiency
                 if (!(column instanceof IColumn) || columnNames.contains(column.name()))
                     result.add(column);
             }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java b/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
index b30d360..deda040 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
+++ b/src/java/org/apache/cassandra/db/columniterator/SimpleSliceReader.java
@@ -49,7 +49,7 @@ class SimpleSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAt
     private FileMark mark;
     private final OnDiskAtom.Serializer atomSerializer;
 
-    public SimpleSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, ByteBuffer finishColumn)
+    public SimpleSliceReader(SSTableReader sstable, RowIndexEntry rowEntry, FileDataInput input, ByteBuffer finishColumn)
     {
         this.sstable = sstable;
         this.finishColumn = finishColumn;
@@ -58,13 +58,13 @@ class SimpleSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAt
         {
             if (input == null)
             {
-                this.file = sstable.getFileDataInput(indexEntry.position);
+                this.file = sstable.getFileDataInput(rowEntry.position);
                 this.needsClosing = true;
             }
             else
             {
                 this.file = input;
-                input.seek(indexEntry.position);
+                input.seek(rowEntry.position);
                 this.needsClosing = false;
             }
 
@@ -72,6 +72,8 @@ class SimpleSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAt
             ByteBufferUtil.skipShortLength(file);
             SSTableReader.readRowSize(file, sstable.descriptor);
 
+            emptyColumnFamily = ColumnFamily.create(sstable.metadata);
+
             Descriptor.Version version = sstable.descriptor.version;
             if (!version.hasPromotedIndexes)
             {
@@ -79,7 +81,6 @@ class SimpleSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAt
                 IndexHelper.skipIndex(file);
             }
 
-            emptyColumnFamily = ColumnFamily.create(sstable.metadata);
             emptyColumnFamily.delete(DeletionInfo.serializer().deserializeFromSSTable(file, version));
             atomSerializer = emptyColumnFamily.getOnDiskSerializer();
             columns = file.readInt();

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/io/sstable/Descriptor.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
index f21a0d5..7b916cb 100644
--- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java
+++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
@@ -47,7 +47,7 @@ public class Descriptor
     public static class Version
     {
         // This needs to be at the begining for initialization sake
-        public static final String current_version = "ib";
+        public static final String current_version = "ic";
 
         public static final Version LEGACY = new Version("a"); // "pre-history"
         // b (0.7.0): added version to sstable filenames
@@ -62,10 +62,11 @@ public class Descriptor
         // hd (1.0.10): includes row tombstones in maxtimestamp
         // he (1.1.3): includes ancestors generation in metadata component
         // hf (1.1.6): marker that replay position corresponds to 1.1.5+ millis-based id (see CASSANDRA-4782)
-        // ia (1.2.0): column indexes are promoted to the index file
+        // ia (1.2.0): column indexes are promoted to the index file.  (this means index offsets are now against the start of the row key, rather than the start of columns data, since the former allows us to skip the row header)
         //             records estimated histogram of deletion times in tombstones
         //             bloom filter (keys and columns) upgraded to Murmur3
         // ib (1.2.1): tracks min client timestamp in metadata component
+        // ic (1.2.6): always promotes row-level tombstones into index file; previously this was unreliable
 
         public static final Version CURRENT = new Version(current_version);
 
@@ -83,6 +84,7 @@ public class Descriptor
         public final boolean hasPartitioner;
         public final boolean tracksTombstones;
         public final boolean hasPromotedIndexes;
+        public final boolean hasPromotedRowTombstones;
         public final FilterFactory.Type filterType;
         public final boolean hasAncestors;
         public final boolean hasBloomFilterSizeInHeader;
@@ -102,6 +104,7 @@ public class Descriptor
             metadataIncludesModernReplayPosition = version.compareTo("hf") >= 0;
             tracksTombstones = version.compareTo("ia") >= 0;
             hasPromotedIndexes = version.compareTo("ia") >= 0;
+            hasPromotedRowTombstones = version.compareTo("ic") >= 0;
             isLatestVersion = version.compareTo(current_version) == 0;
             if (version.compareTo("f") < 0)
                 filterType = FilterFactory.Type.SHA;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
index 444ec0b..8be2457 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
@@ -120,11 +120,6 @@ public class IndexHelper
         return indexList;
     }
 
-    public static IFilter defreezeBloomFilter(FileDataInput file, FilterFactory.Type type) throws IOException
-    {
-        return defreezeBloomFilter(file, Integer.MAX_VALUE, type);
-    }
-
     /**
      * De-freeze the bloom filter.
      *

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
index 21a8673..61f505d 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableReader.java
@@ -323,7 +323,7 @@ public class SSTableReader extends SSTable
     {
         if (!components.contains(Component.FILTER))
         {
-            bf = new AlwaysPresentFilter();
+            bf = AlwaysPresentFilter.instance;
             return;
         }
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
index c64fd27..c5f14ea 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableWriter.java
@@ -134,7 +134,7 @@ public class SSTableWriter extends SSTable
         return (lastWrittenKey == null) ? 0 : dataFile.getFilePointer();
     }
 
-    private RowIndexEntry afterAppend(DecoratedKey decoratedKey, long dataPosition, DeletionInfo delInfo, ColumnIndex index)
+    private RowIndexEntry afterAppend(DecoratedKey decoratedKey, long dataPosition, DeletionTime deletion, ColumnIndex index)
     {
         lastWrittenKey = decoratedKey;
         last = lastWrittenKey;
@@ -143,7 +143,7 @@ public class SSTableWriter extends SSTable
 
         if (logger.isTraceEnabled())
             logger.trace("wrote " + decoratedKey + " at " + dataPosition);
-        RowIndexEntry entry = RowIndexEntry.create(dataPosition, delInfo, index);
+        RowIndexEntry entry = RowIndexEntry.create(dataPosition, deletion, index);
         iwriter.append(decoratedKey, entry);
         dbuilder.addPotentialBoundary(dataPosition);
         return entry;
@@ -165,7 +165,7 @@ public class SSTableWriter extends SSTable
             throw new FSWriteError(e, dataFile.getPath());
         }
         sstableMetadataCollector.update(dataFile.getFilePointer() - currentPosition, row.columnStats());
-        return afterAppend(row.key, currentPosition, row.deletionInfo(), row.index());
+        return afterAppend(row.key, currentPosition, row.deletionInfo().getTopLevelDeletion(), row.index());
     }
 
     public void append(DecoratedKey decoratedKey, ColumnFamily cf)
@@ -191,7 +191,7 @@ public class SSTableWriter extends SSTable
             DeletionInfo.serializer().serializeForSSTable(cf.deletionInfo(), dataFile.stream);
             dataFile.stream.writeInt(builder.writtenAtomCount());
             dataFile.stream.write(buffer.getData(), 0, buffer.getLength());
-            afterAppend(decoratedKey, startPosition, cf.deletionInfo(), index);
+            afterAppend(decoratedKey, startPosition, cf.deletionInfo().getTopLevelDeletion(), index);
         }
         catch (IOException e)
         {
@@ -220,12 +220,12 @@ public class SSTableWriter extends SSTable
             throw new FSWriteError(e, dataFile.getPath());
         }
 
-        DeletionInfo deletionInfo = DeletionInfo.serializer().deserializeFromSSTable(in, descriptor.version);
+        DeletionTime deletion = DeletionTime.serializer.deserialize(in);
         int columnCount = in.readInt();
 
         try
         {
-            DeletionInfo.serializer().serializeForSSTable(deletionInfo, dataFile.stream);
+            DeletionTime.serializer.serialize(deletion, dataFile.stream);
             dataFile.stream.writeInt(columnCount);
         }
         catch (IOException e)
@@ -238,7 +238,7 @@ public class SSTableWriter extends SSTable
         long maxTimestamp = Long.MIN_VALUE;
         StreamingHistogram tombstones = new StreamingHistogram(TOMBSTONE_HISTOGRAM_BIN_SIZE);
         ColumnFamily cf = ColumnFamily.create(metadata, ArrayBackedSortedColumns.factory());
-        cf.delete(deletionInfo);
+        cf.delete(new DeletionInfo(deletion));
 
         ColumnIndex.Builder columnIndexer = new ColumnIndex.Builder(cf, key.key, columnCount, dataFile.stream, true);
         OnDiskAtom.Serializer atomSerializer = cf.getOnDiskSerializer();
@@ -288,7 +288,7 @@ public class SSTableWriter extends SSTable
         sstableMetadataCollector.addRowSize(dataFile.getFilePointer() - currentPosition);
         sstableMetadataCollector.addColumnCount(columnCount);
         sstableMetadataCollector.mergeTombstoneHistogram(tombstones);
-        afterAppend(key, currentPosition, deletionInfo, columnIndexer.build());
+        afterAppend(key, currentPosition, deletion, columnIndexer.build());
         return currentPosition;
     }
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
index 67ac111..39b3d5d 100644
--- a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
+++ b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
@@ -26,6 +26,10 @@ import java.nio.ByteBuffer;
 
 public class AlwaysPresentFilter implements IFilter
 {
+    public static final AlwaysPresentFilter instance = new AlwaysPresentFilter();
+
+    private AlwaysPresentFilter() { }
+
     public boolean isPresent(ByteBuffer key)
     {
         return true;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/src/java/org/apache/cassandra/utils/FilterFactory.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/FilterFactory.java b/src/java/org/apache/cassandra/utils/FilterFactory.java
index 88c8973..1b9027d 100644
--- a/src/java/org/apache/cassandra/utils/FilterFactory.java
+++ b/src/java/org/apache/cassandra/utils/FilterFactory.java
@@ -131,7 +131,7 @@ public class FilterFactory
     {
         assert maxFalsePosProbability <= 1.0 : "Invalid probability";
         if (maxFalsePosProbability == 1.0)
-            return new AlwaysPresentFilter();
+            return AlwaysPresentFilter.instance;
         int bucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements);
         BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
         return createFilter(spec.K, numElements, spec.bucketsPerElement, type, offheap);

http://git-wip-us.apache.org/repos/asf/cassandra/blob/d7a09825/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
index 1bc846b..c531461 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
@@ -164,10 +164,6 @@ public class RangeTombstoneTest extends SchemaLoader
         return ByteBufferUtil.bytes(i);
     }
 
-    private static void insertData(ColumnFamilyStore cfs, String key) throws Exception
-    {
-    }
-
     private static void add(RowMutation rm, int value, long timestamp)
     {
         rm.add(new QueryPath(CFNAME, null, b(value)), b(value), timestamp);