You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by sn...@apache.org on 2015/09/14 22:08:49 UTC

cassandra git commit: Update index file format

Repository: cassandra
Updated Branches:
  refs/heads/cassandra-3.0 16497fd93 -> 51b1a1c6d


Update index file format

patch by Robert Stupp; reviewed by Ariel Weisberg for CASSANDRA-10314


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/51b1a1c6
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/51b1a1c6
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/51b1a1c6

Branch: refs/heads/cassandra-3.0
Commit: 51b1a1c6d3faf2a2bee97fe10c9399119784675d
Parents: 16497fd
Author: Robert Stupp <sn...@snazy.de>
Authored: Mon Sep 14 22:06:37 2015 +0200
Committer: Robert Stupp <sn...@snazy.de>
Committed: Mon Sep 14 22:06:37 2015 +0200

----------------------------------------------------------------------
 CHANGES.txt                                     |   1 +
 .../org/apache/cassandra/db/ColumnIndex.java    |   1 +
 .../org/apache/cassandra/db/RowIndexEntry.java  |  82 +++--
 .../columniterator/AbstractSSTableIterator.java |  18 +-
 .../cassandra/io/sstable/IndexHelper.java       |  29 +-
 .../io/sstable/format/big/BigTableScanner.java  |   2 +-
 .../cassandra/io/util/DataOutputBuffer.java     |  10 +
 .../cassandra/io/util/DataOutputPlus.java       |   9 +
 .../cassandra/io/util/SequentialWriter.java     |   5 +
 .../legacy_ma_clust/ma-1-big-CompressionInfo.db | Bin 83 -> 83 bytes
 .../legacy_ma_clust/ma-1-big-Data.db            | Bin 5045 -> 5049 bytes
 .../legacy_ma_clust/ma-1-big-Digest.crc32       |   2 +-
 .../legacy_ma_clust/ma-1-big-Index.db           | Bin 157123 -> 157553 bytes
 .../legacy_ma_clust/ma-1-big-Statistics.db      | Bin 7045 -> 7045 bytes
 .../ma-1-big-CompressionInfo.db                 | Bin 75 -> 75 bytes
 .../legacy_ma_clust_counter/ma-1-big-Data.db    | Bin 4428 -> 4393 bytes
 .../ma-1-big-Digest.crc32                       |   2 +-
 .../legacy_ma_clust_counter/ma-1-big-Index.db   | Bin 157123 -> 157553 bytes
 .../ma-1-big-Statistics.db                      | Bin 7054 -> 7054 bytes
 .../legacy_ma_simple/ma-1-big-Data.db           | Bin 85 -> 85 bytes
 .../legacy_ma_simple/ma-1-big-Digest.crc32      |   2 +-
 .../legacy_ma_simple/ma-1-big-Statistics.db     | Bin 4598 -> 4598 bytes
 .../legacy_ma_simple_counter/ma-1-big-Data.db   | Bin 106 -> 106 bytes
 .../ma-1-big-Digest.crc32                       |   2 +-
 .../ma-1-big-Statistics.db                      | Bin 4607 -> 4607 bytes
 .../apache/cassandra/cql3/KeyCacheCqlTest.java  | 365 +++++++++++++++++++
 .../apache/cassandra/db/RowIndexEntryTest.java  | 142 +++++++-
 .../cassandra/io/sstable/IndexHelperTest.java   |   8 +-
 28 files changed, 621 insertions(+), 59 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index bacedaf..1a1ddeb 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 3.0.0-rc1
+ * Update index file format (CASSANDRA-10314)
  * Add "shadowable" row tombstones to deal with mv timestamp issues (CASSANDRA-10261)
  * CFS.loadNewSSTables() broken for pre-3.0 sstables
  * Cache selected index in read command to reduce lookups (CASSANDRA-10215)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/db/ColumnIndex.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/db/ColumnIndex.java
index b350f90..6b2ef59 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/db/ColumnIndex.java
@@ -122,6 +122,7 @@ public class ColumnIndex
         {
             IndexHelper.IndexInfo cIndexInfo = new IndexHelper.IndexInfo(firstClustering,
                                                                          lastClustering,
+                                                                         startPosition,
                                                                          currentPosition() - startPosition,
                                                                          openMarker);
             columnsIndex.add(cIndexInfo);

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/db/RowIndexEntry.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/db/RowIndexEntry.java
index f63e893..7f361d9 100644
--- a/src/java/org/apache/cassandra/db/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/db/RowIndexEntry.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -28,7 +27,6 @@ import com.google.common.primitives.Ints;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cache.IMeasurableMemory;
-import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -109,11 +107,11 @@ public class RowIndexEntry<T> implements IMeasurableMemory
         return EMPTY_SIZE;
     }
 
-    public static interface IndexSerializer<T>
+    public interface IndexSerializer<T>
     {
         void serialize(RowIndexEntry<T> rie, DataOutputPlus out) throws IOException;
         RowIndexEntry<T> deserialize(DataInputPlus in) throws IOException;
-        public int serializedSize(RowIndexEntry<T> rie);
+        int serializedSize(RowIndexEntry<T> rie);
     }
 
     public static class Serializer implements IndexSerializer<IndexHelper.IndexInfo>
@@ -139,8 +137,39 @@ public class RowIndexEntry<T> implements IMeasurableMemory
                 out.writeUnsignedVInt(rie.headerLength());
                 DeletionTime.serializer.serialize(rie.deletionTime(), out);
                 out.writeUnsignedVInt(rie.columnsIndex().size());
-                for (IndexHelper.IndexInfo info : rie.columnsIndex())
-                    idxSerializer.serialize(info, out);
+
+                // Calculate and write the offsets to the IndexInfo objects.
+
+                int[] offsets = new int[rie.columnsIndex().size()];
+
+                if (out.hasFilePointer())
+                {
+                    // Out is usually a SequentialWriter, so using the file-pointer is fine to generate the offsets.
+                    // A DataOutputBuffer also works.
+                    long start = out.getFilePointer();
+                    int i = 0;
+                    for (IndexHelper.IndexInfo info : rie.columnsIndex())
+                    {
+                        offsets[i++] = i == 0 ? 0 : (int)(out.getFilePointer() - start);
+                        idxSerializer.serialize(info, out);
+                    }
+                }
+                else
+                {
+                    // Not sure this branch will ever be needed, but if it is called, it has to calculate the
+                    // serialized sizes instead of simply using the file-pointer.
+                    int i = 0;
+                    int offset = 0;
+                    for (IndexHelper.IndexInfo info : rie.columnsIndex())
+                    {
+                        offsets[i++] = offset;
+                        idxSerializer.serialize(info, out);
+                        offset += idxSerializer.serializedSize(info);
+                    }
+                }
+
+                for (int off : offsets)
+                    out.writeInt(off);
             }
         }
 
@@ -158,19 +187,14 @@ public class RowIndexEntry<T> implements IMeasurableMemory
                     int entries = in.readInt();
                     List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<>(entries);
 
-                    // The old format didn't saved the partition header length per-se, but rather for each entry it's
-                    // offset from the beginning of the row. We don't use that offset anymore, but we do need the
-                    // header length so we basically need the first entry offset. And so we inline the deserialization
-                    // of the first index entry to get that information. While this is a bit ugly, we'll get rid of that
-                    // code once pre-3.0 backward compatibility is dropped so it feels fine as a temporary hack.
-                    ClusteringPrefix firstName = idxSerializer.clusteringSerializer.deserialize(in);
-                    ClusteringPrefix lastName = idxSerializer.clusteringSerializer.deserialize(in);
-                    long headerLength = in.readLong();
-                    long width = in.readLong();
-
-                    columnsIndex.add(new IndexHelper.IndexInfo(firstName, lastName, width, null));
-                    for (int i = 1; i < entries; i++)
-                        columnsIndex.add(idxSerializer.deserialize(in));
+                    long headerLength = 0L;
+                    for (int i = 0; i < entries; i++)
+                    {
+                        IndexHelper.IndexInfo info = idxSerializer.deserialize(in);
+                        columnsIndex.add(info);
+                        if (i == 0)
+                            headerLength = info.offset;
+                    }
 
                     return new IndexedEntry(position, deletionTime, headerLength, columnsIndex);
                 }
@@ -192,6 +216,8 @@ public class RowIndexEntry<T> implements IMeasurableMemory
                 for (int i = 0; i < entries; i++)
                     columnsIndex.add(idxSerializer.deserialize(in));
 
+                FileUtils.skipBytesFully(in, entries * TypeSizes.sizeof(0));
+
                 return new IndexedEntry(position, deletionTime, headerLength, columnsIndex);
             }
             else
@@ -227,20 +253,22 @@ public class RowIndexEntry<T> implements IMeasurableMemory
         {
             assert version.storeRows() : "We read old index files but we should never write them";
 
-            int size = TypeSizes.sizeofUnsignedVInt(rie.position) + TypeSizes.sizeofUnsignedVInt(rie.promotedSize(idxSerializer));
-
+            int indexedSize = 0;
             if (rie.isIndexed())
             {
                 List<IndexHelper.IndexInfo> index = rie.columnsIndex();
 
-                size += TypeSizes.sizeofUnsignedVInt(rie.headerLength());
-                size += DeletionTime.serializer.serializedSize(rie.deletionTime());
-                size += TypeSizes.sizeofUnsignedVInt(index.size());
+                indexedSize += TypeSizes.sizeofUnsignedVInt(rie.headerLength());
+                indexedSize += DeletionTime.serializer.serializedSize(rie.deletionTime());
+                indexedSize += TypeSizes.sizeofUnsignedVInt(index.size());
 
                 for (IndexHelper.IndexInfo info : index)
-                    size += idxSerializer.serializedSize(info);
+                    indexedSize += idxSerializer.serializedSize(info);
+
+                indexedSize += index.size() * TypeSizes.sizeof(0);
             }
-            return size;
+
+            return TypeSizes.sizeofUnsignedVInt(rie.position) + TypeSizes.sizeofUnsignedVInt(indexedSize) + indexedSize;
         }
     }
 
@@ -295,6 +323,8 @@ public class RowIndexEntry<T> implements IMeasurableMemory
             for (IndexHelper.IndexInfo info : columnsIndex)
                 size += idxSerializer.serializedSize(info);
 
+            size += columnsIndex.size() * TypeSizes.sizeof(0);
+
             return Ints.checkedCast(size);
         }
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java b/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
index c075a2b..837f0a0 100644
--- a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
+++ b/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
@@ -412,7 +412,6 @@ abstract class AbstractSSTableIterator implements SliceableUnfilteredRowIterator
 
         private final RowIndexEntry indexEntry;
         private final List<IndexHelper.IndexInfo> indexes;
-        private final long[] blockOffsets;
         private final boolean reversed;
 
         private int currentIndexIdx;
@@ -428,14 +427,6 @@ abstract class AbstractSSTableIterator implements SliceableUnfilteredRowIterator
             this.indexes = indexEntry.columnsIndex();
             this.reversed = reversed;
             this.currentIndexIdx = reversed ? indexEntry.columnsIndex().size() : -1;
-
-            this.blockOffsets = new long[indexes.size()];
-            long offset = indexEntry.position + indexEntry.headerLength();
-            for (int i = 0; i < blockOffsets.length; i++)
-            {
-                blockOffsets[i] = offset;
-                offset += indexes.get(i).width;
-            }
         }
 
         public boolean isDone()
@@ -447,13 +438,18 @@ abstract class AbstractSSTableIterator implements SliceableUnfilteredRowIterator
         public void setToBlock(int blockIdx) throws IOException
         {
             if (blockIdx >= 0 && blockIdx < indexes.size())
-                reader.seekToPosition(blockOffsets[blockIdx]);
+                reader.seekToPosition(columnOffset(blockIdx));
 
             currentIndexIdx = blockIdx;
             reader.openMarker = blockIdx > 0 ? indexes.get(blockIdx - 1).endOpenMarker : null;
             mark = reader.file.mark();
         }
 
+        private long columnOffset(int i)
+        {
+            return indexEntry.position + indexes.get(i).offset;
+        }
+
         public int blocksCount()
         {
             return indexes.size();
@@ -470,7 +466,7 @@ abstract class AbstractSSTableIterator implements SliceableUnfilteredRowIterator
 
                 // We have to set the mark, and we have to set it at the beginning of the block. So if we're not at the beginning of the block, this forces us to a weird seek dance.
                 // This can only happen when reading old files, however.
-                long startOfBlock = blockOffsets[currentIndexIdx];
+                long startOfBlock = columnOffset(currentIndexIdx);
                 long currentFilePointer = reader.file.getFilePointer();
                 if (startOfBlock == currentFilePointer)
                 {

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
index e95af29..74a0fc5 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
@@ -19,7 +19,6 @@ package org.apache.cassandra.io.sstable;
 
 import java.io.*;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.List;
 
 import org.apache.cassandra.config.CFMetaData;
@@ -28,15 +27,17 @@ import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.*;
 
 /**
  * Provides helper to serialize, deserialize and use column indexes.
  */
-public class IndexHelper
+public final class IndexHelper
 {
+    private IndexHelper()
+    {
+    }
+
     /**
      * The index of the IndexInfo in which a scan starting with @name should begin.
      *
@@ -50,7 +51,7 @@ public class IndexHelper
      */
     public static int indexFor(ClusteringPrefix name, List<IndexInfo> indexList, ClusteringComparator comparator, boolean reversed, int lastIndex)
     {
-        IndexInfo target = new IndexInfo(name, name, 0, null);
+        IndexInfo target = new IndexInfo(name, name, 0, 0, null);
         /*
         Take the example from the unit test, and say your index looks like this:
         [0..5][10..15][20..25]
@@ -87,8 +88,9 @@ public class IndexHelper
 
     public static class IndexInfo
     {
-        private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexInfo(null, null, 0, null));
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexInfo(null, null, 0, 0, null));
 
+        public final long offset;
         public final long width;
         public final ClusteringPrefix firstName;
         public final ClusteringPrefix lastName;
@@ -99,11 +101,13 @@ public class IndexHelper
 
         public IndexInfo(ClusteringPrefix firstName,
                          ClusteringPrefix lastName,
+                         long offset,
                          long width,
                          DeletionTime endOpenMarker)
         {
             this.firstName = firstName;
             this.lastName = lastName;
+            this.offset = offset;
             this.width = width;
             this.endOpenMarker = endOpenMarker;
         }
@@ -114,10 +118,9 @@ public class IndexHelper
             // This is imperfect as users can change the index size and ideally we would save the index size used with each index file
             // to use as base. However, that's a bit more involved a change than we want for now and users very seldom change the index
             // size so using the default is almost surely better than using no base at all.
-            private static final long WIDTH_BASE = 64 * 1024;
+            public static final long WIDTH_BASE = 64 * 1024;
 
-            // TODO: Only public for use in RowIndexEntry for backward compatibility code. Can be made private once backward compatibility is dropped.
-            public final ISerializer<ClusteringPrefix> clusteringSerializer;
+            private final ISerializer<ClusteringPrefix> clusteringSerializer;
             private final Version version;
 
             public Serializer(CFMetaData metadata, Version version, SerializationHeader header)
@@ -132,6 +135,7 @@ public class IndexHelper
 
                 clusteringSerializer.serialize(info.firstName, out);
                 clusteringSerializer.serialize(info.lastName, out);
+                out.writeUnsignedVInt(info.offset);
                 out.writeVInt(info.width - WIDTH_BASE);
 
                 out.writeBoolean(info.endOpenMarker != null);
@@ -143,20 +147,22 @@ public class IndexHelper
             {
                 ClusteringPrefix firstName = clusteringSerializer.deserialize(in);
                 ClusteringPrefix lastName = clusteringSerializer.deserialize(in);
+                long offset;
                 long width;
                 DeletionTime endOpenMarker = null;
                 if (version.storeRows())
                 {
+                    offset = in.readUnsignedVInt();
                     width = in.readVInt() + WIDTH_BASE;
                     if (in.readBoolean())
                         endOpenMarker = DeletionTime.serializer.deserialize(in);
                 }
                 else
                 {
-                    in.readLong(); // skip offset
+                    offset = in.readLong();
                     width = in.readLong();
                 }
-                return new IndexInfo(firstName, lastName, width, endOpenMarker);
+                return new IndexInfo(firstName, lastName, offset, width, endOpenMarker);
             }
 
             public long serializedSize(IndexInfo info)
@@ -165,6 +171,7 @@ public class IndexHelper
 
                 long size = clusteringSerializer.serializedSize(info.firstName)
                           + clusteringSerializer.serializedSize(info.lastName)
+                          + TypeSizes.sizeofUnsignedVInt(info.offset)
                           + TypeSizes.sizeofVInt(info.width - WIDTH_BASE)
                           + TypeSizes.sizeof(info.endOpenMarker != null);
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
index 1a4ac21..fd413fd 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
@@ -50,7 +50,7 @@ import static org.apache.cassandra.dht.AbstractBounds.minRight;
 
 public class BigTableScanner implements ISSTableScanner
 {
-    private AtomicBoolean isClosed = new AtomicBoolean(false);
+    private final AtomicBoolean isClosed = new AtomicBoolean(false);
     protected final RandomAccessReader dfile;
     protected final RandomAccessReader ifile;
     public final SSTableReader sstable;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
index d9a290e..c4d6f54 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
@@ -114,6 +114,16 @@ public class DataOutputBuffer extends BufferedDataOutputStreamPlus
         return buffer.position();
     }
 
+    public boolean hasFilePointer()
+    {
+        return true;
+    }
+
+    public long getFilePointer()
+    {
+        return getLength();
+    }
+
     public byte[] toByteArray()
     {
         ByteBuffer buffer = buffer();

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
index 5f3f384..551d386 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
@@ -60,4 +60,13 @@ public interface DataOutputPlus extends DataOutput
         VIntCoding.writeUnsignedVInt(i, this);
     }
 
+    default long getFilePointer()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    default boolean hasFilePointer()
+    {
+        return false;
+    }
 }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/src/java/org/apache/cassandra/io/util/SequentialWriter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index 0375e23..1814aca 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
@@ -253,6 +253,11 @@ public class SequentialWriter extends BufferedDataOutputStreamPlus implements Tr
             runPostFlush.run();
     }
 
+    public boolean hasFilePointer()
+    {
+        return true;
+    }
+
     public long getFilePointer()
     {
         return current();

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db
index 4955d4b..3ef3cfe 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db
index f4ce9a9..32c6650 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32
index 7eb78a6..dccca14 100644
--- a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32
@@ -1 +1 @@
-3136403356
\ No newline at end of file
+8667123
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db
index cffcec7..7acec1c 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db
index 7fdadbd..9c43cdf 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db
index 11c796b..d9beadf 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db
index 54c6779..a3add4a 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32
index 00efe75..ff76dda 100644
--- a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32
@@ -1 +1 @@
-4233302171
\ No newline at end of file
+1871673227
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db
index d703649..0e0bf4d 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db
index 1e23010..6a9adbf 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db
index e1c7a53..3312ada 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32
index 0c67ac7..8960de1 100644
--- a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32
@@ -1 +1 @@
-2379822960
\ No newline at end of file
+4191595388
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db
index 96552fd..cad7de8 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db
index afc7092..221484e 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32
index 15dd21c..6ac1679 100644
--- a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32
@@ -1 +1 @@
-2816492267
\ No newline at end of file
+2738692503
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db
----------------------------------------------------------------------
diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db
index 9e0a949..a0ee5b7 100644
Binary files a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db differ

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
new file mode 100644
index 0000000..9fef63f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.metrics.CacheMetrics;
+import org.apache.cassandra.metrics.CassandraMetricsRegistry;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.service.StorageService;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class KeyCacheCqlTest extends CQLTester
+{
+
+    // Column definitions shared by the key-cache test tables in this class.
+    // The string ends with a comma because each test appends its own PRIMARY KEY clause.
+    static final String commonColumnsDef =
+    "part_key_a     int," +
+    "part_key_b     text," +
+    "clust_key_a    int," +
+    "clust_key_b    text," +
+    "clust_key_c    frozen<list<text>>," + // to make it really big
+    "col_text       text," +
+    "col_int        int," +
+    "col_long       bigint," +
+    "col_blob       blob,";
+    // Column list used by the INSERT statement in insertData().
+    // col_blob is declared in commonColumnsDef but is not part of this list.
+    static final String commonColumns =
+    "part_key_a," +
+    "part_key_b," +
+    "clust_key_a," +
+    "clust_key_b," +
+    "clust_key_c," + // to make it really big
+    "col_text," +
+    "col_int," +
+    "col_long";
+
+    // 1200 chars (12 lines of 100 digits each); prefix of every value built by makeStringValue()
+    static final String longString = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
+
+    /**
+     * Fills a table with 20 partitions of 10 x 10 clustering rows, flushes the keyspace and then
+     * verifies that partition, {@code ck1} and {@code ck2} restrictions ({@code =}, {@code <},
+     * {@code >}, {@code <=}, {@code >=}) return exactly the expected rows in clustering order.
+     */
+    @Test
+    public void testSliceQueries() throws Throwable
+    {
+        final int partitions = 20;
+        final int ckCount = 10;
+
+        createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, val text, vpk text, vck1 int, vck2 int, PRIMARY KEY (pk, ck1, ck2))");
+
+        final String insert = "INSERT INTO %s (pk, ck1, ck2, val, vpk, vck1, vck2) VALUES (?, ?, ?, ?, ?, ?, ?)";
+        for (int p = 0; p < partitions; p++)
+        {
+            String key = Integer.toString(p);
+            for (int a = 0; a < ckCount; a++)
+                for (int b = 0; b < ckCount; b++)
+                    execute(insert, key, a, b, makeStringValue(key, a, b), key, a, b);
+        }
+
+        StorageService.instance.forceKeyspaceFlush(KEYSPACE);
+
+        // query prefixes; the comparison operator and bind marker are appended per assertion
+        final String byPk = "SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=?";
+        final String byCk1 = byPk + " AND ck1";
+        final String byCk2 = byCk1 + "=? AND ck2";
+
+        for (int p = 0; p < partitions; p++)
+        {
+            String key = Integer.toString(p);
+
+            // whole partition
+            assertClusterRows(execute(byPk, key), key, 0, ckCount, 0, ckCount);
+
+            for (int a = 0; a < ckCount; a++)
+            {
+                // restrictions on the first clustering column only
+                assertClusterRows(execute(byCk1 + "=?", key, a), key, a, a + 1, 0, ckCount);
+
+                assertClusterRows(execute(byCk1 + "<?", key, a), key, 0, a, 0, ckCount);
+                assertClusterRows(execute(byCk1 + ">?", key, a), key, a + 1, ckCount, 0, ckCount);
+                assertClusterRows(execute(byCk1 + "<=?", key, a), key, 0, a + 1, 0, ckCount);
+                assertClusterRows(execute(byCk1 + ">=?", key, a), key, a, ckCount, 0, ckCount);
+
+                for (int b = 0; b < ckCount; b++)
+                {
+                    // fully specified clustering key: exactly one row
+                    assertRows(execute(byCk2 + "=?", key, a, b),
+                               new Object[]{ makeStringValue(key, a, b), key, a, b });
+
+                    // first clustering column fixed, second one restricted
+                    assertClusterRows(execute(byCk2 + "<?", key, a, b), key, a, a + 1, 0, b);
+                    assertClusterRows(execute(byCk2 + ">?", key, a, b), key, a, a + 1, b + 1, ckCount);
+                    assertClusterRows(execute(byCk2 + "<=?", key, a, b), key, a, a + 1, 0, b + 1);
+                    assertClusterRows(execute(byCk2 + ">=?", key, a, b), key, a, a + 1, b, ckCount);
+                }
+            }
+        }
+    }
+
+    /**
+     * Asserts that {@code rows} contains exactly the rows of partition {@code pk} whose {@code ck1}
+     * lies in {@code [ck1from, ck1to)} and whose {@code ck2} lies in {@code [ck2from, ck2to)},
+     * ordered by {@code ck1} and then {@code ck2}.
+     *
+     * @param rows    result set to verify; must expose the columns val, vpk, vck1 and vck2
+     * @param pk      partition key the rows were written with
+     * @param ck1from first expected ck1 value (inclusive)
+     * @param ck1to   end of the expected ck1 range (exclusive)
+     * @param ck2from first expected ck2 value (inclusive)
+     * @param ck2to   end of the expected ck2 range (exclusive)
+     */
+    private static void assertClusterRows(UntypedResultSet rows, String pk, int ck1from, int ck1to, int ck2from, int ck2to)
+    {
+        String info = "pk=" + pk + ", ck1from=" + ck1from + ", ck1to=" + ck1to + ", ck2from=" + ck2from + ", ck2to=" + ck2to;
+        Iterator<UntypedResultSet.Row> iter = rows.iterator();
+        int cnt = 0;
+        // an empty (or inverted) range on either column means that no rows are expected at all
+        int expect = Math.max(0, ck1to - ck1from) * Math.max(0, ck2to - ck2from);
+        for (int ck1 = ck1from; ck1 < ck1to; ck1++)
+        {
+            for (int ck2 = ck2from; ck2 < ck2to; ck2++)
+            {
+                // failing here means the result set ended too early
+                assertTrue("expected " + expect + " rows, but only got " + cnt + " rows for " + info, iter.hasNext());
+                UntypedResultSet.Row row = iter.next();
+                assertEquals(makeStringValue(pk, ck1, ck2), row.getString("val"));
+                assertEquals(pk, row.getString("vpk"));
+                assertEquals(ck1, row.getInt("vck1"));
+                assertEquals(ck2, row.getInt("vck2"));
+                // keep the counter in sync so that the failure messages report real progress
+                cnt++;
+            }
+        }
+        // failing here means the result set contains rows beyond the expected range
+        assertFalse("expected " + expect + " (already got " + cnt + ") rows, but more rows are available for " + info, iter.hasNext());
+    }
+
+    /**
+     * Builds the {@code val} cell of a test row: {@code longString} followed by the
+     * comma-separated partition key and both clustering values.
+     */
+    private static String makeStringValue(String pk, int ck1, int ck2)
+    {
+        StringBuilder value = new StringBuilder(longString);
+        value.append(',').append(pk);
+        value.append(',').append(ck1);
+        value.append(',').append(ck2);
+        return value.toString();
+    }
+
+    /**
+     * Runs the same secondary-index queries twice against a freshly cleared key cache and
+     * checks the key cache hit and request counters after each pass.
+     */
+    @Test
+    public void test2iKeyCachePaths() throws Throwable
+    {
+        final String indexName = "some_index";
+
+        String table = createTable("CREATE TABLE %s ("
+                                   + commonColumnsDef
+                                   + "PRIMARY KEY ((part_key_a, part_key_b),clust_key_a,clust_key_b,clust_key_c))");
+        createIndex("CREATE INDEX " + indexName + " ON %s (col_int)");
+        insertData(table, indexName, true);
+        clearCache();
+
+        // { expected hits, expected requests } after the first and after the second pass
+        final long[][] expectedCounters = { { 0, 210 }, { 200, 420 } };
+
+        for (long[] expected : expectedCounters)
+        {
+            for (int value = 0; value < 10; value++)
+            {
+                // 100 part-keys * 50 clust-keys
+                // indexed on part-key % 10 = 10 index partitions
+                // (50 clust-keys  *  100-part-keys  /  10 possible index-values) = 500
+                int found = execute("SELECT part_key_a FROM %s WHERE col_int = ?", value).size();
+                assertEquals(500, found);
+            }
+
+            CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
+            long hits = metrics.hits.getCount();
+            long requests = metrics.requests.getCount();
+            assertEquals(expected[0], hits);
+            assertEquals(expected[1], requests);
+        }
+    }
+
+    /**
+     * Reads single-row partitions of a table without clustering columns and checks the key cache
+     * hit and request counters after querying the first 10 and then all 100 partitions.
+     */
+    @Test
+    public void testKeyCacheNonClustered() throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s ("
+                                   + commonColumnsDef
+                                   + "PRIMARY KEY ((part_key_a, part_key_b)))");
+        insertData(table, null, false);
+        clearCache();
+
+        final String query = "SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?";
+
+        // first 10 partitions, directly after the cache has been cleared
+        int key = 0;
+        while (key < 10)
+        {
+            assertRows(execute(query, key, Integer.toOctalString(key)),
+                       new Object[]{ key + "-" + 0 });
+            key++;
+        }
+
+        CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
+        long hitCount = metrics.hits.getCount();
+        long requestCount = metrics.requests.getCount();
+        assertEquals(0, hitCount);
+        assertEquals(10, requestCount);
+
+        // all 100 partitions, the first 10 of which have been read above
+        for (key = 0; key < 100; key++)
+        {
+            assertRows(execute(query, key, Integer.toOctalString(key)),
+                       new Object[]{ key + "-" + 0 });
+        }
+
+        // same CacheMetrics instance as above; only the counters are read again
+        hitCount = metrics.hits.getCount();
+        requestCount = metrics.requests.getCount();
+        assertEquals(10, hitCount);
+        assertEquals(120, requestCount);
+    }
+
+    /**
+     * Reads a clustered table first by partition key and then by partition key plus
+     * {@code clust_key_a}, checking the key cache hit and request counters after every phase.
+     */
+    @Test
+    public void testKeyCacheClustered() throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s ("
+                                   + commonColumnsDef
+                                   + "PRIMARY KEY ((part_key_a, part_key_b),clust_key_a,clust_key_b,clust_key_c))");
+        insertData(table, null, true);
+        clearCache();
+
+        final String partitionQuery = "SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?";
+        final String rowQuery = "SELECT col_text, col_long FROM %s WHERE part_key_a = ? AND part_key_b = ? and clust_key_a = ?";
+
+        // query on partition key: two identical passes of 10 queries, each returning 50 rows.
+        // The first pass is expected to produce no hits, the second pass 10 hits.
+        CacheMetrics metrics = null;
+        for (int pass = 0; pass < 2; pass++)
+        {
+            for (int p = 0; p < 10; p++)
+                assertEquals(50, execute(partitionQuery, p, Integer.toOctalString(p)).size());
+
+            metrics = CacheService.instance.keyCache.getMetrics();
+            long hits = metrics.hits.getCount();
+            long requests = metrics.requests.getCount();
+            assertEquals(10 * pass, hits);
+            assertEquals(10 + 10 * pass, requests);
+        }
+
+        // 100 queries - must get a hit in key-cache
+        for (int p = 0; p < 10; p++)
+        {
+            String partKeyB = Integer.toOctalString(p);
+            for (int c = 0; c < 10; c++)
+                assertRows(execute(rowQuery, p, partKeyB, c),
+                           new Object[]{ p + "-" + c, (long) c });
+        }
+
+        metrics = CacheService.instance.keyCache.getMetrics();
+        long hitCount = metrics.hits.getCount();
+        long requestCount = metrics.requests.getCount();
+        assertEquals(10 + 100, hitCount);
+        assertEquals(20 + 100, requestCount);
+
+        // 5000 queries - first 10 partitions already in key cache
+        for (int p = 0; p < 100; p++)
+        {
+            String partKeyB = Integer.toOctalString(p);
+            for (int c = 0; c < 50; c++)
+                assertRows(execute(rowQuery, p, partKeyB, c),
+                           new Object[]{ p + "-" + c, (long) c });
+        }
+
+        // the CacheMetrics instance fetched above is reused here; only the counters are read again
+        hitCount = metrics.hits.getCount();
+        requestCount = metrics.requests.getCount();
+        assertEquals(110 + 4910, hitCount);
+        assertEquals(120 + 5500, requestCount);
+    }
+
+    /**
+     * Inserts 100 partitions split over 10 sstables (a flush follows every 10th partition).
+     * Clustered tables receive 50 CQL rows per partition, non-clustered tables a single row.
+     *
+     * @param table          name of the table to fill
+     * @param index          name of a secondary index to flush together with the table, or {@code null}
+     * @param withClustering whether to write 50 rows per partition instead of one
+     */
+    private void insertData(String table, String index, boolean withClustering) throws Throwable
+    {
+        prepareTable(table);
+
+        final boolean indexed = index != null;
+        if (indexed)
+        {
+            StorageService.instance.disableAutoCompaction(KEYSPACE, table + '.' + index);
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(table).indexManager.getIndexByName(index).getBlockingFlushTask().call();
+        }
+
+        final String insert = "INSERT INTO %s (" + commonColumns + ") VALUES (?, ?, ?, ?, ?, ?, ?, ?)";
+        final int rowsPerPartition = withClustering ? 50 : 1;
+
+        for (int partition = 0; partition < 100; partition++)
+        {
+            String partKeyB = Integer.toOctalString(partition);
+            for (int row = 0; row < rowsPerPartition; row++)
+            {
+                String clustKeyB = Integer.toOctalString(row);
+                execute(insert,
+                        partition, partKeyB,
+                        row, clustKeyB, makeList(clustKeyB),
+                        partition + "-" + row, partition % 10, (long) row);
+            }
+
+            // only every 10th partition is followed by a flush
+            if (partition % 10 != 9)
+                continue;
+
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(table).forceFlush().get();
+            if (indexed)
+                Keyspace.open(KEYSPACE).getColumnFamilyStore(table).indexManager.getIndexByName(index).getBlockingFlushTask().call();
+        }
+    }
+
+    /**
+     * Prepares {@code table} for a test run: disables auto compaction for it, flushes it and
+     * finally truncates it. The three steps are executed in exactly this order.
+     */
+    private static void prepareTable(String table) throws IOException, InterruptedException, java.util.concurrent.ExecutionException
+    {
+        StorageService.instance.disableAutoCompaction(KEYSPACE, table);
+        // get() blocks until the flush has completed before the table is truncated
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(table).forceFlush().get();
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(table).truncateBlocking();
+    }
+
+    /**
+     * Returns a new list with 50 elements: {@code value} followed by the element index (0 to 49).
+     */
+    private static List<String> makeList(String value)
+    {
+        final int size = 50;
+        List<String> result = new ArrayList<>(size);
+        int suffix = 0;
+        while (result.size() < size)
+            result.add(value + suffix++);
+        return result;
+    }
+
+    /**
+     * Unregisters every metric name known to the metrics registry, clears the key cache and then
+     * asserts that the key cache metrics report no entries, hits, requests or size.
+     */
+    private static void clearCache()
+    {
+        for (String name : CassandraMetricsRegistry.Metrics.getNames())
+            CassandraMetricsRegistry.Metrics.remove(name);
+
+        CacheService.instance.keyCache.clear();
+
+        CacheMetrics cleared = CacheService.instance.keyCache.getMetrics();
+        assertEquals(0, cleared.entries.getValue().intValue());
+        assertEquals(0L, cleared.hits.getCount());
+        assertEquals(0L, cleared.requests.getCount());
+        assertEquals(0L, cleared.size.getValue().longValue());
+    }
+}

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
index e4ae9ac..d004d45 100644
--- a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
@@ -18,16 +18,27 @@
 package org.apache.cassandra.db;
 
 import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.List;
 
 import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.utils.FBUtilities;
+
+import org.junit.Assert;
 import org.junit.Test;
 
 import static junit.framework.Assert.assertEquals;
@@ -35,6 +46,70 @@ import static junit.framework.Assert.assertTrue;
 
 public class RowIndexEntryTest extends CQLTester
 {
+    // Clustering layout with a single LongType component and the comparator built from it;
+    // both are used by testArtificialIndexOf().
+    private static final List<AbstractType<?>> clusterTypes = Collections.<AbstractType<?>>singletonList(LongType.instance);
+    private static final ClusteringComparator comp = new ClusteringComparator(clusterTypes);
+    // Shorthand: builds a clustering for the given long via Util.clustering() and the comparator above.
+    private static ClusteringPrefix cn(long l)
+    {
+        return Util.clustering(comp, l);
+    }
+
+    /**
+     * Hand-assembles a serialized row index entry containing three index blocks
+     * ([0, 5], [10, 15] and [20, 25]), deserializes it through {@code RowIndexEntry.Serializer}
+     * and checks the results of {@code IndexHelper.indexFor()} on the deserialized columns index
+     * for both values of the boolean argument and for several values of the last argument.
+     */
+    @Test
+    public void testArtificialIndexOf() throws IOException
+    {
+        CFMetaData cfMeta = CFMetaData.compile("CREATE TABLE pipe.dev_null (pk bigint, ck bigint, val text, PRIMARY KEY(pk, ck))", "foo");
+
+        DeletionTime deletionInfo = new DeletionTime(FBUtilities.timestampMicros(), FBUtilities.nowInSeconds());
+
+        SerializationHeader header = new SerializationHeader(cfMeta, cfMeta.partitionColumns(), EncodingStats.NO_STATS);
+        IndexHelper.IndexInfo.Serializer indexSerializer = new IndexHelper.IndexInfo.Serializer(cfMeta, BigFormat.latestVersion, header);
+
+        // Inner payload, written in the same order in which serializationCheck() reads it back:
+        // header length (0), deletion time, number of entries (3), the entries, one int per entry.
+        DataOutputBuffer dob = new DataOutputBuffer();
+        dob.writeUnsignedVInt(0);
+        DeletionTime.serializer.serialize(DeletionTime.LIVE, dob);
+        dob.writeUnsignedVInt(3);
+        // NOTE(review): off0..off2 are absolute lengths of dob (they include the fields written above),
+        // whereas serializationCheck() expects offsets relative to the start of the first entry.
+        // Confirm whether the deserializer evaluates these values at all.
+        int off0 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(0L), cn(5L), 0, 0, deletionInfo), dob);
+        int off1 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(10L), cn(15L), 0, 0, deletionInfo), dob);
+        int off2 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(20L), cn(25L), 0, 0, deletionInfo), dob);
+        dob.writeInt(off0);
+        dob.writeInt(off1);
+        dob.writeInt(off2);
+
+        // Outer entry: position (42), byte length of the payload above, then the payload itself.
+        @SuppressWarnings("resource") DataOutputBuffer dobRie = new DataOutputBuffer();
+        dobRie.writeUnsignedVInt(42L);
+        dobRie.writeUnsignedVInt(dob.getLength());
+        dobRie.write(dob.buffer());
+
+        ByteBuffer buf = dobRie.buffer();
+
+        RowIndexEntry<IndexHelper.IndexInfo> rie = new RowIndexEntry.Serializer(cfMeta, BigFormat.latestVersion, header).deserialize(new DataInputBuffer(buf, false));
+
+        Assert.assertEquals(42L, rie.position);
+
+        // lookups with the boolean argument set to false
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(-1L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(17L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 0));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 2));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 3));
+
+        // lookups with the boolean argument set to true
+        Assert.assertEquals(-1, IndexHelper.indexFor(cn(-1L), rie.columnsIndex(), comp, true, -1));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, true, 2));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(17L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 4));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, true, 2));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 1));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 2));
+    }
+
     @Test
     public void testSerializedSize() throws Throwable
     {
@@ -55,7 +130,6 @@ public class RowIndexEntryTest extends CQLTester
         for (int i = 0; i <= DatabaseDescriptor.getColumnIndexSize() / 4; i++)
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, "" + i, i);
 
-        buffer = new DataOutputBuffer();
         ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build());
 
         File tempFile = File.createTempFile("row_index_entry_test", null);
@@ -63,11 +137,77 @@ public class RowIndexEntryTest extends CQLTester
         SequentialWriter writer = SequentialWriter.open(tempFile);
         ColumnIndex columnIndex = ColumnIndex.writeAndBuildIndex(partition.unfilteredIterator(), writer, header, BigFormat.latestVersion);
         RowIndexEntry<IndexHelper.IndexInfo> withIndex = RowIndexEntry.create(0xdeadbeef, DeletionTime.LIVE, columnIndex);
+        IndexHelper.IndexInfo.Serializer indexSerializer = new IndexHelper.IndexInfo.Serializer(cfs.metadata, BigFormat.latestVersion, header);
 
         // sanity check
         assertTrue(columnIndex.columnsIndex.size() >= 3);
 
+        buffer = new DataOutputBuffer();
         serializer.serialize(withIndex, buffer);
         assertEquals(buffer.getLength(), serializer.serializedSize(withIndex));
+
+        // serialization check
+
+        ByteBuffer bb = buffer.buffer();
+        DataInputBuffer input = new DataInputBuffer(bb, false);
+        serializationCheck(withIndex, indexSerializer, bb, input);
+
+        // test with an output stream that doesn't support a file-pointer
+        buffer = new DataOutputBuffer()
+        {
+            public boolean hasFilePointer()
+            {
+                return false;
+            }
+
+            public long getFilePointer()
+            {
+                throw new UnsupportedOperationException();
+            }
+        };
+        serializer.serialize(withIndex, buffer);
+        bb = buffer.buffer();
+        input = new DataInputBuffer(bb, false);
+        serializationCheck(withIndex, indexSerializer, bb, input);
+
+        //
+
+        bb = buffer.buffer();
+        input = new DataInputBuffer(bb, false);
+        RowIndexEntry.Serializer.skip(input, BigFormat.latestVersion);
+        Assert.assertEquals(0, bb.remaining());
+    }
+
+    /**
+     * Reads a serialized row index entry field by field and compares every field with
+     * {@code withIndex}: position, promoted size, header length, deletion time, number of index
+     * entries, the entries themselves and finally one int offset per entry. The buffer must be
+     * fully consumed afterwards.
+     *
+     * @param withIndex       the entry that was serialized
+     * @param indexSerializer serializer used to read the individual index entries and to compute sizes
+     * @param bb              buffer holding the serialized form
+     * @param input           input created over {@code bb}
+     */
+    private void serializationCheck(RowIndexEntry<IndexHelper.IndexInfo> withIndex, IndexHelper.IndexInfo.Serializer indexSerializer, ByteBuffer bb, DataInputBuffer input) throws IOException
+    {
+        // 0xdeadbeef is an int literal that is widened to long for the comparison
+        Assert.assertEquals(0xdeadbeef, input.readUnsignedVInt());
+        Assert.assertEquals(withIndex.promotedSize(indexSerializer), input.readUnsignedVInt());
+
+        Assert.assertEquals(withIndex.headerLength(), input.readUnsignedVInt());
+        Assert.assertEquals(withIndex.deletionTime(), DeletionTime.serializer.deserialize(input));
+        Assert.assertEquals(withIndex.columnsIndex().size(), input.readUnsignedVInt());
+
+        // NOTE(review): the position bookkeeping below assumes that reading from input advances
+        // bb.position() - confirm against DataInputBuffer(ByteBuffer, boolean).
+        int offset = bb.position();
+        int[] offsets = new int[withIndex.columnsIndex().size()];
+        for (int i = 0; i < withIndex.columnsIndex().size(); i++)
+        {
+            int pos = bb.position();
+            // offset of this entry relative to the start of the first entry
+            offsets[i] = pos - offset;
+            IndexHelper.IndexInfo info = indexSerializer.deserialize(input);
+            int end = bb.position();
+
+            // the number of bytes consumed must match the size reported by the serializer
+            Assert.assertEquals(indexSerializer.serializedSize(info), end - pos);
+
+            Assert.assertEquals(withIndex.columnsIndex().get(i).offset, info.offset);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).width, info.width);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).endOpenMarker, info.endOpenMarker);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).firstName, info.firstName);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).lastName, info.lastName);
+        }
+
+        // the entries are followed by one int per entry holding the offsets recorded above
+        for (int i = 0; i < withIndex.columnsIndex().size(); i++)
+            Assert.assertEquals(offsets[i], input.readInt());
+
+        Assert.assertEquals(0, bb.remaining());
     }
 }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/51b1a1c6/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
index 2c967d0..e6328de 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
@@ -29,7 +29,6 @@ import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.ClusteringPrefix;
 import org.apache.cassandra.db.DeletionTime;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.IntegerType;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -51,10 +50,9 @@ public class IndexHelperTest
         DeletionTime deletionInfo = new DeletionTime(FBUtilities.timestampMicros(), FBUtilities.nowInSeconds());
 
         List<IndexInfo> indexes = new ArrayList<>();
-        indexes.add(new IndexInfo(cn(0L), cn(5L), 0, deletionInfo));
-        indexes.add(new IndexInfo(cn(10L), cn(15L), 0,deletionInfo));
-        indexes.add(new IndexInfo(cn(20L), cn(25L), 0, deletionInfo));
-
+        indexes.add(new IndexInfo(cn(0L), cn(5L), 0, 0, deletionInfo));
+        indexes.add(new IndexInfo(cn(10L), cn(15L), 0, 0, deletionInfo));
+        indexes.add(new IndexInfo(cn(20L), cn(25L), 0, 0, deletionInfo));
 
         assertEquals(0, IndexHelper.indexFor(cn(-1L), indexes, comp, false, -1));
         assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, false, -1));