You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2014/04/03 03:37:25 UTC

git commit: optimize fetching multiple cells by name from CF patch by Benedict Elliott Smith; reviewed by jbellis and ayeschenko for CASSANDRA-6933

Repository: cassandra
Updated Branches:
  refs/heads/trunk 6471d0caa -> 6e9140ab6


optimize fetching multiple cells by name from CF
patch by Benedict Elliott Smith; reviewed by jbellis and ayeschenko for CASSANDRA-6933


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/6e9140ab
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/6e9140ab
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/6e9140ab

Branch: refs/heads/trunk
Commit: 6e9140ab6004e6eb6eac6f05073d158a40c0645f
Parents: 6471d0c
Author: Jonathan Ellis <jb...@apache.org>
Authored: Wed Apr 2 20:36:56 2014 -0500
Committer: Jonathan Ellis <jb...@apache.org>
Committed: Wed Apr 2 20:37:06 2014 -0500

----------------------------------------------------------------------
 .../cassandra/db/ArrayBackedSortedColumns.java  | 52 +++++++++++
 .../apache/cassandra/db/AtomicBTreeColumns.java |  7 ++
 .../cassandra/db/CollationController.java       |  6 +-
 .../org/apache/cassandra/db/ColumnFamily.java   |  2 +
 .../cassandra/db/filter/NamesQueryFilter.java   | 15 +--
 .../cassandra/service/AbstractRowResolver.java  |  7 +-
 .../apache/cassandra/utils/SearchIterator.java  | 26 ++++++
 .../utils/btree/BTreeSearchIterator.java        | 67 ++++++++++++++
 .../apache/cassandra/utils/btree/Cursor.java    | 12 +--
 .../org/apache/cassandra/utils/btree/Path.java  | 46 +++++++---
 .../apache/cassandra/utils/LongBTreeTest.java   | 96 ++++++++++++++++++++
 .../db/ArrayBackedSortedColumnsTest.java        | 43 +++++++++
 12 files changed, 349 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java b/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
index e04867a..0fe4448 100644
--- a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
+++ b/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
@@ -35,6 +35,7 @@ import org.apache.cassandra.db.composites.CellNameType;
 import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.SearchIterator;
 
 /**
  * A ColumnFamily backed by an array.
@@ -437,6 +438,57 @@ public class ArrayBackedSortedColumns extends ColumnFamily
         return new SlicesIterator(Arrays.asList(cells).subList(0, size), getComparator(), slices, !reversed);
     }
 
+    public SearchIterator<CellName, Cell> searchIterator()
+    {
+        maybeSortCells();
+        return new SearchIterator<CellName, Cell>()
+        {
+            // the first index that we could find the next key at, i.e. one larger
+            // than the last key's location
+            private int i = 0;
+
+            // We assume a uniform distribution of keys,
+            // so we keep track of how many keys were skipped to satisfy last lookup, and only look at twice that
+            // many keys for next lookup initially, extending to whole range only if we couldn't find it in that subrange
+            private int range = size / 2;
+
+            public boolean hasNext()
+            {
+                return i < size;
+            }
+
+            public Cell next(CellName name)
+            {
+                assert sortedSize == size;
+                assert hasNext();
+
+                // optimize for runs of sequential matches, as in CollationController
+                // checking to see if we've found the desired cells yet (CASSANDRA-6933)
+                if (metadata.comparator.compare(name, cells[i].name()) == 0)
+                    return cells[i++];
+
+                // use range to manually force a better bsearch "pivot" by breaking it into two calls:
+                // first for i..i+range, then i+range..size if necessary.
+                // https://issues.apache.org/jira/browse/CASSANDRA-6933?focusedCommentId=13958264&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13958264
+                int limit = Math.min(size, i + range);
+                int i2 = binarySearch(i + 1, limit, name, internalComparator());
+                if (-1 - i2 == limit)
+                    i2 = binarySearch(limit, size, name, internalComparator());
+                // i2 can't be zero since we already checked cells[i] above
+                if (i2 > 0)
+                {
+                    range = i2 - i;
+                    i = i2 + 1;
+                    return cells[i2];
+                }
+                i2 = -1 - i2;
+                range = i2 - i;
+                i = i2;
+                return null;
+            }
+        };
+    }
+
     private static class SlicesIterator extends AbstractIterator<Cell>
     {
         private final List<Cell> cells;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java b/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
index 8cbeb83..8419235 100644
--- a/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
+++ b/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
@@ -36,7 +36,9 @@ import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.SearchIterator;
 import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSearchIterator;
 import org.apache.cassandra.utils.btree.BTreeSet;
 import org.apache.cassandra.utils.btree.UpdateFunction;
 
@@ -119,6 +121,11 @@ public class AtomicBTreeColumns extends ColumnFamily
         delete(new DeletionInfo(tombstone, getComparator()));
     }
 
+    public SearchIterator<CellName, Cell> searchIterator()
+    {
+        return new BTreeSearchIterator<>(ref.tree, asymmetricComparator());
+    }
+
     public void delete(DeletionInfo info)
     {
         if (info.isLive())

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/db/CollationController.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java
index 151a7c5..74113e0 100644
--- a/src/java/org/apache/cassandra/db/CollationController.java
+++ b/src/java/org/apache/cassandra/db/CollationController.java
@@ -34,6 +34,7 @@ import org.apache.cassandra.db.marshal.CounterColumnType;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.SearchIterator;
 import org.apache.cassandra.utils.memory.HeapAllocator;
 
 public class CollationController
@@ -171,10 +172,11 @@ public class CollationController
         if (container == null)
             return;
 
-        for (Iterator<CellName> iterator = ((NamesQueryFilter) filter.filter).columns.iterator(); iterator.hasNext(); )
+        SearchIterator<CellName, Cell> searchIter = container.searchIterator();
+        for (Iterator<CellName> iterator = ((NamesQueryFilter) filter.filter).columns.iterator(); iterator.hasNext() && searchIter.hasNext(); )
         {
             CellName filterColumn = iterator.next();
-            Cell cell = container.getColumn(filterColumn);
+            Cell cell = searchIter.next(filterColumn);
             if (cell != null && cell.timestamp() > sstableTimestamp)
                 iterator.remove();
         }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/db/ColumnFamily.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnFamily.java b/src/java/org/apache/cassandra/db/ColumnFamily.java
index e7aab37..e9eb05a 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamily.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamily.java
@@ -189,6 +189,8 @@ public abstract class ColumnFamily implements Iterable<Cell>, IRowCacheEntry
     public abstract void delete(DeletionTime deletionTime);
     protected abstract void delete(RangeTombstone tombstone);
 
+    public abstract SearchIterator<CellName, Cell> searchIterator();
+
     /**
      * Purges top-level and range tombstones whose localDeletionTime is older than gcBefore.
      * @param gcBefore a timestamp (in seconds) before which tombstones should be purged

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java b/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
index d6d1332..03e3f12 100644
--- a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
@@ -39,6 +39,7 @@ import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.SSTableReader;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.utils.SearchIterator;
 
 public class NamesQueryFilter implements IDiskAtomFilter
 {
@@ -183,13 +184,15 @@ public class NamesQueryFilter implements IDiskAtomFilter
     {
         private final ColumnFamily cf;
         private final DecoratedKey key;
-        private final Iterator<CellName> iter;
+        private final Iterator<CellName> names;
+        private final SearchIterator<CellName, Cell> cells;
 
-        public ByNameColumnIterator(Iterator<CellName> iter, ColumnFamily cf, DecoratedKey key)
+        public ByNameColumnIterator(Iterator<CellName> names, ColumnFamily cf, DecoratedKey key)
         {
-            this.iter = iter;
+            this.names = names;
             this.cf = cf;
             this.key = key;
+            this.cells = cf.searchIterator();
         }
 
         public ColumnFamily getColumnFamily()
@@ -204,10 +207,10 @@ public class NamesQueryFilter implements IDiskAtomFilter
 
         protected OnDiskAtom computeNext()
         {
-            while (iter.hasNext())
+            while (names.hasNext() && cells.hasNext())
             {
-                CellName current = iter.next();
-                Cell cell = cf.getColumn(current);
+                CellName current = names.next();
+                Cell cell = cells.next(current);
                 if (cell != null)
                     return cell;
             }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/service/AbstractRowResolver.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/AbstractRowResolver.java b/src/java/org/apache/cassandra/service/AbstractRowResolver.java
index 47a00da..e27dc00 100644
--- a/src/java/org/apache/cassandra/service/AbstractRowResolver.java
+++ b/src/java/org/apache/cassandra/service/AbstractRowResolver.java
@@ -18,9 +18,10 @@
 package org.apache.cassandra.service;
 
 import java.nio.ByteBuffer;
-import java.util.Set;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 
-import org.cliffc.high_scale_lib.NonBlockingHashSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -34,7 +35,7 @@ public abstract class AbstractRowResolver implements IResponseResolver<ReadRespo
     protected static final Logger logger = LoggerFactory.getLogger(AbstractRowResolver.class);
 
     protected final String keyspaceName;
-    protected final Set<MessageIn<ReadResponse>> replies = new NonBlockingHashSet<MessageIn<ReadResponse>>();
+    protected final List<MessageIn<ReadResponse>> replies = Collections.synchronizedList(new ArrayList<MessageIn<ReadResponse>>());
     protected final DecoratedKey key;
 
     public AbstractRowResolver(ByteBuffer key, String keyspaceName)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/utils/SearchIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/SearchIterator.java b/src/java/org/apache/cassandra/utils/SearchIterator.java
new file mode 100644
index 0000000..004b02a
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/SearchIterator.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+public interface SearchIterator<K, V>
+{
+
+    public boolean hasNext();
+    public V next(K key);
+
+}

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java b/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java
new file mode 100644
index 0000000..7a83238
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Comparator;
+
+import org.apache.cassandra.utils.SearchIterator;
+
+import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
+
+public class BTreeSearchIterator<CK, K extends CK, V> extends Path implements SearchIterator<K, V>
+{
+
+    final Comparator<CK> comparator;
+    public BTreeSearchIterator(Object[] btree, Comparator<CK> comparator)
+    {
+        init(btree);
+        this.comparator = comparator;
+    }
+
+    public V next(K target)
+    {
+        while (depth > 0)
+        {
+            byte successorParentDepth = findSuccessorParentDepth();
+            if (successorParentDepth < 0)
+                break; // we're in last section of tree, so can only search down
+            int successorParentIndex = indexes[successorParentDepth] + 1;
+            Object[] successParentNode = path[successorParentDepth];
+            Object successorParentKey = successParentNode[successorParentIndex];
+            int c = BTree.compare(comparator, target, successorParentKey);
+            if (c < 0)
+                break;
+            if (c == 0)
+            {
+                depth = successorParentDepth;
+                indexes[successorParentDepth]++;
+                return (V) successorParentKey;
+            }
+            depth = successorParentDepth;
+            indexes[successorParentDepth]++;
+        }
+        if (find(comparator, target, Op.CEIL, true))
+            return (V) currentKey();
+        return null;
+    }
+
+    public boolean hasNext()
+    {
+        return depth != 0 || indexes[0] != getKeyEnd(path[0]);
+    }
+}

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/utils/btree/Cursor.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/btree/Cursor.java b/src/java/org/apache/cassandra/utils/btree/Cursor.java
index bc88442..132a776 100644
--- a/src/java/org/apache/cassandra/utils/btree/Cursor.java
+++ b/src/java/org/apache/cassandra/utils/btree/Cursor.java
@@ -93,7 +93,7 @@ public final class Cursor<V> extends Path implements Iterator<V>
 
     private void _reset(Object[] btree, Comparator<V> comparator, Object lowerBound, boolean inclusiveLowerBound, Object upperBound, boolean inclusiveUpperBound, boolean forwards)
     {
-        ensureDepth(btree);
+        init(btree);
         if (lowerBound == null)
             lowerBound = NEGATIVE_INFINITY;
         if (upperBound == null)
@@ -101,16 +101,16 @@ public final class Cursor<V> extends Path implements Iterator<V>
 
         this.forwards = forwards;
 
-        Path findLast = new Path(this.path.length);
+        Path findLast = new Path(this.path.length, btree);
         if (forwards)
         {
-            findLast.find(btree, comparator, upperBound, inclusiveUpperBound ? Op.HIGHER : Op.CEIL, true);
-            find(btree, comparator, lowerBound, inclusiveLowerBound ? Op.CEIL : Op.HIGHER, true);
+            findLast.find(comparator, upperBound, inclusiveUpperBound ? Op.HIGHER : Op.CEIL, true);
+            find(comparator, lowerBound, inclusiveLowerBound ? Op.CEIL : Op.HIGHER, true);
         }
         else
         {
-            findLast.find(btree, comparator, lowerBound, inclusiveLowerBound ? Op.LOWER : Op.FLOOR, false);
-            find(btree, comparator, upperBound, inclusiveUpperBound ? Op.FLOOR : Op.LOWER, false);
+            findLast.find(comparator, lowerBound, inclusiveLowerBound ? Op.LOWER : Op.FLOOR, false);
+            find(comparator, upperBound, inclusiveUpperBound ? Op.FLOOR : Op.LOWER, false);
         }
         int c = this.compareTo(findLast, forwards);
         if (forwards ? c > 0 : c < 0)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/src/java/org/apache/cassandra/utils/btree/Path.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/btree/Path.java b/src/java/org/apache/cassandra/utils/btree/Path.java
index 148c713..49e2d4b 100644
--- a/src/java/org/apache/cassandra/utils/btree/Path.java
+++ b/src/java/org/apache/cassandra/utils/btree/Path.java
@@ -34,7 +34,7 @@ import static org.apache.cassandra.utils.btree.BTree.isLeaf;
  *
  * Path is only intended to be used via Cursor.
  */
-class Path
+public class Path<V>
 {
     // operations corresponding to the ones in NavigableSet
     static enum Op
@@ -50,16 +50,17 @@ class Path
     // the index within the node of our path at a given depth
     byte[] indexes;
     // current depth.  nothing in path[i] for i > depth is valid.
-    byte depth = -1;
+    byte depth;
 
     Path() { }
-    Path(int depth)
+    Path(int depth, Object[] btree)
     {
         this.path = new Object[depth][];
         this.indexes = new byte[depth];
+        this.path[0] = btree;
     }
 
-    void ensureDepth(Object[] btree)
+    void init(Object[] btree)
     {
         int depth = BTree.depth(btree);
         if (path == null || path.length < depth)
@@ -67,32 +68,36 @@ class Path
             path = new Object[depth][];
             indexes = new byte[depth];
         }
+        path[0] = btree;
     }
 
     /**
      * Find the provided key in the tree rooted at node, and store the root to it in the path
      *
-     * @param node       the tree to search in
      * @param comparator the comparator defining the order on the tree
      * @param target     the key to search for
      * @param mode       the type of search to perform
      * @param forwards   if the path should be setup for forward or backward iteration
      * @param <V>
      */
-    <V> void find(Object[] node, Comparator<V> comparator, Object target, Op mode, boolean forwards)
+    <V> boolean find(Comparator<V> comparator, Object target, Op mode, boolean forwards)
     {
         // TODO : should not require parameter 'forwards' - consider modifying index to represent both
         // child and key position, as opposed to just key position (which necessitates a different value depending
         // on which direction you're moving in. Prerequisite for making Path public and using to implement general
         // search
 
-        depth = -1;
+        Object[] node = path[depth];
+        int lb = indexes[depth];
+        assert lb == 0 || forwards;
+        pop();
         while (true)
         {
             int keyEnd = getKeyEnd(node);
 
             // search for the target in the current node
-            int i = BTree.find(comparator, target, node, 0, keyEnd);
+            int i = BTree.find(comparator, target, node, lb, keyEnd);
+            lb = 0;
             if (i >= 0)
             {
                 // exact match. transform exclusive bounds into the correct index by moving back or forwards one
@@ -105,7 +110,7 @@ class Path
                     case LOWER:
                         predecessor();
                 }
-                return;
+                return true;
             }
 
             // traverse into the appropriate child
@@ -141,16 +146,16 @@ class Path
                 push(node, i);
             }
 
-            return;
+            return false;
         }
     }
 
-    private boolean isRoot()
+    boolean isRoot()
     {
         return depth == 0;
     }
 
-    private void pop()
+    void pop()
     {
         depth--;
     }
@@ -165,7 +170,7 @@ class Path
         return indexes[depth];
     }
 
-    private void push(Object[] node, int index)
+    void push(Object[] node, int index)
     {
         path[++depth] = node;
         indexes[depth] = (byte) index;
@@ -176,6 +181,21 @@ class Path
         indexes[depth] = (byte) index;
     }
 
+    byte findSuccessorParentDepth()
+    {
+        byte depth = this.depth;
+        depth--;
+        while (depth >= 0)
+        {
+            int ub = indexes[depth] + 1;
+            Object[] node = path[depth];
+            if (ub < getBranchKeyEnd(node))
+                return depth;
+            depth--;
+        }
+        return -1;
+    }
+
     // move to the next key in the tree
     void successor()
     {

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/test/long/org/apache/cassandra/utils/LongBTreeTest.java
----------------------------------------------------------------------
diff --git a/test/long/org/apache/cassandra/utils/LongBTreeTest.java b/test/long/org/apache/cassandra/utils/LongBTreeTest.java
index 514d166..498a9c9 100644
--- a/test/long/org/apache/cassandra/utils/LongBTreeTest.java
+++ b/test/long/org/apache/cassandra/utils/LongBTreeTest.java
@@ -20,11 +20,21 @@ package org.apache.cassandra.utils;
 
 import java.util.*;
 import java.util.concurrent.Callable;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
 
+import javax.annotation.Nullable;
+
+import com.google.common.base.Function;
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.ListenableFutureTask;
@@ -37,7 +47,9 @@ import com.yammer.metrics.core.TimerContext;
 import com.yammer.metrics.stats.Snapshot;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSearchIterator;
 import org.apache.cassandra.utils.btree.BTreeSet;
+import org.apache.cassandra.utils.btree.UpdateFunction;
 
 // TODO : should probably lower fan-factor for tests to make them more intensive
 public class LongBTreeTest
@@ -103,6 +115,51 @@ public class LongBTreeTest
         testInsertions(10000, 50, 10, 10, false);
     }
 
+    @Test
+    public void testSearchIterator() throws InterruptedException
+    {
+        int threads = Runtime.getRuntime().availableProcessors();
+        final CountDownLatch latch = new CountDownLatch(threads);
+        final AtomicLong errors = new AtomicLong();
+        final AtomicLong count = new AtomicLong();
+        final int perThreadTrees = 100;
+        final int perTreeSelections = 100;
+        final long totalCount = threads * perThreadTrees * perTreeSelections;
+        for (int t = 0 ; t < threads ; t++)
+        {
+            MODIFY.execute(new Runnable()
+            {
+                public void run()
+                {
+                    ThreadLocalRandom random = ThreadLocalRandom.current();
+                    for (int i = 0 ; i < perThreadTrees ; i++)
+                    {
+                        Object[] tree = randomTree(10000, random);
+                        for (int j = 0 ; j < perTreeSelections ; j++)
+                        {
+                            BTreeSearchIterator<Integer, Integer, Integer> searchIterator = new BTreeSearchIterator<>(tree, ICMP);
+                            for (Integer key : randomSelection(tree, random))
+                                if (key != searchIterator.next(key))
+                                    errors.incrementAndGet();
+                            for (Integer key : randomMix(tree, random))
+                                if (key != searchIterator.next(key))
+                                    if (BTree.find(tree, ICMP, key) == key)
+                                        errors.incrementAndGet();
+                            count.incrementAndGet();
+                        }
+                    }
+                    latch.countDown();
+                }
+            });
+        }
+        while (latch.getCount() > 0)
+        {
+            latch.await(10L, TimeUnit.SECONDS);
+            System.out.println(String.format("%.0f%% complete %s", 100 * count.get() / (double) totalCount, errors.get() > 0 ? ("Errors: " + errors.get()) : ""));
+            assert errors.get() == 0;
+        }
+    }
+
     private static void testInsertions(int totalCount, int perTestCount, int testKeyRatio, int modificationBatchSize, boolean quickEquality) throws ExecutionException, InterruptedException
     {
         int batchesPerTest = perTestCount / modificationBatchSize;
@@ -354,4 +411,43 @@ public class LongBTreeTest
         };
     }
 
+    private static Object[] randomTree(int maxSize, Random random)
+    {
+        TreeSet<Integer> build = new TreeSet<>();
+        int size = random.nextInt(maxSize);
+        for (int i = 0 ; i < size ; i++)
+        {
+            build.add(random.nextInt());
+        }
+        return BTree.build(build, ICMP, true, UpdateFunction.NoOp.<Integer>instance());
+    }
+
+    private static Iterable<Integer> randomSelection(Object[] iter, final Random rnd)
+    {
+        final float proportion = rnd.nextFloat();
+        return Iterables.filter(new BTreeSet<>(iter, ICMP), new Predicate<Integer>()
+        {
+            public boolean apply(@Nullable Integer integer)
+            {
+                return rnd.nextFloat() < proportion;
+            }
+        });
+    }
+
+    private static Iterable<Integer> randomMix(Object[] iter, final Random rnd)
+    {
+        final float proportion = rnd.nextFloat();
+        return Iterables.transform(new BTreeSet<>(iter, ICMP), new Function<Integer, Integer>()
+        {
+            int last = Integer.MIN_VALUE;
+
+            public Integer apply(Integer v)
+            {
+                if (rnd.nextFloat() < proportion)
+                    return last = v;
+                return last = (v - last) / 2;
+            }
+        });
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/cassandra/blob/6e9140ab/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java b/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
index a1c98f3..33d3599 100644
--- a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
+++ b/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
@@ -32,6 +32,7 @@ import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.utils.SearchIterator;
 
 public class ArrayBackedSortedColumnsTest extends SchemaLoader
 {
@@ -213,6 +214,43 @@ public class ArrayBackedSortedColumnsTest extends SchemaLoader
         assertSame(map.iterator(), map.iterator(ColumnSlice.ALL_COLUMNS_ARRAY));
     }
 
+    @Test
+    public void testSearchIterator()
+    {
+        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
+        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), false);
+
+        int[] values = new int[]{ 1, 2, 3, 5, 9, 15, 21, 22 };
+
+        for (int i = 0; i < values.length; ++i)
+            map.addColumn(new Cell(type.makeCellName(values[i])));
+
+        SearchIterator<CellName, Cell> iter = map.searchIterator();
+        for (int i = 0 ; i < values.length ; i++)
+            assertSame(values[i], iter.next(type.makeCellName(values[i])));
+
+        iter = map.searchIterator();
+        for (int i = 0 ; i < values.length ; i+=2)
+            assertSame(values[i], iter.next(type.makeCellName(values[i])));
+
+        iter = map.searchIterator();
+        for (int i = 0 ; i < values.length ; i+=4)
+            assertSame(values[i], iter.next(type.makeCellName(values[i])));
+
+        iter = map.searchIterator();
+        for (int i = 0 ; i < values.length ; i+=1)
+        {
+            if (i % 2 == 0)
+            {
+                Cell cell = iter.next(type.makeCellName(values[i] - 1));
+                if (i > 0 && values[i - 1] == values[i] - 1)
+                    assertSame(values[i - 1], cell);
+                else
+                    assertNull(cell);
+            }
+        }
+    }
+
     private <T> void assertSame(Iterable<T> c1, Iterable<T> c2)
     {
         assertSame(c1.iterator(), c2.iterator());
@@ -226,6 +264,11 @@ public class ArrayBackedSortedColumnsTest extends SchemaLoader
             fail("The collection don't have the same size");
     }
 
+    private void assertSame(int name, Cell cell)
+    {
+        int value = ByteBufferUtil.toInt(cell.name().toByteBuffer());
+        assert name == value : "Expected " + name + " but got " + value;
+    }
     private void assertSame(int[] names, Iterator<Cell> iter)
     {
         for (int name : names)