You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by ty...@apache.org on 2015/01/09 18:15:37 UTC

cassandra git commit: Fix DISTINCT queries w/ limits/paging and tombstoned partitions

Repository: cassandra
Updated Branches:
  refs/heads/cassandra-2.0 ed54e8085 -> dd62f7bf7


Fix DISTINCT queries w/ limits/paging and tombstoned partitions

Patch by Tyler Hobbs; reviewed by Sylvain Lebresne for CASSANDRA-8490


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/dd62f7bf
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/dd62f7bf
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/dd62f7bf

Branch: refs/heads/cassandra-2.0
Commit: dd62f7bf7977dd40eedb1c81ab7900b778f84540
Parents: ed54e80
Author: Tyler Hobbs <ty...@datastax.com>
Authored: Fri Jan 9 11:14:54 2015 -0600
Committer: Tyler Hobbs <ty...@datastax.com>
Committed: Fri Jan 9 11:14:54 2015 -0600

----------------------------------------------------------------------
 CHANGES.txt                                            |  2 ++
 .../cassandra/cql3/statements/SelectStatement.java     |  6 +++++-
 .../org/apache/cassandra/db/AbstractRangeCommand.java  | 13 +++++++++++++
 .../org/apache/cassandra/db/ColumnFamilyStore.java     |  4 +++-
 src/java/org/apache/cassandra/db/DataRange.java        | 12 ++++++++++++
 .../org/apache/cassandra/db/filter/ExtendedFilter.java |  6 ++++++
 .../apache/cassandra/db/filter/SliceQueryFilter.java   |  6 ++++++
 .../org/apache/cassandra/service/StorageProxy.java     | 13 +++++++------
 8 files changed, 54 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index adb374a..0c7e9a2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,6 @@
 2.0.12:
+ * Fix DISTINCT queries with LIMITs or paging when some partitions
+   contain only tombstones (CASSANDRA-8490)
  * Introduce background cache refreshing to permissions cache
    (CASSANDRA-8194)
  * Fix race condition in StreamTransferTask that could lead to

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index f08f6b8..19615b6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -450,7 +450,11 @@ public class SelectStatement implements CQLStatement, MeasurableForPreparedCache
             // For distinct, we only care about fetching the beginning of each partition. If we don't have
             // static columns, we in fact only care about the first cell, so we query only that (we don't "group").
             // If we do have static columns, we do need to fetch the first full group (to have the static columns values).
-            return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 1, selectsStaticColumns ? toGroup : -1);
+
+            // See the comments on IGNORE_TOMBSTONED_PARTITIONS and CASSANDRA-8490 for why we use a special value for
+            // DISTINCT queries on the partition key only.
+            toGroup = selectsStaticColumns ? toGroup : SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
+            return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 1, toGroup);
         }
         else if (isColumnRange())
         {

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java b/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
index 45302e2..4ddcb8d 100644
--- a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
+++ b/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
@@ -57,6 +57,19 @@ public abstract class AbstractRangeCommand implements IReadCommand
 
     public abstract int limit();
     public abstract boolean countCQL3Rows();
+
+    /**
+     * Returns true if tombstoned partitions should not be included in results or count towards the limit.
+     * See CASSANDRA-8490 for more details on why this is needed (and done this way).
+     * */
+    public boolean ignoredTombstonedPartitions()
+    {
+        if (!(predicate instanceof SliceQueryFilter))
+            return false;
+
+        return ((SliceQueryFilter) predicate).compositesToGroup == SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
+    }
+
     public abstract List<Row> executeLocally();
 
     public long getTimeout()

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 7bd2a59..e936473 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -1749,6 +1749,7 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
         List<Row> rows = new ArrayList<Row>();
         int columnsCount = 0;
         int total = 0, matched = 0;
+        boolean ignoreTombstonedPartitions = filter.ignoreTombstonedPartitions();
 
         try
         {
@@ -1784,7 +1785,8 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
                 }
 
                 rows.add(new Row(rawRow.key, data));
-                matched++;
+                if (!ignoreTombstonedPartitions || !data.hasOnlyTombstones(filter.timestamp))
+                    matched++;
 
                 if (data != null)
                     columnsCount += filter.lastCounted(data);

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/db/DataRange.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/DataRange.java b/src/java/org/apache/cassandra/db/DataRange.java
index b8b8daf..774a3aa 100644
--- a/src/java/org/apache/cassandra/db/DataRange.java
+++ b/src/java/org/apache/cassandra/db/DataRange.java
@@ -87,6 +87,18 @@ public class DataRange
         return keyRange.right;
     }
 
+    /**
+     * Returns true if tombstoned partitions should not be included in results or count towards the limit.
+     * See CASSANDRA-8490 for more details on why this is needed (and done this way).
+     * */
+    public boolean ignoredTombstonedPartitions()
+    {
+        if (!(columnFilter instanceof SliceQueryFilter))
+            return false;
+
+        return ((SliceQueryFilter) columnFilter).compositesToGroup == SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
+    }
+
     // Whether the bounds of this DataRange actually wraps around.
     public boolean isWrapAround()
     {

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java b/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
index 5c3662b..82e889d 100644
--- a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
@@ -127,6 +127,12 @@ public abstract class ExtendedFilter
      */
     public abstract ColumnFamily prune(DecoratedKey key, ColumnFamily data);
 
+    /** Returns true if tombstoned partitions should not be included in results or count towards the limit, false otherwise. */
+    public boolean ignoreTombstonedPartitions()
+    {
+        return dataRange.ignoredTombstonedPartitions();
+    }
+
     /**
      * @return true if the provided data satisfies all the expressions from
      * the clause of this filter.

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java b/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
index 58a0303..858578f 100644
--- a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
@@ -42,6 +42,12 @@ public class SliceQueryFilter implements IDiskAtomFilter
     private static final Logger logger = LoggerFactory.getLogger(SliceQueryFilter.class);
     public static final Serializer serializer = new Serializer();
 
+    /**
+     * A special value for compositesToGroup that indicates that partitioned tombstones should not be included in results
+     * or count towards the limit.  See CASSANDRA-8490 for more details on why this is needed (and done this way).
+     **/
+    public static final int IGNORE_TOMBSTONED_PARTITIONS = -2;
+
     public final ColumnSlice[] slices;
     public final boolean reversed;
     public volatile int count;

http://git-wip-us.apache.org/repos/asf/cassandra/blob/dd62f7bf/src/java/org/apache/cassandra/service/StorageProxy.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 1e1a2a3..45af1c8 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -1499,7 +1499,8 @@ public class StorageProxy implements StorageProxyMBean
         // now scan until we have enough results
         try
         {
-            int cql3RowCount = 0;
+            int liveRowCount = 0;
+            boolean countLiveRows = command.countCQL3Rows() || command.ignoredTombstonedPartitions();
             rows = new ArrayList<>();
 
             // when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
@@ -1594,8 +1595,8 @@ public class StorageProxy implements StorageProxyMBean
                     for (Row row : handler.get())
                     {
                         rows.add(row);
-                        if (nodeCmd.countCQL3Rows())
-                            cql3RowCount += row.getLiveCount(command.predicate, command.timestamp);
+                        if (countLiveRows)
+                            liveRowCount += row.getLiveCount(command.predicate, command.timestamp);
                     }
                     FBUtilities.waitOnFutures(resolver.repairResults, DatabaseDescriptor.getWriteRpcTimeout());
                 }
@@ -1636,7 +1637,7 @@ public class StorageProxy implements StorageProxyMBean
                 }
 
                 // if we're done, great, otherwise, move to the next range
-                int count = nodeCmd.countCQL3Rows() ? cql3RowCount : rows.size();
+                int count = countLiveRows ? liveRowCount : rows.size();
                 if (count >= nodeCmd.limit())
                     break;
             }
@@ -1652,8 +1653,8 @@ public class StorageProxy implements StorageProxyMBean
 
     private static List<Row> trim(AbstractRangeCommand command, List<Row> rows)
     {
-        // When maxIsColumns, we let the caller trim the result.
-        if (command.countCQL3Rows())
+        // for CQL3 queries, let the caller trim the results
+        if (command.countCQL3Rows() || command.ignoredTombstonedPartitions())
             return rows;
         else
             return rows.size() > command.limit() ? rows.subList(0, command.limit()) : rows;