You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by sl...@apache.org on 2015/06/29 09:32:04 UTC

[3/4] cassandra git commit: Ensure unique timestamp locally for paxos (2.1+ version)

Ensure unique timestamp locally for paxos (2.1+ version)

patch by slebresne; reviewed by stefania for CASSANDRA-9649


Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/fe65707f
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/fe65707f
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/fe65707f

Branch: refs/heads/cassandra-2.2
Commit: fe65707f09dd2c17cc6339407c689b42ac487256
Parents: 3651db2
Author: Sylvain Lebresne <sy...@datastax.com>
Authored: Thu Jun 25 12:50:39 2015 +0200
Committer: Sylvain Lebresne <sy...@datastax.com>
Committed: Mon Jun 29 09:27:26 2015 +0200

----------------------------------------------------------------------
 CHANGES.txt                                      |  1 +
 .../cql3/statements/ModificationStatement.java   |  4 +---
 .../apache/cassandra/service/ClientState.java    | 19 ++++++++++---------
 .../apache/cassandra/service/StorageProxy.java   | 19 +++++++------------
 src/java/org/apache/cassandra/utils/UUIDGen.java |  7 +++++++
 5 files changed, 26 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra/blob/fe65707f/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 3e4fd36..7aee45b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 2.1.8
+ * Avoids ballot clash in Paxos (CASSANDRA-9649)
  * Fix IndexOutOfBoundsException when inserting tuple with too many
    elements using the string literal notation (CASSANDRA-9559)
  * Allow JMX over SSL directly from nodetool (CASSANDRA-9090)

http://git-wip-us.apache.org/repos/asf/cassandra/blob/fe65707f/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 6b0901a..3838909 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -655,9 +655,7 @@ public abstract class ModificationStatement implements CQLStatement
     static ColumnFamily casInternal(CQL3CasRequest request, QueryState state)
     throws InvalidRequestException
     {
-        long millis = state.getTimestamp() / 1000;
-        long nanos = ((state.getTimestamp() - (millis * 1000)) + 1) * 10;
-        UUID ballot = UUIDGen.getTimeUUID(millis, nanos);
+        UUID ballot = UUIDGen.getTimeUUIDFromMicros(state.getTimestamp());
         CFMetaData metadata = Schema.instance.getCFMetaData(request.cfm.ksName, request.cfm.cfName);
 
         ReadCommand readCommand = ReadCommand.create(request.cfm.ksName, request.key, request.cfm.cfName, request.now, request.readFilter());

http://git-wip-us.apache.org/repos/asf/cassandra/blob/fe65707f/src/java/org/apache/cassandra/service/ClientState.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java
index 5ea3d0c..23eec73 100644
--- a/src/java/org/apache/cassandra/service/ClientState.java
+++ b/src/java/org/apache/cassandra/service/ClientState.java
@@ -98,8 +98,9 @@ public class ClientState
     // The remote address of the client - null for internal clients.
     private final SocketAddress remoteAddress;
 
-    // The biggest timestamp that was returned by getTimestamp/assigned to a query
-    private final AtomicLong lastTimestampMicros = new AtomicLong(0);
+    // The biggest timestamp that was returned by getTimestamp/assigned to a query. This is global to the VM
+    // for the sake of paxos (see #9649).
+    private static final AtomicLong lastTimestampMicros = new AtomicLong(0);
 
     /**
      * Construct a new, empty ClientState for internal calls.
@@ -151,18 +152,18 @@ public class ClientState
     }
 
     /**
-     * Can be use when a timestamp has been assigned by a query, but that timestamp is
-     * not directly one returned by getTimestamp() (see SP.beginAndRepairPaxos()).
-     * This ensure following calls to getTimestamp() will return a timestamp strictly
-     * greated than the one provided to this method.
+     * This is the same as {@link #getTimestamp()}, but it guarantees that the returned timestamp
+     * will not be smaller than the provided {@code minTimestampToUse}.
      */
-    public void updateLastTimestamp(long tstampMicros)
+    public long getTimestamp(long minTimestampToUse)
     {
         while (true)
         {
+            long current = Math.max(System.currentTimeMillis() * 1000, minTimestampToUse);
             long last = lastTimestampMicros.get();
-            if (tstampMicros <= last || lastTimestampMicros.compareAndSet(last, tstampMicros))
-                return;
+            long tstamp = last >= current ? last + 1 : current;
+            if (lastTimestampMicros.compareAndSet(last, tstamp))
+                return tstamp;
         }
     }
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/fe65707f/src/java/org/apache/cassandra/service/StorageProxy.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index b76c231..0045006 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -356,14 +356,13 @@ public class StorageProxy implements StorageProxyMBean
         int contentions = 0;
         while (System.nanoTime() - start < timeout)
         {
-            // We don't want to use a timestamp that is older than the last one assigned by the ClientState or operations
-            // may appear out-of-order (#7801). But note that state.getTimestamp() is in microseconds while the ballot
-            // timestamp is only in milliseconds
-            long currentTime = (state.getTimestamp() / 1000) + 1;
-            long ballotMillis = summary == null
-                              ? currentTime
-                              : Math.max(currentTime, 1 + UUIDGen.unixTimestamp(summary.mostRecentInProgressCommit.ballot));
-            UUID ballot = UUIDGen.getTimeUUID(ballotMillis);
+            // We want a timestamp that is guaranteed to be unique for that node (so that the ballot is globally unique), but if we've got a prepare rejected
+            // already we also want to make sure we pick a timestamp that has a chance to be promised, i.e. one that is greater than the most recently known
+            // in-progress commit (#5667). Lastly, we don't want to use a timestamp that is older than the last one assigned by ClientState or operations may appear
+            // out-of-order (#7801).
+            long minTimestampMicrosToUse = summary == null ? Long.MIN_VALUE : 1 + UUIDGen.microsTimestamp(summary.mostRecentInProgressCommit.ballot);
+            long ballotMicros = state.getTimestamp(minTimestampMicrosToUse);
+            UUID ballot = UUIDGen.getTimeUUIDFromMicros(ballotMicros);
 
             // prepare
             Tracing.trace("Preparing {}", ballot);
@@ -429,10 +428,6 @@ public class StorageProxy implements StorageProxyMBean
                 continue;
             }
 
-            // We might commit this ballot and we want to ensure operations starting after this CAS succeed will be assigned
-            // a timestamp greater that the one of this ballot, so operation order is preserved (#7801)
-            state.updateLastTimestamp(ballotMillis * 1000);
-
             return Pair.create(ballot, contentions);
         }
 

http://git-wip-us.apache.org/repos/asf/cassandra/blob/fe65707f/src/java/org/apache/cassandra/utils/UUIDGen.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/UUIDGen.java b/src/java/org/apache/cassandra/utils/UUIDGen.java
index 54347ff..706c8a6 100644
--- a/src/java/org/apache/cassandra/utils/UUIDGen.java
+++ b/src/java/org/apache/cassandra/utils/UUIDGen.java
@@ -82,6 +82,13 @@ public class UUIDGen
         return new UUID(createTime(fromUnixTimestamp(when)), clockSeqAndNode);
     }
 
+    public static UUID getTimeUUIDFromMicros(long whenInMicros)
+    {
+        long whenInMillis = whenInMicros / 1000;
+        long nanos = (whenInMicros - (whenInMillis * 1000)) * 10;
+        return getTimeUUID(whenInMillis, nanos);
+    }
+
     public static UUID getTimeUUID(long when, long nanos)
     {
         return new UUID(createTime(fromUnixTimestamp(when, nanos)), clockSeqAndNode);