You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by sl...@apache.org on 2016/06/23 07:58:28 UTC
[04/10] cassandra git commit: Avoid clock skew corrupting other nodes
through paxos
Avoid clock skew corrupting other nodes through paxos
patch by slebresne; reviewed by jasobrown for CASSANDRA-11991
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/720870bd
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/720870bd
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/720870bd
Branch: refs/heads/trunk
Commit: 720870bd7f152aea81ca16a5a81e00ef701221cd
Parents: 85f2bbf
Author: Sylvain Lebresne <sy...@datastax.com>
Authored: Thu Jun 16 11:59:18 2016 +0200
Committer: Sylvain Lebresne <sy...@datastax.com>
Committed: Thu Jun 23 09:52:52 2016 +0200
----------------------------------------------------------------------
CHANGES.txt | 1 +
.../apache/cassandra/service/ClientState.java | 56 +++++++++++++++++---
.../apache/cassandra/service/StorageProxy.java | 6 ++-
.../org/apache/cassandra/utils/UUIDGen.java | 33 ++++++++++++
4 files changed, 88 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/720870bd/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 7944967..7474045 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
2.1.15
+ * Fix clock skew corrupting other nodes with paxos (CASSANDRA-11991)
* Remove distinction between non-existing static columns and existing but null in LWTs (CASSANDRA-9842)
* Support mlockall on IBM POWER arch (CASSANDRA-11576)
* Cache local ranges when calculating repair neighbors (CASSANDRA-11933)
http://git-wip-us.apache.org/repos/asf/cassandra/blob/720870bd/src/java/org/apache/cassandra/service/ClientState.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java
index 23eec73..f2e3f1c 100644
--- a/src/java/org/apache/cassandra/service/ClientState.java
+++ b/src/java/org/apache/cassandra/service/ClientState.java
@@ -98,8 +98,10 @@ public class ClientState
// The remote address of the client - null for internal clients.
private final SocketAddress remoteAddress;
- // The biggest timestamp that was returned by getTimestamp/assigned to a query. This is global to the VM
- // for the sake of paxos (see #9649).
+ // The biggest timestamp that was returned by getTimestamp/assigned to a query. This is global to ensure that the
+ // timestamp assigned are strictly monotonic on a node, which is likely what user expect intuitively (more likely,
+ // most new user will intuitively expect timestamp to be strictly monotonic cluster-wise, but while that last part
+ // is unrealistic expectation, doing it node-wise is easy).
private static final AtomicLong lastTimestampMicros = new AtomicLong(0);
/**
@@ -152,17 +154,59 @@ public class ClientState
}
/**
- * This is the same than {@link #getTimestamp()} but this guarantees that the returned timestamp
- * will not be smaller than the provided {@code minTimestampToUse}.
+ * Returns a timestamp suitable for paxos given the timestamp of the last known commit (or in progress update).
+ * <p>
+ * Paxos ensures that the timestamp it uses for commits respects the serial order of those commits. It does so
+ * by having each replica reject any proposal whose timestamp is not strictly greater than the last proposal it
+ * accepted. So in practice, which timestamp we use for a given proposal doesn't affect correctness but it does
+ * affect the chance of making progress (if we pick a timestamp lower than what has been proposed before, our
+ * new proposal will just get rejected).
+ * <p>
+ * As during the prepared phase replica send us the last propose they accepted, a first option would be to take
+ * the maximum of those last accepted proposal timestamp plus 1 (and use a default value, say 0, if it's the
+ * first known proposal for the partition). This would most work (giving commits the timestamp 0, 1, 2, ...
+ * in the order they are commited) up to 2 important caveats:
+ * 1) it would give a very poor experience when Paxos and non-Paxos updates are mixed in the same partition,
+ * since paxos operations wouldn't be using microseconds timestamps. And while you shouldn't theoretically
+ * mix the 2 kind of operations, this would still be pretty unintuitive. And what if you started writing
+ * normal updates and realize later you should switch to Paxos to enforce a property you want?
+ * 2) this wouldn't actually be safe due to the expiration set on the Paxos state table.
+ * <p>
+ * So instead, we initially chose to use the current time in microseconds as for normal update. Which works in
+ * general but mean that clock skew creates unavailability periods for Paxos updates (either a node has his clock
+ * in the past and he may no be able to get commit accepted until its clock catch up, or a node has his clock in
+ * the future and then once one of its commit his accepted, other nodes ones won't be until they catch up). This
+ * is ok for small clock skew (few ms) but can be pretty bad for large one.
+ * <p>
+ * Hence our current solution: we mix both approaches. That is, we compare the timestamp of the last known
+ * accepted proposal and the local time. If the local time is greater, we use it, thus keeping paxos timestamps
+ * locked to the current time in general (making mixing Paxos and non-Paxos more friendly, and behaving correctly
+ * when the paxos state expire (as long as your maximum clock skew is lower than the Paxos state expiration
+ * time)). Otherwise (the local time is lower than the last proposal, meaning that this last proposal was done
+ * with a clock in the future compared to the local one), we use the last proposal timestamp plus 1, ensuring
+ * progress.
+ *
+ * @param minTimestampToUse the max timestamp of the last proposal accepted by replica having responded
+ * to the prepare phase of the paxos round this is for. In practice, that's the minimum timestamp this method
+ * may return.
+ * @return a timestamp suitable for a Paxos proposal (using the reasoning described above). Note that
+ * contrarily to the {@link #getTimestamp()} method, the return value is not guaranteed to be unique (nor
+ * monotonic) across calls since it can return it's argument (so if the same argument is passed multiple times,
+ * it may be returned multiple times). Note that we still ensure Paxos "ballot" are unique (for different
+ * proposal) by (securely) randomizing the non-timestamp part of the UUID.
*/
- public long getTimestamp(long minTimestampToUse)
+ public long getTimestampForPaxos(long minTimestampToUse)
{
while (true)
{
long current = Math.max(System.currentTimeMillis() * 1000, minTimestampToUse);
long last = lastTimestampMicros.get();
long tstamp = last >= current ? last + 1 : current;
- if (lastTimestampMicros.compareAndSet(last, tstamp))
+ // Note that if we ended up picking minTimestampMicrosToUse (it was "in the future"), we don't
+ // want to change the local clock, otherwise a single node in the future could corrupt the clock
+ // of all nodes and for all inserts (since non-paxos inserts also use lastTimestampMicros).
+ // See CASSANDRA-11991
+ if (tstamp == minTimestampToUse || lastTimestampMicros.compareAndSet(last, tstamp))
return tstamp;
}
}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/720870bd/src/java/org/apache/cassandra/service/StorageProxy.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 845a732..af0693b 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -364,8 +364,10 @@ public class StorageProxy implements StorageProxyMBean
// in progress (#5667). Lastly, we don't want to use a timestamp that is older than the last one assigned by ClientState or operations may appear
// out-of-order (#7801).
long minTimestampMicrosToUse = summary == null ? Long.MIN_VALUE : 1 + UUIDGen.microsTimestamp(summary.mostRecentInProgressCommit.ballot);
- long ballotMicros = state.getTimestamp(minTimestampMicrosToUse);
- UUID ballot = UUIDGen.getTimeUUIDFromMicros(ballotMicros);
+ long ballotMicros = state.getTimestampForPaxos(minTimestampMicrosToUse);
+ // Note that ballotMicros is not guaranteed to be unique if two proposal are being handled concurrently by the same coordinator. But we still
+ // need ballots to be unique for each proposal so we have to use getRandomTimeUUIDFromMicros.
+ UUID ballot = UUIDGen.getRandomTimeUUIDFromMicros(ballotMicros);
// prepare
Tracing.trace("Preparing {}", ballot);
http://git-wip-us.apache.org/repos/asf/cassandra/blob/720870bd/src/java/org/apache/cassandra/utils/UUIDGen.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/UUIDGen.java b/src/java/org/apache/cassandra/utils/UUIDGen.java
index 706c8a6..2c5b605 100644
--- a/src/java/org/apache/cassandra/utils/UUIDGen.java
+++ b/src/java/org/apache/cassandra/utils/UUIDGen.java
@@ -21,6 +21,7 @@ import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.security.SecureRandom;
import java.util.Collection;
import java.util.Random;
import java.util.UUID;
@@ -51,6 +52,8 @@ public class UUIDGen
private static final long MIN_CLOCK_SEQ_AND_NODE = 0x8080808080808080L;
private static final long MAX_CLOCK_SEQ_AND_NODE = 0x7f7f7f7f7f7f7f7fL;
+ private static final SecureRandom secureRandom = new SecureRandom();
+
// placement of this singleton is important. It needs to be instantiated *AFTER* the other statics.
private static final UUIDGen instance = new UUIDGen();
@@ -82,6 +85,17 @@ public class UUIDGen
return new UUID(createTime(fromUnixTimestamp(when)), clockSeqAndNode);
}
+ /**
+ * Returns a version 1 UUID using the provided timestamp and the local clock and sequence.
+ * <p>
+ * Note that this method is generally only safe to use if you can guarantee that the provided
+ * parameter is unique across calls (otherwise the returned UUID won't be unique accross calls).
+ *
+ * @param whenInMicros a unix time in microseconds.
+ * @return a new UUID {@code id} such that {@code microsTimestamp(id) == whenInMicros}. Please not that
+ * multiple calls to this method with the same value of {@code whenInMicros} will return the <b>same</b>
+ * UUID.
+ */
public static UUID getTimeUUIDFromMicros(long whenInMicros)
{
long whenInMillis = whenInMicros / 1000;
@@ -89,6 +103,25 @@ public class UUIDGen
return getTimeUUID(whenInMillis, nanos);
}
+ /**
+ * Similar to {@link getTimeUUIDFromMicros}, but randomize (using SecureRandom) the clock and sequence.
+ * <p>
+ * If you can guarantee that the {@code whenInMicros} argument is unique (for this JVM instance) for
+ * every call, then you should prefer {@link getTimeUUIDFromMicros} which is faster. If you can't
+ * guarantee this however, this method will ensure the returned UUID are still unique (accross calls)
+ * through randomization.
+ *
+ * @param whenInMicros a unix time in microseconds.
+ * @return a new UUID {@code id} such that {@code microsTimestamp(id) == whenInMicros}. The UUID returned
+ * by different calls will be unique even if {@code whenInMicros} is not.
+ */
+ public static UUID getRandomTimeUUIDFromMicros(long whenInMicros)
+ {
+ long whenInMillis = whenInMicros / 1000;
+ long nanos = (whenInMicros - (whenInMillis * 1000)) * 10;
+ return new UUID(createTime(fromUnixTimestamp(whenInMillis, nanos)), secureRandom.nextLong());
+ }
+
public static UUID getTimeUUID(long when, long nanos)
{
return new UUID(createTime(fromUnixTimestamp(when, nanos)), clockSeqAndNode);