You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by yu...@apache.org on 2013/08/07 19:55:26 UTC
[1/3] git commit: Log Merkle tree precision stats
Updated Branches:
refs/heads/cassandra-2.0 3205c10de -> 18f0234bf
refs/heads/trunk 426caf7c0 -> 9cf1068b7
Log Merkle tree precision stats
patch by Benedict; reviewed by yukim for CASSANDRA-2698
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/18f0234b
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/18f0234b
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/18f0234b
Branch: refs/heads/cassandra-2.0
Commit: 18f0234bf908d1f970bb694d0bb58539ab427387
Parents: 3205c10
Author: Benedict <su...@laerad.com>
Authored: Wed Aug 7 12:19:09 2013 -0500
Committer: Yuki Morishita <yu...@apache.org>
Committed: Wed Aug 7 12:19:58 2013 -0500
----------------------------------------------------------------------
CHANGES.txt | 1 +
.../org/apache/cassandra/repair/Validator.java | 61 ++++-
.../cassandra/utils/EstimatedHistogram.java | 65 ++++++
.../cassandra/utils/HistogramBuilder.java | 158 +++++++++++++
.../org/apache/cassandra/utils/MerkleTree.java | 233 ++++++++++++++-----
.../cassandra/repair/DifferencerTest.java | 2 +-
.../cassandra/utils/HistogramBuilderTest.java | 109 +++++++++
.../apache/cassandra/utils/MerkleTreeTest.java | 9 +-
8 files changed, 572 insertions(+), 66 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9509113..4dca1b8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,6 @@
2.0.1
* Notify indexer of columns shadowed by range tombstones (CASSANDRA-5614)
+ * Log Merkle tree stats (CASSANDRA-2698)
2.0.0
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/repair/Validator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/repair/Validator.java b/src/java/org/apache/cassandra/repair/Validator.java
index 4e8e865..e7efcaa 100644
--- a/src/java/org/apache/cassandra/repair/Validator.java
+++ b/src/java/org/apache/cassandra/repair/Validator.java
@@ -61,8 +61,6 @@ public class Validator implements Runnable
private transient MerkleTree.TreeRangeIterator ranges;
private transient DecoratedKey lastKey;
- public final static MerkleTree.RowHash EMPTY_ROW = new MerkleTree.RowHash(null, new byte[0]);
-
/**
* Create Validator with default size of initial Merkle Tree.
*/
@@ -145,7 +143,7 @@ public class Validator implements Runnable
while (!range.contains(row.key.token))
{
// add the empty hash, and move to the next range
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
range = ranges.next();
}
@@ -153,13 +151,52 @@ public class Validator implements Runnable
range.addHash(rowHash(row));
}
+ static class CountingDigest extends MessageDigest
+ {
+ private long count;
+ private MessageDigest underlying;
+
+ public CountingDigest(MessageDigest underlying)
+ {
+ super(underlying.getAlgorithm());
+ this.underlying = underlying;
+ }
+
+ @Override
+ protected void engineUpdate(byte input)
+ {
+ underlying.update(input);
+ count += 1;
+ }
+
+ @Override
+ protected void engineUpdate(byte[] input, int offset, int len)
+ {
+ underlying.update(input, offset, len);
+ count += len;
+ }
+
+ @Override
+ protected byte[] engineDigest()
+ {
+ return underlying.digest();
+ }
+
+ @Override
+ protected void engineReset()
+ {
+ underlying.reset();
+ }
+
+ }
+
private MerkleTree.RowHash rowHash(AbstractCompactedRow row)
{
validated++;
// MerkleTree uses XOR internally, so we want lots of output bits here
- MessageDigest digest = FBUtilities.newMessageDigest("SHA-256");
+ CountingDigest digest = new CountingDigest(FBUtilities.newMessageDigest("SHA-256"));
row.update(digest);
- return new MerkleTree.RowHash(row.key.token, digest.digest());
+ return new MerkleTree.RowHash(row.key.token, digest.digest(), digest.count);
}
/**
@@ -170,7 +207,15 @@ public class Validator implements Runnable
completeTree();
StageManager.getStage(Stage.ANTI_ENTROPY).execute(this);
- logger.debug("Validated " + validated + " rows into AEService tree for " + desc);
+
+ if (logger.isDebugEnabled())
+ {
+ // log distribution of rows in tree
+ logger.debug("Validated " + validated + " rows into AEService tree for " + desc + " with row count distribution:");
+ tree.histogramOfRowCountPerLeaf().log(logger);
+ logger.debug("Validated " + validated + " rows into AEService tree for " + desc + " with row size distribution:");
+ tree.histogramOfRowSizePerLeaf().log(logger);
+ }
}
@VisibleForTesting
@@ -179,11 +224,11 @@ public class Validator implements Runnable
assert ranges != null : "Validator was not prepared()";
if (range != null)
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
while (ranges.hasNext())
{
range = ranges.next();
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
}
}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
index 54407a4..3a30290 100644
--- a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
+++ b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
@@ -27,6 +27,7 @@ import com.google.common.base.Objects;
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.io.ISerializer;
+import org.slf4j.Logger;
public class EstimatedHistogram
{
@@ -226,6 +227,70 @@ public class EstimatedHistogram
return buckets.get(buckets.length() - 1) > 0;
}
+ /**
+ * log.debug() every record in the histogram
+ *
+ * @param log
+ */
+ public void log(Logger log)
+ {
+
+ // only print overflow if there is any
+ int nameCount;
+ if (buckets.get(buckets.length() - 1) == 0)
+ nameCount = buckets.length() - 1;
+ else
+ nameCount = buckets.length();
+ String[] names = new String[nameCount];
+
+ int maxNameLength = 0;
+ for (int i = 0; i < nameCount; i++)
+ {
+ names[i] = nameOfRange(bucketOffsets, i);
+ maxNameLength = Math.max(maxNameLength, names[i].length());
+ }
+
+ // emit log records
+ String formatstr = "%" + maxNameLength + "s: %d";
+ for (int i = 0; i < nameCount; i++)
+ {
+ long count = buckets.get(i);
+ // sort-of-hack to not print empty ranges at the start that are only used to demarcate the
+ // first populated range. for code clarity we don't omit this record from the maxNameLength
+ // calculation, and accept the unnecessary whitespace prefixes that will occasionally occur
+ if (i == 0 && count == 0)
+ continue;
+ log.debug(String.format(formatstr, names[i], count));
+ }
+ }
+
+ private static String nameOfRange(long[] bucketOffsets, int index)
+ {
+ StringBuilder sb = new StringBuilder();
+ appendRange(sb, bucketOffsets, index);
+ return sb.toString();
+ }
+
+ private static void appendRange(StringBuilder sb, long[] bucketOffsets, int index)
+ {
+ sb.append("[");
+ if (index == 0)
+ if (bucketOffsets[0] > 0)
+ // by original definition, this histogram is for values greater than zero only;
+ // if values of 0 or less are required, an entry of lb-1 must be inserted at the start
+ sb.append("1");
+ else
+ sb.append("-Inf");
+ else
+ sb.append(bucketOffsets[index - 1] + 1);
+ sb.append("..");
+ if (index == bucketOffsets.length)
+ sb.append("Inf");
+ else
+ sb.append(bucketOffsets[index]);
+ sb.append("]");
+ }
+
@Override
public boolean equals(Object o)
{
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/HistogramBuilder.java b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
new file mode 100644
index 0000000..c2accb2
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.Arrays;
+
+/**
+ * Simple class for constructing an EsimtatedHistogram from a set of predetermined values
+ */
+public class HistogramBuilder
+{
+
+ public HistogramBuilder() {}
+ public HistogramBuilder(long[] values)
+ {
+ for (long value : values)
+ {
+ add(value);
+ }
+ }
+
+ private long[] values = new long[10];
+ int count = 0;
+
+ public void add(long value)
+ {
+ if (count == values.length)
+ values = Arrays.copyOf(values, values.length << 1);
+ values[count++] = value;
+ }
+
+ /**
+ * See {@link #buildWithStdevRangesAroundMean(int)}
+ * @return buildWithStdevRangesAroundMean(3)
+ */
+ public EstimatedHistogram buildWithStdevRangesAroundMean()
+ {
+ return buildWithStdevRangesAroundMean(3);
+ }
+
+ /**
+ * Calculate the min, mean, max and standard deviation of the items in the builder, and
+ * generate an EstimatedHistogram with upto <code>maxdev</code> stdev size ranges either
+ * side of the mean, until min/max are hit; if either min/max are not reached a further range is
+ * inserted at the relevant ends. e.g., with a <code>maxdevs</code> of 3, there may be <i>up to</i> 8 ranges
+ * (between 9 boundaries, the middle being the mean); the middle 6 will have the same size (stdev)
+ * with the outermost two stretching out to min and max.
+ *
+ * @param maxdevs
+ * @return
+ */
+ public EstimatedHistogram buildWithStdevRangesAroundMean(int maxdevs)
+ {
+ if (maxdevs < 0)
+ throw new IllegalArgumentException("maxdevs must be greater than or equal to zero");
+
+ final int count = this.count;
+ final long[] values = this.values;
+
+ if (count == 0)
+ return new EstimatedHistogram(new long[] { }, new long[] { 0 });
+
+ long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+ double sum = 0, sumsq = 0;
+ for (int i = 0 ; i < count ; i++)
+ {
+ final long value = values[i];
+ sum += value;
+ sumsq += value * value;
+ if (value < min)
+ min = value;
+ if (value > max)
+ max = value;
+ }
+
+ final long mean = (long)Math.round(sum / count);
+ final double stdev =
+ Math.sqrt((sumsq / count) - (mean * (double) mean));
+
+ // build the ranges either side of the mean
+ final long[] lowhalf = buildRange(mean, min, true, stdev, maxdevs);
+ final long[] highhalf = buildRange(mean, max, false, stdev, maxdevs);
+
+ // combine the ranges
+ final long[] ranges = new long[lowhalf.length + highhalf.length + 1];
+ System.arraycopy(lowhalf, 0, ranges, 0, lowhalf.length);
+ ranges[lowhalf.length] = mean;
+ System.arraycopy(highhalf, 0, ranges, lowhalf.length + 1, highhalf.length);
+
+ final EstimatedHistogram hist = new EstimatedHistogram(ranges, new long[ranges.length + 1]);
+ for (int i = 0 ; i < count ; i++)
+ hist.add(values[i]);
+ return hist;
+ }
+
+ private static long[] buildRange(long mean, long minormax, boolean ismin, double stdev, int maxdevs)
+ {
+ if (minormax == mean)
+ // minormax == mean we have no range to produce, but given the exclusive starts
+ // that begin at zero by default (or -Inf) in EstimatedHistogram we have to generate a min range
+ // to indicate where we start from
+ return ismin ? new long[] { mean - 1 } : new long[0];
+
+ if (stdev < 1)
+ {
+ // deal with stdevs too small to generate sensible ranges
+ return ismin ? new long[] { minormax - 1, mean - 1 } :
+ new long[] { minormax };
+ }
+
+ long larger, smaller;
+ if (ismin) { larger = mean; smaller = minormax; }
+ else { larger = minormax; smaller = mean; }
+
+ double stdevsTo = (larger - smaller) / stdev;
+ if (stdevsTo > 0 && stdevsTo < 1)
+ // always round up if there's just one non-empty range
+ stdevsTo = 1;
+ else
+ // otherwise round to the nearest half stdev, to avoid tiny ranges at the start/end
+ stdevsTo = Math.round(stdevsTo);
+
+ // limit to 4 stdev ranges - last range will contain everything to boundary
+ final int len = Math.min(maxdevs + 1, (int) stdevsTo);
+ final long[] range = new long[len];
+ long next = ismin ? minormax - 1 : minormax;
+ for (int i = 0 ; i < range.length ; i++)
+ {
+ long delta = (range.length - (i + 1)) * (long) stdev;
+ if (ismin)
+ {
+ range[i] = next;
+ next = mean - delta;
+ }
+ else
+ {
+ range[len - 1 - i] = next;
+ next = mean + delta;
+ }
+ }
+ return range;
+ }
+}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/MerkleTree.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/MerkleTree.java b/src/java/org/apache/cassandra/utils/MerkleTree.java
index a16c2b9..ce71ec4 100644
--- a/src/java/org/apache/cassandra/utils/MerkleTree.java
+++ b/src/java/org/apache/cassandra/utils/MerkleTree.java
@@ -64,6 +64,7 @@ public class MerkleTree implements Serializable
public static final int CONSISTENT = 0;
public static final int FULLY_INCONSISTENT = 1;
public static final int PARTIALLY_INCONSISTENT = 2;
+ private static final byte[] EMPTY_HASH = new byte[0];
public final byte hashdepth;
@@ -227,11 +228,14 @@ public class MerkleTree implements Serializable
if (!ltree.fullRange.equals(rtree.fullRange))
throw new IllegalArgumentException("Difference only make sense on tree covering the same range (but " + ltree.fullRange + " != " + rtree.fullRange + ")");
- List<TreeRange> diff = new ArrayList<TreeRange>();
- TreeRange active = new TreeRange(null, ltree.fullRange.left, ltree.fullRange.right, (byte)0, null);
+ List<TreeRange> diff = new ArrayList<>();
+ TreeDifference active = new TreeDifference(ltree.fullRange.left, ltree.fullRange.right, (byte)0);
- byte[] lhash = ltree.hash(active);
- byte[] rhash = rtree.hash(active);
+ Hashable lnode = ltree.find(active);
+ Hashable rnode = rtree.find(active);
+ byte[] lhash = lnode.hash();
+ byte[] rhash = rnode.hash();
+ active.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
if (lhash != null && rhash != null && !Arrays.equals(lhash, rhash))
{
@@ -256,14 +260,19 @@ public class MerkleTree implements Serializable
return CONSISTENT;
Token midpoint = ltree.partitioner().midpoint(active.left, active.right);
- TreeRange left = new TreeRange(null, active.left, midpoint, inc(active.depth), null);
- TreeRange right = new TreeRange(null, midpoint, active.right, inc(active.depth), null);
- byte[] lhash;
- byte[] rhash;
+ TreeDifference left = new TreeDifference(active.left, midpoint, inc(active.depth));
+ TreeDifference right = new TreeDifference(midpoint, active.right, inc(active.depth));
+ byte[] lhash, rhash;
+ Hashable lnode, rnode;
// see if we should recurse left
- lhash = ltree.hash(left);
- rhash = rtree.hash(left);
+ lnode = ltree.find(left);
+ rnode = rtree.find(left);
+ lhash = lnode.hash();
+ rhash = rnode.hash();
+ left.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
+ left.setRows(lnode.rowsInRange(), rnode.rowsInRange());
+
int ldiff = CONSISTENT;
boolean lreso = lhash != null && rhash != null;
if (lreso && !Arrays.equals(lhash, rhash))
@@ -271,10 +280,14 @@ public class MerkleTree implements Serializable
else if (!lreso)
ldiff = FULLY_INCONSISTENT;
-
// see if we should recurse right
- lhash = ltree.hash(right);
- rhash = rtree.hash(right);
+ lnode = ltree.find(right);
+ rnode = rtree.find(right);
+ lhash = lnode.hash();
+ rhash = rnode.hash();
+ right.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
+ right.setRows(lnode.rowsInRange(), rnode.rowsInRange());
+
int rdiff = CONSISTENT;
boolean rreso = lhash != null && rhash != null;
if (rreso && !Arrays.equals(lhash, rhash))
@@ -363,54 +376,57 @@ public class MerkleTree implements Serializable
*/
public byte[] hash(Range<Token> range)
{
+ return find(range).hash();
+ }
+
+ /**
+ * Find the {@link Hashable} node that matches the given {@code range}.
+ *
+ * @param range Range to find
+ * @return {@link Hashable} found. If nothing found, return {@link Leaf} with null hash.
+ */
+ private Hashable find(Range<Token> range)
+ {
try
{
- return hashHelper(root, new Range<Token>(fullRange.left, fullRange.right), range);
+ return findHelper(root, new Range<Token>(fullRange.left, fullRange.right), range);
}
catch (StopRecursion e)
{
- return null;
+ return new Leaf();
}
}
/**
* @throws StopRecursion If no match could be found for the range.
*/
- private byte[] hashHelper(Hashable hashable, Range<Token> active, Range<Token> range) throws StopRecursion
+ private Hashable findHelper(Hashable current, Range<Token> activeRange, Range<Token> find) throws StopRecursion
{
- if (hashable instanceof Leaf)
+ if (current instanceof Leaf)
{
- if (!range.contains(active))
+ if (!find.contains(activeRange))
// we are not fully contained in this range!
throw new StopRecursion.BadRange();
- return hashable.hash();
+ return current;
}
// else: node.
- Inner node = (Inner)hashable;
- Range<Token> leftactive = new Range<Token>(active.left, node.token);
- Range<Token> rightactive = new Range<Token>(node.token, active.right);
+ Inner node = (Inner)current;
+ Range<Token> leftRange = new Range<Token>(activeRange.left, node.token);
+ Range<Token> rightRange = new Range<Token>(node.token, activeRange.right);
- if (range.contains(active))
- {
+ if (find.contains(activeRange))
// this node is fully contained in the range
- if (node.hash() != null)
- // we had a cached value
- return node.hash();
- // continue recursing to hash our children
- byte[] lhash = hashHelper(node.lchild(), leftactive, range);
- byte[] rhash = hashHelper(node.rchild(), rightactive, range);
- // cache the computed value (even if it is null)
- node.hash(lhash, rhash);
- return node.hash();
- } // else: one of our children contains the range
-
- if (leftactive.contains(range))
+ return node.calc();
+
+ // else: one of our children contains the range
+
+ if (leftRange.contains(find))
// left child contains/matches the range
- return hashHelper(node.lchild, leftactive, range);
- else if (rightactive.contains(range))
+ return findHelper(node.lchild, leftRange, find);
+ else if (rightRange.contains(find))
// right child contains/matches the range
- return hashHelper(node.rchild, rightactive, range);
+ return findHelper(node.rchild, rightRange, find);
else
throw new StopRecursion.BadRange();
}
@@ -479,6 +495,26 @@ public class MerkleTree implements Serializable
return new TreeRangeIterator(this);
}
+ public EstimatedHistogram histogramOfRowSizePerLeaf()
+ {
+ HistogramBuilder histbuild = new HistogramBuilder();
+ for (TreeRange range : new TreeRangeIterator(this))
+ {
+ histbuild.add(range.hashable.sizeOfRange);
+ }
+ return histbuild.buildWithStdevRangesAroundMean();
+ }
+
+ public EstimatedHistogram histogramOfRowCountPerLeaf()
+ {
+ HistogramBuilder histbuild = new HistogramBuilder();
+ for (TreeRange range : new TreeRangeIterator(this))
+ {
+ histbuild.add(range.hashable.rowsInRange);
+ }
+ return histbuild.buildWithStdevRangesAroundMean();
+ }
+
@Override
public String toString()
{
@@ -489,6 +525,59 @@ public class MerkleTree implements Serializable
return buff.toString();
}
+ public static class TreeDifference extends TreeRange
+ {
+ private static final long serialVersionUID = 6363654174549968183L;
+
+ private long sizeOnLeft;
+ private long sizeOnRight;
+ private long rowsOnLeft;
+ private long rowsOnRight;
+
+ void setSize(long sizeOnLeft, long sizeOnRight)
+ {
+ this.sizeOnLeft = sizeOnLeft;
+ this.sizeOnRight = sizeOnRight;
+ }
+
+ void setRows(long rowsOnLeft, long rowsOnRight)
+ {
+ this.rowsOnLeft = rowsOnLeft;
+ this.rowsOnRight = rowsOnRight;
+ }
+
+ public long sizeOnLeft()
+ {
+ return sizeOnLeft;
+ }
+
+ public long sizeOnRight()
+ {
+ return sizeOnRight;
+ }
+
+ public long rowsOnLeft()
+ {
+ return rowsOnLeft;
+ }
+
+ public long rowsOnRight()
+ {
+ return rowsOnRight;
+ }
+
+ public TreeDifference(Token left, Token right, byte depth)
+ {
+ super(null, left, right, depth, null);
+ }
+
+ public long totalRows()
+ {
+ return rowsOnLeft + rowsOnRight;
+ }
+
+ }
+
/**
* The public interface to a range in the tree.
*
@@ -531,7 +620,16 @@ public class MerkleTree implements Serializable
assert tree != null : "Not intended for modification!";
assert hashable instanceof Leaf;
- hashable.addHash(entry.hash);
+ hashable.addHash(entry.hash, entry.size);
+ }
+
+ public void ensureHashInitialised()
+ {
+ assert tree != null : "Not intended for modification!";
+ assert hashable instanceof Leaf;
+
+ if (hashable.hash == null)
+ hashable.hash = EMPTY_HASH;
}
public void addAll(Iterator<RowHash> entries)
@@ -662,6 +760,21 @@ public class MerkleTree implements Serializable
rchild = child;
}
+ Hashable calc()
+ {
+ if (hash == null)
+ {
+ // hash and size haven't been calculated; calc children then compute
+ Hashable lnode = lchild.calc();
+ Hashable rnode = rchild.calc();
+ // cache the computed value
+ hash(lnode.hash, rnode.hash);
+ sizeOfRange = lnode.sizeOfRange + rnode.sizeOfRange;
+ rowsInRange = lnode.rowsInRange + rnode.rowsInRange;
+ }
+ return this;
+ }
+
/**
* Recursive toString.
*/
@@ -729,12 +842,12 @@ public class MerkleTree implements Serializable
public long serializedSize(Inner inner, int version)
{
int size = inner.hash == null
- ? TypeSizes.NATIVE.sizeof(-1)
- : TypeSizes.NATIVE.sizeof(inner.hash().length) + inner.hash().length;
+ ? TypeSizes.NATIVE.sizeof(-1)
+ : TypeSizes.NATIVE.sizeof(inner.hash().length) + inner.hash().length;
size += Token.serializer.serializedSize(inner.token, TypeSizes.NATIVE)
- + Hashable.serializer.serializedSize(inner.lchild, version)
- + Hashable.serializer.serializedSize(inner.rchild, version);
+ + Hashable.serializer.serializedSize(inner.lchild, version)
+ + Hashable.serializer.serializedSize(inner.rchild, version);
return size;
}
}
@@ -768,11 +881,6 @@ public class MerkleTree implements Serializable
super(hash);
}
- public Leaf(byte[] lefthash, byte[] righthash)
- {
- super(Hashable.binaryHash(lefthash, righthash));
- }
-
public void toString(StringBuilder buff, int maxdepth)
{
buff.append(toString());
@@ -826,16 +934,18 @@ public class MerkleTree implements Serializable
{
public final Token token;
public final byte[] hash;
- public RowHash(Token token, byte[] hash)
+ public final long size;
+ public RowHash(Token token, byte[] hash, long size)
{
this.token = token;
this.hash = hash;
+ this.size = size;
}
@Override
public String toString()
{
- return "#<RowHash " + token + " " + Hashable.toString(hash) + ">";
+ return "#<RowHash " + token + " " + Hashable.toString(hash) + " @ " + size + " bytes>";
}
}
@@ -848,6 +958,8 @@ public class MerkleTree implements Serializable
private static final IVersionedSerializer<Hashable> serializer = new HashableSerializer();
protected byte[] hash;
+ protected long sizeOfRange;
+ protected long rowsInRange;
protected Hashable(byte[] hash)
{
@@ -859,11 +971,26 @@ public class MerkleTree implements Serializable
return hash;
}
+ public long sizeOfRange()
+ {
+ return sizeOfRange;
+ }
+
+ public long rowsInRange()
+ {
+ return rowsInRange;
+ }
+
void hash(byte[] hash)
{
this.hash = hash;
}
+ Hashable calc()
+ {
+ return this;
+ }
+
/**
* Sets the value of this hash to binaryHash of its children.
* @param lefthash Hash of left child.
@@ -878,12 +1005,14 @@ public class MerkleTree implements Serializable
* Mixes the given value into our hash. If our hash is null,
* our hash will become the given value.
*/
- void addHash(byte[] righthash)
+ void addHash(byte[] righthash, long sizeOfRow)
{
if (hash == null)
hash = righthash;
else
hash = binaryHash(hash, righthash);
+ this.sizeOfRange += sizeOfRow;
+ this.rowsInRange += 1;
}
/**
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/repair/DifferencerTest.java b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
index 2502620..3f259f2 100644
--- a/test/unit/org/apache/cassandra/repair/DifferencerTest.java
+++ b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
@@ -132,7 +132,7 @@ public class DifferencerTest extends SchemaLoader
tree.init();
for (MerkleTree.TreeRange r : tree.invalids())
{
- r.addHash(Validator.EMPTY_ROW);
+ r.ensureHashInitialised();
}
return tree;
}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
new file mode 100644
index 0000000..dfceaf3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class HistogramBuilderTest
+{
+
+ @Test
+ public void testStdevEmpty()
+ {
+ EstimatedHistogram hist = new HistogramBuilder().buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdevSingletonRanges()
+ {
+ EstimatedHistogram hist;
+ hist = new HistogramBuilder(new long[] { 5, 5, 5, 5, 5 }).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { 4, 5 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 5, 0 }, toArray(hist.buckets));
+ // should behave exactly the same for negative numbers
+ hist = new HistogramBuilder(new long[] { -1 }).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { -2, -1 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdevNearZeroStdev()
+ {
+ EstimatedHistogram hist;
+ long[] vals = new long[100000];
+ vals[0] = 99;
+ vals[1] = 101;
+ for (int i = 2 ; i < vals.length ; i++)
+ vals[i] = 100;
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { 98, 99, 100, 101 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, vals.length - 2, 1, 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdev()
+ {
+ long[] vals;
+ EstimatedHistogram hist;
+ vals = new long[] { -10, -3, -2, -2, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 10 };
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean(2);
+ assertArrayEquals(new long[] { -11, -6, -3, 0, 3, 6, 10 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, 1, 10, 10, 0, 1, 0 }, toArray(hist.buckets));
+ }
+
+ private static long[] toArray(AtomicLongArray a)
+ {
+ final long[] r = new long[a.length()];
+ for (int i = 0 ; i < r.length ; i++)
+ r[i] = a.get(i);
+ return r;
+ }
+
+ @Test
+ public void testStdevLargeNumbers()
+ {
+ long[] vals;
+ EstimatedHistogram hist;
+ vals = new long[100000];
+ for (int i = 0 ; i < vals.length ; i++)
+ {
+ if (i < vals.length * 0.6f)
+ vals[i] = 60;
+ else if (i < vals.length * 0.8f)
+ vals[i] = 120;
+ else if (i < vals.length * 0.9f)
+ vals[i] = 180;
+ else if (i < vals.length * 0.95f)
+ vals[i] = 240;
+ else if (i < vals.length * 0.98f)
+ vals[i] = 320;
+ else
+ vals[i] = 1000;
+ }
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean(2);
+ assertArrayEquals(new long[] { 59, 120, 260, 400, 1000 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 80000, 15000, 3000, 2000, 0 }, toArray(hist.buckets));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
index 70fe3a0..2b459c6 100644
--- a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
@@ -25,16 +25,15 @@ import com.google.common.collect.AbstractIterator;
import com.google.common.io.ByteArrayDataInput;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
-import org.apache.hadoop.hdfs.server.common.Storage;
+
import org.junit.Before;
import org.junit.Test;
-
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.MerkleTree.Hashable;
import org.apache.cassandra.utils.MerkleTree.RowHash;
+import org.apache.cassandra.utils.MerkleTree.TreeDifference;
import org.apache.cassandra.utils.MerkleTree.TreeRange;
import org.apache.cassandra.utils.MerkleTree.TreeRangeIterator;
@@ -322,7 +321,7 @@ public class MerkleTreeTest
// validate the tree
TreeRangeIterator ranges = mt.invalids();
for (TreeRange range : ranges)
- range.addHash(new RowHash(range.right, new byte[0]));
+ range.addHash(new RowHash(range.right, new byte[0], 0));
assert mt.hash(new Range<>(tok(-1), tok(-1))) != null :
"Could not hash tree " + mt;
@@ -491,7 +490,7 @@ public class MerkleTreeTest
public RowHash computeNext()
{
if (tokens.hasNext())
- return new RowHash(tokens.next(), DUMMY);
+ return new RowHash(tokens.next(), DUMMY, DUMMY.length);
return endOfData();
}
}
[2/3] git commit: Log Merkle tree precision stats
Posted by yu...@apache.org.
Log Merkle tree precision stats
patch by Benedict; reviewed by yukim for CASSANDRA-2698
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/18f0234b
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/18f0234b
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/18f0234b
Branch: refs/heads/trunk
Commit: 18f0234bf908d1f970bb694d0bb58539ab427387
Parents: 3205c10
Author: Benedict <su...@laerad.com>
Authored: Wed Aug 7 12:19:09 2013 -0500
Committer: Yuki Morishita <yu...@apache.org>
Committed: Wed Aug 7 12:19:58 2013 -0500
----------------------------------------------------------------------
CHANGES.txt | 1 +
.../org/apache/cassandra/repair/Validator.java | 61 ++++-
.../cassandra/utils/EstimatedHistogram.java | 65 ++++++
.../cassandra/utils/HistogramBuilder.java | 158 +++++++++++++
.../org/apache/cassandra/utils/MerkleTree.java | 233 ++++++++++++++-----
.../cassandra/repair/DifferencerTest.java | 2 +-
.../cassandra/utils/HistogramBuilderTest.java | 109 +++++++++
.../apache/cassandra/utils/MerkleTreeTest.java | 9 +-
8 files changed, 572 insertions(+), 66 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9509113..4dca1b8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,6 @@
2.0.1
* Notify indexer of columns shadowed by range tombstones (CASSANDRA-5614)
+ * Log Merkle tree stats (CASSANDRA-2698)
2.0.0
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/repair/Validator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/repair/Validator.java b/src/java/org/apache/cassandra/repair/Validator.java
index 4e8e865..e7efcaa 100644
--- a/src/java/org/apache/cassandra/repair/Validator.java
+++ b/src/java/org/apache/cassandra/repair/Validator.java
@@ -61,8 +61,6 @@ public class Validator implements Runnable
private transient MerkleTree.TreeRangeIterator ranges;
private transient DecoratedKey lastKey;
- public final static MerkleTree.RowHash EMPTY_ROW = new MerkleTree.RowHash(null, new byte[0]);
-
/**
* Create Validator with default size of initial Merkle Tree.
*/
@@ -145,7 +143,7 @@ public class Validator implements Runnable
while (!range.contains(row.key.token))
{
// add the empty hash, and move to the next range
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
range = ranges.next();
}
@@ -153,13 +151,52 @@ public class Validator implements Runnable
range.addHash(rowHash(row));
}
+ static class CountingDigest extends MessageDigest
+ {
+ private long count;
+ private MessageDigest underlying;
+
+ public CountingDigest(MessageDigest underlying)
+ {
+ super(underlying.getAlgorithm());
+ this.underlying = underlying;
+ }
+
+ @Override
+ protected void engineUpdate(byte input)
+ {
+ underlying.update(input);
+ count += 1;
+ }
+
+ @Override
+ protected void engineUpdate(byte[] input, int offset, int len)
+ {
+ underlying.update(input, offset, len);
+ count += len;
+ }
+
+ @Override
+ protected byte[] engineDigest()
+ {
+ return underlying.digest();
+ }
+
+ @Override
+ protected void engineReset()
+ {
+ underlying.reset();
+ }
+
+ }
+
private MerkleTree.RowHash rowHash(AbstractCompactedRow row)
{
validated++;
// MerkleTree uses XOR internally, so we want lots of output bits here
- MessageDigest digest = FBUtilities.newMessageDigest("SHA-256");
+ CountingDigest digest = new CountingDigest(FBUtilities.newMessageDigest("SHA-256"));
row.update(digest);
- return new MerkleTree.RowHash(row.key.token, digest.digest());
+ return new MerkleTree.RowHash(row.key.token, digest.digest(), digest.count);
}
/**
@@ -170,7 +207,15 @@ public class Validator implements Runnable
completeTree();
StageManager.getStage(Stage.ANTI_ENTROPY).execute(this);
- logger.debug("Validated " + validated + " rows into AEService tree for " + desc);
+
+ if (logger.isDebugEnabled())
+ {
+ // log distribution of rows in tree
+ logger.debug("Validated " + validated + " rows into AEService tree for " + desc + " with row count distribution:");
+ tree.histogramOfRowCountPerLeaf().log(logger);
+ logger.debug("Validated " + validated + " rows into AEService tree for " + desc + " with row size distribution:");
+ tree.histogramOfRowSizePerLeaf().log(logger);
+ }
}
@VisibleForTesting
@@ -179,11 +224,11 @@ public class Validator implements Runnable
assert ranges != null : "Validator was not prepared()";
if (range != null)
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
while (ranges.hasNext())
{
range = ranges.next();
- range.addHash(EMPTY_ROW);
+ range.ensureHashInitialised();
}
}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
index 54407a4..3a30290 100644
--- a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
+++ b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
@@ -27,6 +27,7 @@ import com.google.common.base.Objects;
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.io.ISerializer;
+import org.slf4j.Logger;
public class EstimatedHistogram
{
@@ -226,6 +227,70 @@ public class EstimatedHistogram
return buckets.get(buckets.length() - 1) > 0;
}
+ /**
+ * log.debug() every record in the histogram
+ *
+ * @param log
+ */
+ public void log(Logger log)
+ {
+
+ // only print overflow if there is any
+ int nameCount;
+ if (buckets.get(buckets.length() - 1) == 0)
+ nameCount = buckets.length() - 1;
+ else
+ nameCount = buckets.length();
+ String[] names = new String[nameCount];
+
+ int maxNameLength = 0;
+ for (int i = 0; i < nameCount; i++)
+ {
+ names[i] = nameOfRange(bucketOffsets, i);
+ maxNameLength = Math.max(maxNameLength, names[i].length());
+ }
+
+ // emit log records
+ String formatstr = "%" + maxNameLength + "s: %d";
+ for (int i = 0; i < nameCount; i++)
+ {
+ long count = buckets.get(i);
+ // sort-of-hack to not print empty ranges at the start that are only used to demarcate the
+ // first populated range. for code clarity we don't omit this record from the maxNameLength
+ // calculation, and accept the unnecessary whitespace prefixes that will occasionally occur
+ if (i == 0 && count == 0)
+ continue;
+ log.debug(String.format(formatstr, names[i], count));
+ }
+ }
+
+ private static String nameOfRange(long[] bucketOffsets, int index)
+ {
+ StringBuilder sb = new StringBuilder();
+ appendRange(sb, bucketOffsets, index);
+ return sb.toString();
+ }
+
+ private static void appendRange(StringBuilder sb, long[] bucketOffsets, int index)
+ {
+ sb.append("[");
+ if (index == 0)
+ if (bucketOffsets[0] > 0)
+ // by original definition, this histogram is for values greater than zero only;
+ // if values of 0 or less are required, an entry of lb-1 must be inserted at the start
+ sb.append("1");
+ else
+ sb.append("-Inf");
+ else
+ sb.append(bucketOffsets[index - 1] + 1);
+ sb.append("..");
+ if (index == bucketOffsets.length)
+ sb.append("Inf");
+ else
+ sb.append(bucketOffsets[index]);
+ sb.append("]");
+ }
+
@Override
public boolean equals(Object o)
{
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/HistogramBuilder.java b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
new file mode 100644
index 0000000..c2accb2
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/HistogramBuilder.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.Arrays;
+
+/**
+ * Simple class for constructing an EsimtatedHistogram from a set of predetermined values
+ */
+public class HistogramBuilder
+{
+
+ public HistogramBuilder() {}
+ public HistogramBuilder(long[] values)
+ {
+ for (long value : values)
+ {
+ add(value);
+ }
+ }
+
+ private long[] values = new long[10];
+ int count = 0;
+
+ public void add(long value)
+ {
+ if (count == values.length)
+ values = Arrays.copyOf(values, values.length << 1);
+ values[count++] = value;
+ }
+
+ /**
+ * See {@link #buildWithStdevRangesAroundMean(int)}
+ * @return buildWithStdevRangesAroundMean(3)
+ */
+ public EstimatedHistogram buildWithStdevRangesAroundMean()
+ {
+ return buildWithStdevRangesAroundMean(3);
+ }
+
+ /**
+ * Calculate the min, mean, max and standard deviation of the items in the builder, and
+ * generate an EstimatedHistogram with upto <code>maxdev</code> stdev size ranges either
+ * side of the mean, until min/max are hit; if either min/max are not reached a further range is
+ * inserted at the relevant ends. e.g., with a <code>maxdevs</code> of 3, there may be <i>up to</i> 8 ranges
+ * (between 9 boundaries, the middle being the mean); the middle 6 will have the same size (stdev)
+ * with the outermost two stretching out to min and max.
+ *
+ * @param maxdevs
+ * @return
+ */
+ public EstimatedHistogram buildWithStdevRangesAroundMean(int maxdevs)
+ {
+ if (maxdevs < 0)
+ throw new IllegalArgumentException("maxdevs must be greater than or equal to zero");
+
+ final int count = this.count;
+ final long[] values = this.values;
+
+ if (count == 0)
+ return new EstimatedHistogram(new long[] { }, new long[] { 0 });
+
+ long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+ double sum = 0, sumsq = 0;
+ for (int i = 0 ; i < count ; i++)
+ {
+ final long value = values[i];
+ sum += value;
+ sumsq += value * value;
+ if (value < min)
+ min = value;
+ if (value > max)
+ max = value;
+ }
+
+ final long mean = (long)Math.round(sum / count);
+ final double stdev =
+ Math.sqrt((sumsq / count) - (mean * (double) mean));
+
+ // build the ranges either side of the mean
+ final long[] lowhalf = buildRange(mean, min, true, stdev, maxdevs);
+ final long[] highhalf = buildRange(mean, max, false, stdev, maxdevs);
+
+ // combine the ranges
+ final long[] ranges = new long[lowhalf.length + highhalf.length + 1];
+ System.arraycopy(lowhalf, 0, ranges, 0, lowhalf.length);
+ ranges[lowhalf.length] = mean;
+ System.arraycopy(highhalf, 0, ranges, lowhalf.length + 1, highhalf.length);
+
+ final EstimatedHistogram hist = new EstimatedHistogram(ranges, new long[ranges.length + 1]);
+ for (int i = 0 ; i < count ; i++)
+ hist.add(values[i]);
+ return hist;
+ }
+
+ private static long[] buildRange(long mean, long minormax, boolean ismin, double stdev, int maxdevs)
+ {
+ if (minormax == mean)
+ // minormax == mean we have no range to produce, but given the exclusive starts
+ // that begin at zero by default (or -Inf) in EstimatedHistogram we have to generate a min range
+ // to indicate where we start from
+ return ismin ? new long[] { mean - 1 } : new long[0];
+
+ if (stdev < 1)
+ {
+ // deal with stdevs too small to generate sensible ranges
+ return ismin ? new long[] { minormax - 1, mean - 1 } :
+ new long[] { minormax };
+ }
+
+ long larger, smaller;
+ if (ismin) { larger = mean; smaller = minormax; }
+ else { larger = minormax; smaller = mean; }
+
+ double stdevsTo = (larger - smaller) / stdev;
+ if (stdevsTo > 0 && stdevsTo < 1)
+ // always round up if there's just one non-empty range
+ stdevsTo = 1;
+ else
+ // otherwise round to the nearest half stdev, to avoid tiny ranges at the start/end
+ stdevsTo = Math.round(stdevsTo);
+
+ // limit to 4 stdev ranges - last range will contain everything to boundary
+ final int len = Math.min(maxdevs + 1, (int) stdevsTo);
+ final long[] range = new long[len];
+ long next = ismin ? minormax - 1 : minormax;
+ for (int i = 0 ; i < range.length ; i++)
+ {
+ long delta = (range.length - (i + 1)) * (long) stdev;
+ if (ismin)
+ {
+ range[i] = next;
+ next = mean - delta;
+ }
+ else
+ {
+ range[len - 1 - i] = next;
+ next = mean + delta;
+ }
+ }
+ return range;
+ }
+}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/src/java/org/apache/cassandra/utils/MerkleTree.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/utils/MerkleTree.java b/src/java/org/apache/cassandra/utils/MerkleTree.java
index a16c2b9..ce71ec4 100644
--- a/src/java/org/apache/cassandra/utils/MerkleTree.java
+++ b/src/java/org/apache/cassandra/utils/MerkleTree.java
@@ -64,6 +64,7 @@ public class MerkleTree implements Serializable
public static final int CONSISTENT = 0;
public static final int FULLY_INCONSISTENT = 1;
public static final int PARTIALLY_INCONSISTENT = 2;
+ private static final byte[] EMPTY_HASH = new byte[0];
public final byte hashdepth;
@@ -227,11 +228,14 @@ public class MerkleTree implements Serializable
if (!ltree.fullRange.equals(rtree.fullRange))
throw new IllegalArgumentException("Difference only make sense on tree covering the same range (but " + ltree.fullRange + " != " + rtree.fullRange + ")");
- List<TreeRange> diff = new ArrayList<TreeRange>();
- TreeRange active = new TreeRange(null, ltree.fullRange.left, ltree.fullRange.right, (byte)0, null);
+ List<TreeRange> diff = new ArrayList<>();
+ TreeDifference active = new TreeDifference(ltree.fullRange.left, ltree.fullRange.right, (byte)0);
- byte[] lhash = ltree.hash(active);
- byte[] rhash = rtree.hash(active);
+ Hashable lnode = ltree.find(active);
+ Hashable rnode = rtree.find(active);
+ byte[] lhash = lnode.hash();
+ byte[] rhash = rnode.hash();
+ active.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
if (lhash != null && rhash != null && !Arrays.equals(lhash, rhash))
{
@@ -256,14 +260,19 @@ public class MerkleTree implements Serializable
return CONSISTENT;
Token midpoint = ltree.partitioner().midpoint(active.left, active.right);
- TreeRange left = new TreeRange(null, active.left, midpoint, inc(active.depth), null);
- TreeRange right = new TreeRange(null, midpoint, active.right, inc(active.depth), null);
- byte[] lhash;
- byte[] rhash;
+ TreeDifference left = new TreeDifference(active.left, midpoint, inc(active.depth));
+ TreeDifference right = new TreeDifference(midpoint, active.right, inc(active.depth));
+ byte[] lhash, rhash;
+ Hashable lnode, rnode;
// see if we should recurse left
- lhash = ltree.hash(left);
- rhash = rtree.hash(left);
+ lnode = ltree.find(left);
+ rnode = rtree.find(left);
+ lhash = lnode.hash();
+ rhash = rnode.hash();
+ left.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
+ left.setRows(lnode.rowsInRange(), rnode.rowsInRange());
+
int ldiff = CONSISTENT;
boolean lreso = lhash != null && rhash != null;
if (lreso && !Arrays.equals(lhash, rhash))
@@ -271,10 +280,14 @@ public class MerkleTree implements Serializable
else if (!lreso)
ldiff = FULLY_INCONSISTENT;
-
// see if we should recurse right
- lhash = ltree.hash(right);
- rhash = rtree.hash(right);
+ lnode = ltree.find(right);
+ rnode = rtree.find(right);
+ lhash = lnode.hash();
+ rhash = rnode.hash();
+ right.setSize(lnode.sizeOfRange(), rnode.sizeOfRange());
+ right.setRows(lnode.rowsInRange(), rnode.rowsInRange());
+
int rdiff = CONSISTENT;
boolean rreso = lhash != null && rhash != null;
if (rreso && !Arrays.equals(lhash, rhash))
@@ -363,54 +376,57 @@ public class MerkleTree implements Serializable
*/
public byte[] hash(Range<Token> range)
{
+ return find(range).hash();
+ }
+
+ /**
+ * Find the {@link Hashable} node that matches the given {@code range}.
+ *
+ * @param range Range to find
+ * @return {@link Hashable} found. If nothing found, return {@link Leaf} with null hash.
+ */
+ private Hashable find(Range<Token> range)
+ {
try
{
- return hashHelper(root, new Range<Token>(fullRange.left, fullRange.right), range);
+ return findHelper(root, new Range<Token>(fullRange.left, fullRange.right), range);
}
catch (StopRecursion e)
{
- return null;
+ return new Leaf();
}
}
/**
* @throws StopRecursion If no match could be found for the range.
*/
- private byte[] hashHelper(Hashable hashable, Range<Token> active, Range<Token> range) throws StopRecursion
+ private Hashable findHelper(Hashable current, Range<Token> activeRange, Range<Token> find) throws StopRecursion
{
- if (hashable instanceof Leaf)
+ if (current instanceof Leaf)
{
- if (!range.contains(active))
+ if (!find.contains(activeRange))
// we are not fully contained in this range!
throw new StopRecursion.BadRange();
- return hashable.hash();
+ return current;
}
// else: node.
- Inner node = (Inner)hashable;
- Range<Token> leftactive = new Range<Token>(active.left, node.token);
- Range<Token> rightactive = new Range<Token>(node.token, active.right);
+ Inner node = (Inner)current;
+ Range<Token> leftRange = new Range<Token>(activeRange.left, node.token);
+ Range<Token> rightRange = new Range<Token>(node.token, activeRange.right);
- if (range.contains(active))
- {
+ if (find.contains(activeRange))
// this node is fully contained in the range
- if (node.hash() != null)
- // we had a cached value
- return node.hash();
- // continue recursing to hash our children
- byte[] lhash = hashHelper(node.lchild(), leftactive, range);
- byte[] rhash = hashHelper(node.rchild(), rightactive, range);
- // cache the computed value (even if it is null)
- node.hash(lhash, rhash);
- return node.hash();
- } // else: one of our children contains the range
-
- if (leftactive.contains(range))
+ return node.calc();
+
+ // else: one of our children contains the range
+
+ if (leftRange.contains(find))
// left child contains/matches the range
- return hashHelper(node.lchild, leftactive, range);
- else if (rightactive.contains(range))
+ return findHelper(node.lchild, leftRange, find);
+ else if (rightRange.contains(find))
// right child contains/matches the range
- return hashHelper(node.rchild, rightactive, range);
+ return findHelper(node.rchild, rightRange, find);
else
throw new StopRecursion.BadRange();
}
@@ -479,6 +495,26 @@ public class MerkleTree implements Serializable
return new TreeRangeIterator(this);
}
+ public EstimatedHistogram histogramOfRowSizePerLeaf()
+ {
+ HistogramBuilder histbuild = new HistogramBuilder();
+ for (TreeRange range : new TreeRangeIterator(this))
+ {
+ histbuild.add(range.hashable.sizeOfRange);
+ }
+ return histbuild.buildWithStdevRangesAroundMean();
+ }
+
+ public EstimatedHistogram histogramOfRowCountPerLeaf()
+ {
+ HistogramBuilder histbuild = new HistogramBuilder();
+ for (TreeRange range : new TreeRangeIterator(this))
+ {
+ histbuild.add(range.hashable.rowsInRange);
+ }
+ return histbuild.buildWithStdevRangesAroundMean();
+ }
+
@Override
public String toString()
{
@@ -489,6 +525,59 @@ public class MerkleTree implements Serializable
return buff.toString();
}
+ public static class TreeDifference extends TreeRange
+ {
+ private static final long serialVersionUID = 6363654174549968183L;
+
+ private long sizeOnLeft;
+ private long sizeOnRight;
+ private long rowsOnLeft;
+ private long rowsOnRight;
+
+ void setSize(long sizeOnLeft, long sizeOnRight)
+ {
+ this.sizeOnLeft = sizeOnLeft;
+ this.sizeOnRight = sizeOnRight;
+ }
+
+ void setRows(long rowsOnLeft, long rowsOnRight)
+ {
+ this.rowsOnLeft = rowsOnLeft;
+ this.rowsOnRight = rowsOnRight;
+ }
+
+ public long sizeOnLeft()
+ {
+ return sizeOnLeft;
+ }
+
+ public long sizeOnRight()
+ {
+ return sizeOnRight;
+ }
+
+ public long rowsOnLeft()
+ {
+ return rowsOnLeft;
+ }
+
+ public long rowsOnRight()
+ {
+ return rowsOnRight;
+ }
+
+ public TreeDifference(Token left, Token right, byte depth)
+ {
+ super(null, left, right, depth, null);
+ }
+
+ public long totalRows()
+ {
+ return rowsOnLeft + rowsOnRight;
+ }
+
+ }
+
/**
* The public interface to a range in the tree.
*
@@ -531,7 +620,16 @@ public class MerkleTree implements Serializable
assert tree != null : "Not intended for modification!";
assert hashable instanceof Leaf;
- hashable.addHash(entry.hash);
+ hashable.addHash(entry.hash, entry.size);
+ }
+
+ public void ensureHashInitialised()
+ {
+ assert tree != null : "Not intended for modification!";
+ assert hashable instanceof Leaf;
+
+ if (hashable.hash == null)
+ hashable.hash = EMPTY_HASH;
}
public void addAll(Iterator<RowHash> entries)
@@ -662,6 +760,21 @@ public class MerkleTree implements Serializable
rchild = child;
}
+ Hashable calc()
+ {
+ if (hash == null)
+ {
+ // hash and size haven't been calculated; calc children then compute
+ Hashable lnode = lchild.calc();
+ Hashable rnode = rchild.calc();
+ // cache the computed value
+ hash(lnode.hash, rnode.hash);
+ sizeOfRange = lnode.sizeOfRange + rnode.sizeOfRange;
+ rowsInRange = lnode.rowsInRange + rnode.rowsInRange;
+ }
+ return this;
+ }
+
/**
* Recursive toString.
*/
@@ -729,12 +842,12 @@ public class MerkleTree implements Serializable
public long serializedSize(Inner inner, int version)
{
int size = inner.hash == null
- ? TypeSizes.NATIVE.sizeof(-1)
- : TypeSizes.NATIVE.sizeof(inner.hash().length) + inner.hash().length;
+ ? TypeSizes.NATIVE.sizeof(-1)
+ : TypeSizes.NATIVE.sizeof(inner.hash().length) + inner.hash().length;
size += Token.serializer.serializedSize(inner.token, TypeSizes.NATIVE)
- + Hashable.serializer.serializedSize(inner.lchild, version)
- + Hashable.serializer.serializedSize(inner.rchild, version);
+ + Hashable.serializer.serializedSize(inner.lchild, version)
+ + Hashable.serializer.serializedSize(inner.rchild, version);
return size;
}
}
@@ -768,11 +881,6 @@ public class MerkleTree implements Serializable
super(hash);
}
- public Leaf(byte[] lefthash, byte[] righthash)
- {
- super(Hashable.binaryHash(lefthash, righthash));
- }
-
public void toString(StringBuilder buff, int maxdepth)
{
buff.append(toString());
@@ -826,16 +934,18 @@ public class MerkleTree implements Serializable
{
public final Token token;
public final byte[] hash;
- public RowHash(Token token, byte[] hash)
+ public final long size;
+ public RowHash(Token token, byte[] hash, long size)
{
this.token = token;
this.hash = hash;
+ this.size = size;
}
@Override
public String toString()
{
- return "#<RowHash " + token + " " + Hashable.toString(hash) + ">";
+ return "#<RowHash " + token + " " + Hashable.toString(hash) + " @ " + size + " bytes>";
}
}
@@ -848,6 +958,8 @@ public class MerkleTree implements Serializable
private static final IVersionedSerializer<Hashable> serializer = new HashableSerializer();
protected byte[] hash;
+ protected long sizeOfRange;
+ protected long rowsInRange;
protected Hashable(byte[] hash)
{
@@ -859,11 +971,26 @@ public class MerkleTree implements Serializable
return hash;
}
+ public long sizeOfRange()
+ {
+ return sizeOfRange;
+ }
+
+ public long rowsInRange()
+ {
+ return rowsInRange;
+ }
+
void hash(byte[] hash)
{
this.hash = hash;
}
+ Hashable calc()
+ {
+ return this;
+ }
+
/**
* Sets the value of this hash to binaryHash of its children.
* @param lefthash Hash of left child.
@@ -878,12 +1005,14 @@ public class MerkleTree implements Serializable
* Mixes the given value into our hash. If our hash is null,
* our hash will become the given value.
*/
- void addHash(byte[] righthash)
+ void addHash(byte[] righthash, long sizeOfRow)
{
if (hash == null)
hash = righthash;
else
hash = binaryHash(hash, righthash);
+ this.sizeOfRange += sizeOfRow;
+ this.rowsInRange += 1;
}
/**
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/repair/DifferencerTest.java b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
index 2502620..3f259f2 100644
--- a/test/unit/org/apache/cassandra/repair/DifferencerTest.java
+++ b/test/unit/org/apache/cassandra/repair/DifferencerTest.java
@@ -132,7 +132,7 @@ public class DifferencerTest extends SchemaLoader
tree.init();
for (MerkleTree.TreeRange r : tree.invalids())
{
- r.addHash(Validator.EMPTY_ROW);
+ r.ensureHashInitialised();
}
return tree;
}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
new file mode 100644
index 0000000..dfceaf3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class HistogramBuilderTest
+{
+
+ @Test
+ public void testStdevEmpty()
+ {
+ EstimatedHistogram hist = new HistogramBuilder().buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdevSingletonRanges()
+ {
+ EstimatedHistogram hist;
+ hist = new HistogramBuilder(new long[] { 5, 5, 5, 5, 5 }).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { 4, 5 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 5, 0 }, toArray(hist.buckets));
+ // should behave exactly the same for negative numbers
+ hist = new HistogramBuilder(new long[] { -1 }).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { -2, -1 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdevNearZeroStdev()
+ {
+ EstimatedHistogram hist;
+ long[] vals = new long[100000];
+ vals[0] = 99;
+ vals[1] = 101;
+ for (int i = 2 ; i < vals.length ; i++)
+ vals[i] = 100;
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean();
+ assertArrayEquals(new long[] { 98, 99, 100, 101 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, vals.length - 2, 1, 0 }, toArray(hist.buckets));
+ }
+
+ @Test
+ public void testStdev()
+ {
+ long[] vals;
+ EstimatedHistogram hist;
+ vals = new long[] { -10, -3, -2, -2, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 10 };
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean(2);
+ assertArrayEquals(new long[] { -11, -6, -3, 0, 3, 6, 10 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 1, 1, 10, 10, 0, 1, 0 }, toArray(hist.buckets));
+ }
+
+ private static long[] toArray(AtomicLongArray a)
+ {
+ final long[] r = new long[a.length()];
+ for (int i = 0 ; i < r.length ; i++)
+ r[i] = a.get(i);
+ return r;
+ }
+
+ @Test
+ public void testStdevLargeNumbers()
+ {
+ long[] vals;
+ EstimatedHistogram hist;
+ vals = new long[100000];
+ for (int i = 0 ; i < vals.length ; i++)
+ {
+ if (i < vals.length * 0.6f)
+ vals[i] = 60;
+ else if (i < vals.length * 0.8f)
+ vals[i] = 120;
+ else if (i < vals.length * 0.9f)
+ vals[i] = 180;
+ else if (i < vals.length * 0.95f)
+ vals[i] = 240;
+ else if (i < vals.length * 0.98f)
+ vals[i] = 320;
+ else
+ vals[i] = 1000;
+ }
+ hist = new HistogramBuilder(vals).buildWithStdevRangesAroundMean(2);
+ assertArrayEquals(new long[] { 59, 120, 260, 400, 1000 }, hist.getBucketOffsets());
+ assertArrayEquals(new long[] { 0, 80000, 15000, 3000, 2000, 0 }, toArray(hist.buckets));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/cassandra/blob/18f0234b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
index 70fe3a0..2b459c6 100644
--- a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
@@ -25,16 +25,15 @@ import com.google.common.collect.AbstractIterator;
import com.google.common.io.ByteArrayDataInput;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
-import org.apache.hadoop.hdfs.server.common.Storage;
+
import org.junit.Before;
import org.junit.Test;
-
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.MerkleTree.Hashable;
import org.apache.cassandra.utils.MerkleTree.RowHash;
+import org.apache.cassandra.utils.MerkleTree.TreeDifference;
import org.apache.cassandra.utils.MerkleTree.TreeRange;
import org.apache.cassandra.utils.MerkleTree.TreeRangeIterator;
@@ -322,7 +321,7 @@ public class MerkleTreeTest
// validate the tree
TreeRangeIterator ranges = mt.invalids();
for (TreeRange range : ranges)
- range.addHash(new RowHash(range.right, new byte[0]));
+ range.addHash(new RowHash(range.right, new byte[0], 0));
assert mt.hash(new Range<>(tok(-1), tok(-1))) != null :
"Could not hash tree " + mt;
@@ -491,7 +490,7 @@ public class MerkleTreeTest
public RowHash computeNext()
{
if (tokens.hasNext())
- return new RowHash(tokens.next(), DUMMY);
+ return new RowHash(tokens.next(), DUMMY, DUMMY.length);
return endOfData();
}
}
[3/3] git commit: Merge branch 'cassandra-2.0' into trunk
Posted by yu...@apache.org.
Merge branch 'cassandra-2.0' into trunk
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/9cf1068b
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/9cf1068b
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/9cf1068b
Branch: refs/heads/trunk
Commit: 9cf1068b73e9d7ce47b3d2eaf223a20cb6779940
Parents: 426caf7 18f0234
Author: Yuki Morishita <yu...@apache.org>
Authored: Wed Aug 7 12:20:19 2013 -0500
Committer: Yuki Morishita <yu...@apache.org>
Committed: Wed Aug 7 12:20:19 2013 -0500
----------------------------------------------------------------------
CHANGES.txt | 1 +
.../org/apache/cassandra/repair/Validator.java | 61 ++++-
.../cassandra/utils/EstimatedHistogram.java | 65 ++++++
.../cassandra/utils/HistogramBuilder.java | 158 +++++++++++++
.../org/apache/cassandra/utils/MerkleTree.java | 233 ++++++++++++++-----
.../cassandra/repair/DifferencerTest.java | 2 +-
.../cassandra/utils/HistogramBuilderTest.java | 109 +++++++++
.../apache/cassandra/utils/MerkleTreeTest.java | 9 +-
8 files changed, 572 insertions(+), 66 deletions(-)
----------------------------------------------------------------------