You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bb...@apache.org on 2022/09/06 19:38:04 UTC
[hbase] branch branch-2.5 updated: HBASE-27224 HFile tool statistic sampling produces misleading results (#4638)
This is an automated email from the ASF dual-hosted git repository.
bbeaudreault pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.5 by this push:
new 6f6857b83fb HBASE-27224 HFile tool statistic sampling produces misleading results (#4638)
6f6857b83fb is described below
commit 6f6857b83fbf90e77d58642da38c8bc878ccdf2f
Author: Bryan Beaudreault <bb...@hubspot.com>
AuthorDate: Tue Sep 6 15:04:32 2022 -0400
HBASE-27224 HFile tool statistic sampling produces misleading results (#4638)
Signed-off-by: Duo Zhang <zh...@apache.org>
Reviewed-by: Clay Baenziger <cw...@clayb.net>
---
.../hadoop/hbase/io/hfile/HFilePrettyPrinter.java | 240 +++++++++++++++------
.../hbase/io/hfile/TestHFilePrettyPrinter.java | 56 +++++
2 files changed, 226 insertions(+), 70 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
index cb74ed39a85..38e422ce09c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
@@ -20,21 +20,17 @@ package org.apache.hadoop.hbase.io.hfile;
import static com.codahale.metrics.MetricRegistry.name;
import com.codahale.metrics.ConsoleReporter;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Gauge;
import com.codahale.metrics.Histogram;
-import com.codahale.metrics.Meter;
-import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
-import com.codahale.metrics.ScheduledReporter;
import com.codahale.metrics.Snapshot;
-import com.codahale.metrics.Timer;
+import com.codahale.metrics.UniformReservoir;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.IOException;
import java.io.PrintStream;
import java.text.DateFormat;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
@@ -43,10 +39,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
-import java.util.SortedMap;
import java.util.TimeZone;
-import java.util.concurrent.TimeUnit;
-import org.apache.commons.lang3.StringUtils;
+import java.util.concurrent.atomic.LongAdder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
@@ -107,6 +101,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
private boolean printBlockIndex;
private boolean printBlockHeaders;
private boolean printStats;
+ private boolean printStatRanges;
private boolean checkRow;
private boolean checkFamily;
private boolean isSeekToRow = false;
@@ -150,6 +145,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
options.addOption("w", "seekToRow", true,
"Seek to this row and print all the kvs for this row only");
options.addOption("s", "stats", false, "Print statistics");
+ options.addOption("d", "details", false,
+ "Print detailed statistics, including counts by range");
options.addOption("i", "checkMobIntegrity", false,
"Print all cells whose mob files are missing");
@@ -181,7 +178,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
shouldPrintMeta = cmd.hasOption("m");
printBlockIndex = cmd.hasOption("b");
printBlockHeaders = cmd.hasOption("h");
- printStats = cmd.hasOption("s");
+ printStatRanges = cmd.hasOption("d");
+ printStats = cmd.hasOption("s") || printStatRanges;
checkRow = cmd.hasOption("k");
checkFamily = cmd.hasOption("a");
checkMobIntegrity = cmd.hasOption("i");
@@ -356,7 +354,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
if (printStats) {
- fileStats.finish();
+ fileStats.finish(printStatRanges);
out.println("Stats:\n" + fileStats);
}
@@ -382,7 +380,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
// collect stats
if (printStats) {
- fileStats.collect(cell);
+ fileStats.collect(cell, printStatRanges);
}
// dump key value
if (printKey) {
@@ -581,18 +579,101 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
}
+ // Default reservoir is exponentially decaying, but we're doing a point-in-time analysis
+ // of a store file. It doesn't make sense to prefer keys later in the store file.
+ private static final MetricRegistry.MetricSupplier<Histogram> UNIFORM_RESERVOIR =
+ () -> new Histogram(new UniformReservoir());
+
+ // Useful ranges for viewing distribution of small to large keys, values, and rows.
+ // we only print ranges which actually have values, so more here doesn't add much overhead
+ private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000,
+ 50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 };
+
+ /**
+ * Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key
+ * bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram,
+ * and getting counts per range.
+ */
+ static class KeyValueStats {
+ private final Histogram histogram;
+ private final String name;
+ private long max = Long.MIN_VALUE;
+ private long min = Long.MAX_VALUE;
+ private boolean collectRanges = false;
+ private final LongAdder[] rangeCounts;
+
+ KeyValueStats(MetricRegistry metricRegistry, String statName) {
+ this.histogram =
+ metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR);
+ this.name = statName;
+ this.rangeCounts = new LongAdder[RANGES.length];
+ for (int i = 0; i < rangeCounts.length; i++) {
+ rangeCounts[i] = new LongAdder();
+ }
+ }
+
+ void update(long value, boolean collectRanges) {
+ histogram.update(value);
+ min = Math.min(value, min);
+ max = Math.max(value, max);
+
+ if (collectRanges) {
+ this.collectRanges = true;
+ int result = Arrays.binarySearch(RANGES, value);
+ int idx = result >= 0 ? result : Math.abs(result) - 1;
+ rangeCounts[idx].increment();
+ }
+ }
+
+ Histogram getHistogram() {
+ return histogram;
+ }
+
+ String getName() {
+ return name;
+ }
+
+ long getMax() {
+ return max;
+ }
+
+ long getMin() {
+ return min;
+ }
+
+ long[] getRanges() {
+ return RANGES;
+ }
+
+ long getCountAtOrBelow(long range) {
+ long count = 0;
+ for (int i = 0; i < RANGES.length; i++) {
+ if (RANGES[i] <= range) {
+ count += rangeCounts[i].sum();
+ } else {
+ break;
+ }
+ }
+ return count;
+ }
+
+ boolean hasRangeCounts() {
+ return collectRanges;
+ }
+ }
+
private static class KeyValueStatsCollector {
private final MetricRegistry metricsRegistry = new MetricRegistry();
private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream();
- private final SimpleReporter simpleReporter = SimpleReporter.forRegistry(metricsRegistry)
- .outputTo(new PrintStream(metricsOutput)).filter(MetricFilter.ALL).build();
- Histogram keyLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Key length"));
- Histogram valLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Val length"));
- Histogram rowSizeBytes =
- metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (bytes)"));
- Histogram rowSizeCols =
- metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (columns)"));
+ KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length");
+ KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length");
+ KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)");
+ KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)");
+
+ private final SimpleReporter simpleReporter =
+ SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen)
+ .addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build();
long curRowBytes = 0;
long curRowCols = 0;
@@ -603,11 +684,11 @@ public class HFilePrettyPrinter extends Configured implements Tool {
private long maxRowBytes = 0;
private long curRowKeyLength;
- public void collect(Cell cell) {
- valLen.update(cell.getValueLength());
+ public void collect(Cell cell, boolean printStatRanges) {
+ valLen.update(cell.getValueLength(), printStatRanges);
if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) {
// new row
- collectRow();
+ collectRow(printStatRanges);
}
curRowBytes += cell.getSerializedSize();
curRowKeyLength = KeyValueUtil.keyLength(cell);
@@ -615,10 +696,10 @@ public class HFilePrettyPrinter extends Configured implements Tool {
prevCell = cell;
}
- private void collectRow() {
- rowSizeBytes.update(curRowBytes);
- rowSizeCols.update(curRowCols);
- keyLen.update(curRowKeyLength);
+ private void collectRow(boolean printStatRanges) {
+ rowSizeBytes.update(curRowBytes, printStatRanges);
+ rowSizeCols.update(curRowCols, printStatRanges);
+ keyLen.update(curRowKeyLength, printStatRanges);
if (curRowBytes > maxRowBytes && prevCell != null) {
biggestRow = CellUtil.cloneRow(prevCell);
@@ -629,9 +710,9 @@ public class HFilePrettyPrinter extends Configured implements Tool {
curRowCols = 0;
}
- public void finish() {
+ public void finish(boolean printStatRanges) {
if (curRowCols > 0) {
- collectRow();
+ collectRow(printStatRanges);
}
}
@@ -640,7 +721,6 @@ public class HFilePrettyPrinter extends Configured implements Tool {
if (prevCell == null) return "no data available for statistics";
// Dump the metrics to the output stream
- simpleReporter.stop();
simpleReporter.report();
return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
@@ -648,41 +728,32 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
/**
- * Almost identical to ConsoleReporter, but extending ScheduledReporter, as extending
- * ConsoleReporter in this version of dropwizard is now too much trouble.
+ * Simple reporter which collects registered histograms for printing to an output stream in
+ * {@link #report()}.
*/
- private static class SimpleReporter extends ScheduledReporter {
+ private static final class SimpleReporter {
/**
- * Returns a new {@link Builder} for {@link ConsoleReporter}.
- * @param registry the registry to report
- * @return a {@link Builder} instance for a {@link ConsoleReporter}
+ * Returns a new {@link Builder} for {@link SimpleReporter}.
+ * @return a {@link Builder} instance for a {@link SimpleReporter}
*/
- public static Builder forRegistry(MetricRegistry registry) {
- return new Builder(registry);
+ public static Builder newBuilder() {
+ return new Builder();
}
/**
* A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time
- * zone, writing to {@code System.out}, converting rates to events/second, converting durations
- * to milliseconds, and not filtering metrics.
+ * zone, writing to {@code System.out}.
*/
public static class Builder {
- private final MetricRegistry registry;
+ private final List<KeyValueStats> stats = new ArrayList<>();
private PrintStream output;
private Locale locale;
private TimeZone timeZone;
- private TimeUnit rateUnit;
- private TimeUnit durationUnit;
- private MetricFilter filter;
- private Builder(MetricRegistry registry) {
- this.registry = registry;
+ private Builder() {
this.output = System.out;
this.locale = Locale.getDefault();
this.timeZone = TimeZone.getDefault();
- this.rateUnit = TimeUnit.SECONDS;
- this.durationUnit = TimeUnit.MILLISECONDS;
- this.filter = MetricFilter.ALL;
}
/**
@@ -696,12 +767,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
/**
- * Only report metrics which match the given filter.
- * @param filter a {@link MetricFilter}
+ * Add the given {@link KeyValueStats} to be reported
+ * @param stat the stat to be reported
* @return {@code this}
*/
- public Builder filter(MetricFilter filter) {
- this.filter = filter;
+ public Builder addStats(KeyValueStats stat) {
+ this.stats.add(stat);
return this;
}
@@ -710,35 +781,31 @@ public class HFilePrettyPrinter extends Configured implements Tool {
* @return a {@link ConsoleReporter}
*/
public SimpleReporter build() {
- return new SimpleReporter(registry, output, locale, timeZone, rateUnit, durationUnit,
- filter);
+ return new SimpleReporter(output, stats, locale, timeZone);
}
}
private final PrintStream output;
+ private final List<KeyValueStats> stats;
private final Locale locale;
private final DateFormat dateFormat;
- private SimpleReporter(MetricRegistry registry, PrintStream output, Locale locale,
- TimeZone timeZone, TimeUnit rateUnit, TimeUnit durationUnit, MetricFilter filter) {
- super(registry, "simple-reporter", filter, rateUnit, durationUnit);
+ private SimpleReporter(PrintStream output, List<KeyValueStats> stats, Locale locale,
+ TimeZone timeZone) {
this.output = output;
+ this.stats = stats;
this.locale = locale;
-
this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale);
dateFormat.setTimeZone(timeZone);
}
- @Override
- public void report(SortedMap<String, Gauge> gauges, SortedMap<String, Counter> counters,
- SortedMap<String, Histogram> histograms, SortedMap<String, Meter> meters,
- SortedMap<String, Timer> timers) {
+ public void report() {
// we know we only have histograms
- if (!histograms.isEmpty()) {
- for (Map.Entry<String, Histogram> entry : histograms.entrySet()) {
- output.print(" " + StringUtils.substringAfterLast(entry.getKey(), "."));
+ if (!stats.isEmpty()) {
+ for (KeyValueStats stat : stats) {
+ output.print(" " + stat.getName());
output.println(':');
- printHistogram(entry.getValue());
+ printHistogram(stat);
}
output.println();
}
@@ -747,10 +814,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
output.flush();
}
- private void printHistogram(Histogram histogram) {
+ private void printHistogram(KeyValueStats stats) {
+ Histogram histogram = stats.getHistogram();
Snapshot snapshot = histogram.getSnapshot();
- output.printf(locale, " min = %d%n", snapshot.getMin());
- output.printf(locale, " max = %d%n", snapshot.getMax());
+
+ output.printf(locale, " min = %d%n", stats.getMin());
+ output.printf(locale, " max = %d%n", stats.getMax());
output.printf(locale, " mean = %2.2f%n", snapshot.getMean());
output.printf(locale, " stddev = %2.2f%n", snapshot.getStdDev());
output.printf(locale, " median = %2.2f%n", snapshot.getMedian());
@@ -760,6 +829,37 @@ public class HFilePrettyPrinter extends Configured implements Tool {
output.printf(locale, " 99%% <= %2.2f%n", snapshot.get99thPercentile());
output.printf(locale, " 99.9%% <= %2.2f%n", snapshot.get999thPercentile());
output.printf(locale, " count = %d%n", histogram.getCount());
+
+ // if printStatRanges was enabled with -d arg, below we'll create an approximate histogram
+ // of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <=
+ // 100, etc) will have a count printed if any values were seen in that range. If no values
+ // were seen for a range, that range will be excluded to keep the output small.
+ if (stats.hasRangeCounts()) {
+ output.printf(locale, " (range <= count):%n");
+ long lastVal = 0;
+ long lastRange = 0;
+ for (long range : stats.getRanges()) {
+ long val = stats.getCountAtOrBelow(range);
+ if (val - lastVal > 0) {
+ // print the last zero value before this one, to give context
+ if (lastVal == 0 && lastRange != 0) {
+ printRangeCount(lastRange, lastVal);
+ }
+ printRangeCount(range, val - lastVal);
+ }
+ lastVal = val;
+ lastRange = range;
+ }
+ if (histogram.getCount() - lastVal > 0) {
+ // print any remaining that might have been outside our buckets
+ printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal);
+ }
+ }
+ }
+
+ private void printRangeCount(long range, long countAtOrBelow) {
+ String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range);
+ output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow);
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
index 7a78981c665..9ec5dc1c576 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
+import com.codahale.metrics.MetricRegistry;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.apache.hadoop.conf.Configuration;
@@ -126,4 +127,59 @@ public class TestHFilePrettyPrinter {
String expectedResult = "Scanning -> " + fileNotInRootDir + "\n" + "Scanned kv count -> 1\n";
assertEquals(expectedResult, result);
}
+
+ @Test
+ public void testHistograms() throws Exception {
+ Path fileNotInRootDir = UTIL.getDataTestDir("hfile");
+ TestHRegionServerBulkLoad.createHFile(fs, fileNotInRootDir, cf, fam, value, 1000);
+ assertNotEquals("directory used is not an HBase root dir", UTIL.getDefaultRootDirPath(),
+ fileNotInRootDir);
+
+ System.setOut(ps);
+ new HFilePrettyPrinter(conf).run(new String[] { "-s", "-d", String.valueOf(fileNotInRootDir) });
+ String result = stream.toString();
+ LOG.info(result);
+
+ // split out the output into sections based on the headers
+ String[] headers =
+ new String[] { "Key length", "Val length", "Row size (bytes)", "Row size (columns)" };
+ // for each section, there is a corresponding expected (count, range) pairs
+ int[][] expectations = new int[][] { new int[] { 0, 10, 1000, 50 }, new int[] { 0, 1, 1000, 3 },
+ new int[] { 0, 10, 1000, 50 }, new int[] { 1000, 1 }, };
+
+ for (int i = 0; i < headers.length - 1; i++) {
+ int idx = result.indexOf(headers[i]);
+ int nextIdx = result.indexOf(headers[i + 1]);
+
+ assertContainsRanges(result.substring(idx, nextIdx), expectations[i]);
+ }
+ }
+
+ private void assertContainsRanges(String result, int... rangeCountPairs) {
+ for (int i = 0; i < rangeCountPairs.length - 1; i += 2) {
+ String expected = rangeCountPairs[i + 1] + " <= " + rangeCountPairs[i];
+ assertTrue("expected:\n" + result + "\nto contain: '" + expected + "'",
+ result.contains(expected));
+ }
+ }
+
+ @Test
+ public void testKeyValueStats() {
+ HFilePrettyPrinter.KeyValueStats stats =
+ new HFilePrettyPrinter.KeyValueStats(new MetricRegistry(), "test");
+ long[] ranges = stats.getRanges();
+ for (long range : ranges) {
+ stats.update(range - 1, true);
+ }
+
+ assertEquals(ranges[ranges.length - 1] - 1, stats.getMax());
+ assertEquals(ranges[0] - 1, stats.getMin());
+
+ int total = 1;
+ for (long range : ranges) {
+ long val = stats.getCountAtOrBelow(range);
+ assertEquals(total++, val);
+ }
+
+ }
}