Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:19:47 UTC
svn commit: r1181556 - in /hbase/branches/0.89/src:
main/java/org/apache/hadoop/hbase/
main/java/org/apache/hadoop/hbase/io/hfile/
main/java/org/apache/hadoop/hbase/regionserver/
main/java/org/apache/hadoop/hbase/regionserver/metrics/ test/java/org/apa...
Author: nspiegelberg
Date: Tue Oct 11 02:19:47 2011
New Revision: 1181556
URL: http://svn.apache.org/viewvc?rev=1181556&view=rev
Log:
Multi-level block index
Summary:
This is a version of Liyin's original implementation of multi-level block
indexes with a significant amount of refactoring, documentation, and bug fixes
to the point that it can run smoothly in HBaseTest for at least a few days. I
also added a unit test, which currently only tests one- and two-level block
indexes. There is a discussion of multi-level block indexes at
https://issues.apache.org/jira/browse/HBASE-3857, and the relevant JIRA these
changes will be part of is https://issues.apache.org/jira/browse/HBASE-3856.
Test Plan:
Unit tests. Load test (HBaseTest). Dark launch.
Reviewed By: kannan
Reviewers: liyintang, kranganathan, kannan, nspiegelberg, gqchen, aaiyer
Commenters: jgray, liyintang
CC: hbase@lists, mbautin, jgray, kenny, liyintang, kannan
Revert Plan:
The new block index implementation is capable of reading version 1 HFile block
indexes and does that as part of HFile reader v1 in D251875. However, multi-level
block indexes introduce backward-incompatible changes to the block index
format.
Differential Revision: 251949
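
As background for the lookup convention used throughout the new index (see rootBlockContainingKey and binarySearchNonRootIndex in the diff below): both the root and non-root levels return the index i such that keys[i] <= key < keys[i + 1], derived from the negative insertion point of a standard binary search. The following is a minimal, self-contained sketch of that convention; it uses String keys and Arrays.binarySearch instead of the byte[] keys and RawComparator in the patch, and the class name is illustrative only.

import java.util.Arrays;

public class RootLookupSketch {
  // Returns i such that blockKeys[i] <= key < blockKeys[i + 1],
  // or -1 if the key sorts before the first block's first key.
  static int rootBlockContainingKey(String[] blockKeys, String key) {
    int pos = Arrays.binarySearch(blockKeys, key);
    if (pos >= 0) {
      return pos;               // exact match with some block's first key
    }
    // pos = -(insertionPoint + 1); the containing block is the one just
    // before the insertion point.
    return (-pos - 1) - 1;
  }

  public static void main(String[] args) {
    String[] firstKeys = { "d", "m", "t" };  // first key of each block
    System.out.println(rootBlockContainingKey(firstKeys, "a"));  // -1: before the file
    System.out.println(rootBlockContainingKey(firstKeys, "m"));  //  1: exact match
    System.out.println(rootBlockContainingKey(firstKeys, "p"));  //  1: between "m" and "t"
    System.out.println(rootBlockContainingKey(firstKeys, "z"));  //  2: last block
  }
}
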
Added:
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFileBlockIndex.java
Modified:
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerLoad.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerLoad.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerLoad.java?rev=1181556&r1=1181555&r2=1181556&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerLoad.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerLoad.java Tue Oct 11 02:19:47 2011
@@ -63,9 +63,27 @@ public class HServerLoad implements Writ
private int storefileSizeMB;
/** the current size of the memstore for the region, in MB */
private int memstoreSizeMB;
- /** the current total size of storefile indexes for the region, in MB */
+
+ /**
+ * The current total size of root-level store file indexes for the region,
+ * in MB. The same as {@link #rootIndexSizeKB} but in MB.
+ */
private int storefileIndexSizeMB;
+ /** The current total size of root-level indexes for the region, in KB */
+ private int rootIndexSizeKB;
+
+ /**
+ * The total size of all index blocks, not just the root level, in KB.
+ */
+ private int totalStaticIndexSizeKB;
+
+ /**
+ * The total size of all Bloom filter blocks, not just the ones loaded into
+ * the block cache, in KB.
+ */
+ private int totalStaticBloomSizeKB;
+
/**
* Constructor, for Writable
*/
@@ -83,13 +101,18 @@ public class HServerLoad implements Writ
*/
public RegionLoad(final byte[] name, final int stores,
final int storefiles, final int storefileSizeMB,
- final int memstoreSizeMB, final int storefileIndexSizeMB) {
+ final int memstoreSizeMB, final int storefileIndexSizeMB,
+ final int rootIndexSizeKB, final int totalStaticIndexSizeKB,
+ final int totalStaticBloomSizeKB) {
this.name = name;
this.stores = stores;
this.storefiles = storefiles;
this.storefileSizeMB = storefileSizeMB;
this.memstoreSizeMB = memstoreSizeMB;
this.storefileIndexSizeMB = storefileIndexSizeMB;
+ this.rootIndexSizeKB = rootIndexSizeKB;
+ this.totalStaticIndexSizeKB = totalStaticIndexSizeKB;
+ this.totalStaticBloomSizeKB = totalStaticBloomSizeKB;
}
// Getters
@@ -191,6 +214,9 @@ public class HServerLoad implements Writ
this.storefileSizeMB = in.readInt();
this.memstoreSizeMB = in.readInt();
this.storefileIndexSizeMB = in.readInt();
+ this.rootIndexSizeKB = in.readInt();
+ this.totalStaticIndexSizeKB = in.readInt();
+ this.totalStaticBloomSizeKB = in.readInt();
}
public void write(DataOutput out) throws IOException {
@@ -201,6 +227,9 @@ public class HServerLoad implements Writ
out.writeInt(storefileSizeMB);
out.writeInt(memstoreSizeMB);
out.writeInt(storefileIndexSizeMB);
+ out.writeInt(rootIndexSizeKB);
+ out.writeInt(totalStaticIndexSizeKB);
+ out.writeInt(totalStaticBloomSizeKB);
}
/**
@@ -218,6 +247,12 @@ public class HServerLoad implements Writ
Integer.valueOf(this.memstoreSizeMB));
sb = Strings.appendKeyValue(sb, "storefileIndexSizeMB",
Integer.valueOf(this.storefileIndexSizeMB));
+ sb = Strings.appendKeyValue(sb, "rootIndexSizeKB",
+ Integer.valueOf(this.rootIndexSizeKB));
+ sb = Strings.appendKeyValue(sb, "totalStaticIndexSizeKB",
+ Integer.valueOf(this.totalStaticIndexSizeKB));
+ sb = Strings.appendKeyValue(sb, "totalStaticBloomSizeKB",
+ Integer.valueOf(this.totalStaticBloomSizeKB));
return sb.toString();
}
}
@@ -457,9 +492,12 @@ public class HServerLoad implements Writ
@Deprecated
public void addRegionInfo(final byte[] name, final int stores,
final int storefiles, final int storefileSizeMB,
- final int memstoreSizeMB, final int storefileIndexSizeMB) {
+ final int memstoreSizeMB, final int storefileIndexSizeMB,
+ final int rootIndexSizeKB, final int totalStaticIndexSizeKB,
+ final int totalStaticBloomSizeKB) {
this.regionLoad.add(new HServerLoad.RegionLoad(name, stores, storefiles,
- storefileSizeMB, memstoreSizeMB, storefileIndexSizeMB));
+ storefileSizeMB, memstoreSizeMB, storefileIndexSizeMB, rootIndexSizeKB,
+ totalStaticIndexSizeKB, totalStaticBloomSizeKB));
}
// Writable
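
Since RegionLoad is a Writable, the three new KB fields are appended in the same order at the end of both write(DataOutput) and readFields(DataInput), keeping an updated reader in sync with an updated writer. Below is a minimal stand-alone sketch of that append-in-order contract using plain java.io streams; the class and field names are illustrative, not the actual HBase types.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

class RegionLoadSketch {
  int memstoreSizeMB;        // pre-existing field
  int rootIndexSizeKB;       // new field, appended after the older ones

  void write(DataOutput out) throws IOException {
    out.writeInt(memstoreSizeMB);
    out.writeInt(rootIndexSizeKB);   // appended last
  }

  void readFields(DataInput in) throws IOException {
    memstoreSizeMB = in.readInt();
    rootIndexSizeKB = in.readInt();  // read back in exactly the same order
  }

  public static void main(String[] args) throws IOException {
    RegionLoadSketch written = new RegionLoadSketch();
    written.memstoreSizeMB = 64;
    written.rootIndexSizeKB = 128;

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    written.write(new DataOutputStream(bytes));

    RegionLoadSketch read = new RegionLoadSketch();
    read.readFields(new DataInputStream(
        new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(read.memstoreSizeMB + " " + read.rootIndexSizeKB); // 64 128
  }
}
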
Added: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java?rev=1181556&view=auto
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java (added)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java Tue Oct 11 02:19:47 2011
@@ -0,0 +1,1288 @@
+/*
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.hfile;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.HeapSize;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.ClassSize;
+import org.apache.hadoop.hbase.util.CompoundBloomFilterWriter;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * Provides functionality to write ({@link BlockIndexWriter}) and read
+ * ({@link BlockIndexReader}) single-level and multi-level block indexes.
+ *
+ * Examples of how to use the block index writer can be found in
+ * {@link CompoundBloomFilterWriter} and {@link HFileWriterV2}. Examples of how
+ * to use the reader can be found in {@link HFileReaderV2} and
+ * {@link TestHFileBlockIndex}.
+ */
+public class HFileBlockIndex {
+
+ private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class);
+
+ static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;
+
+ /**
+ * The maximum size guideline for index blocks (leaf, intermediate, and
+ * root). If not specified, {@link #DEFAULT_MAX_CHUNK_SIZE} is used.
+ */
+ public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";
+
+ /**
+ * The number of bytes stored in each "secondary index" entry in addition to
+ * key bytes in the non-root index block format. The first long is the file
+ * offset of the deeper-level block the entry points to, and the int that
+ * follows is that block's on-disk size without including header.
+ */
+ static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT
+ + Bytes.SIZEOF_LONG;
+
+ /**
+ * Error message when trying to use inline block API in single-level mode.
+ */
+ private static final String INLINE_BLOCKS_NOT_ALLOWED =
+ "Inline blocks are not allowed in the single-level-only mode";
+
+ /**
+ * Configuration key to cache leaf- and intermediate-level index blocks on
+ * write.
+ */
+ public static final String CACHE_INDEX_BLOCKS_ON_WRITE_KEY =
+ "hfile.block.index.cacheonwrite";
+
+ /**
+ * The size of a meta-data record used for finding the mid-key in a
+ * multi-level index. Consists of the middle leaf-level index block offset
+ * (long), its on-disk size without header included (int), and the mid-key
+ * entry's zero-based index in that leaf index block.
+ */
+ private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG +
+ 2 * Bytes.SIZEOF_INT;
+
+ /**
+ * The reader will always hold the root-level index in memory. Index
+ * blocks at all other levels will be cached in the LRU cache in practice,
+ * although this API does not enforce that.
+ *
+ * All non-root (leaf and intermediate) index blocks contain what we call a
+ * "secondary index": an array of offsets to the entries within the block.
+ * This allows us to do binary search for the entry corresponding to the
+ * given key without having to deserialize the block.
+ */
+ public static class BlockIndexReader implements HeapSize {
+ /** Needed for doing lookups on blocks. */
+ private final RawComparator<byte[]> comparator;
+
+ // Root-level data.
+ private byte[][] blockKeys;
+ private long[] blockOffsets;
+ private int[] blockDataSizes;
+ private int rootByteSize = 0;
+ private int rootCount = 0;
+
+ // Mid-key metadata.
+ private long midLeafBlockOffset = -1;
+ private int midLeafBlockOnDiskSize = -1;
+ private int midKeyEntry = -1;
+
+ /** Pre-computed mid-key */
+ private AtomicReference<byte[]> midKey = new AtomicReference<byte[]>();
+
+ /**
+ * The number of levels in the block index tree. One if there is only root
+ * level, two for root and leaf levels, etc.
+ */
+ private int searchTreeLevel;
+
+ /** A way to read {@link HFile} blocks at a given offset */
+ private HFileBlock.BasicReader blockReader;
+
+ public BlockIndexReader(final RawComparator<byte[]> c, final int treeLevel,
+ final HFileBlock.BasicReader blockReader) {
+ this(c, treeLevel);
+ this.blockReader = blockReader;
+ }
+
+ public BlockIndexReader(final RawComparator<byte[]> c, final int treeLevel)
+ {
+ comparator = c;
+ searchTreeLevel = treeLevel;
+ }
+
+ /**
+ * @return true if the block index is empty.
+ */
+ public boolean isEmpty() {
+ return blockKeys.length == 0;
+ }
+
+ /**
+ * Verifies that the block index is non-empty and throws an
+ * {@link IllegalStateException} otherwise.
+ */
+ public void ensureNonEmpty() {
+ if (blockKeys.length == 0) {
+ throw new IllegalStateException("Block index is empty or not loaded");
+ }
+ }
+
+ /**
+ * Return the data block which contains this key. This function will only
+ * be called when the HFile version is larger than 1.
+ *
+ * @param key the key we are looking for
+ * @param keyOffset the offset of the key in its byte array
+ * @param keyLength the length of the key
+ * @param currentBlock the current block, to avoid re-reading the same
+ * block
+ * @return reader a basic way to load blocks
+ * @throws IOException
+ */
+ public HFileBlock seekToDataBlock(final byte[] key, int keyOffset,
+ int keyLength, HFileBlock currentBlock)
+ throws IOException {
+ int rootLevelIndex = rootBlockContainingKey(key, keyOffset, keyLength);
+ if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
+ return null;
+ }
+
+ // Read the next-level (intermediate or leaf) index block.
+ long currentOffset = blockOffsets[rootLevelIndex];
+ int currentOnDiskSize = blockDataSizes[rootLevelIndex];
+
+ int lookupLevel = 1; // How many levels deep we are in our lookup.
+ HFileBlock block = blockReader.readBlockData(currentOffset,
+ currentOnDiskSize, -1, true);
+ if (block == null) {
+ throw new IOException("Failed to read block at offset " +
+ currentOffset + ", onDiskSize=" + currentOnDiskSize);
+ }
+ while (!block.getBlockType().equals(BlockType.DATA)) {
+ // Read the block. It may be intermediate level block, leaf level block
+ // or data block. In any case, we expect non-root index block format.
+
+ // We don't allow more than searchTreeLevel iterations of this loop.
+ if (++lookupLevel > searchTreeLevel) {
+ throw new IOException("Search Tree Level overflow: lookupLevel: "+
+ lookupLevel + " searchTreeLevel: " + searchTreeLevel);
+ }
+
+ // read to the byte buffer
+ ByteBuffer buffer = block.getBufferWithoutHeader();
+ if (!locateNonRootIndexEntry(buffer, key, keyOffset, keyLength,
+ comparator)) {
+ throw new IOException("The key "
+ + Bytes.toStringBinary(key, keyOffset, keyLength)
+ + " is before the" + " first key of the non-root index block "
+ + block);
+ }
+
+ currentOffset = buffer.getLong();
+ currentOnDiskSize = buffer.getInt();
+
+ // Located a deeper-level block, now read it.
+ if (currentBlock != null && currentBlock.getOffset() == currentOffset)
+ {
+ // Avoid reading the same block.
+ block = currentBlock;
+ } else {
+ block = blockReader.readBlockData(currentOffset, currentOnDiskSize,
+ -1, true);
+ }
+ }
+
+ if (lookupLevel != searchTreeLevel) {
+ throw new IOException("Reached a data block at level " + lookupLevel +
+ " but the number of levels is " + searchTreeLevel);
+ }
+
+ return block;
+ }
+
+ /**
+ * An approximation to the {@link HFile}'s mid-key. Operates on block
+ * boundaries, and does not go inside blocks. In other words, returns the
+ * first key of the middle block of the file.
+ *
+ * @return the first key of the middle block
+ */
+ public byte[] midkey() throws IOException {
+ if (rootCount == 0)
+ throw new IOException("HFile empty");
+
+ byte[] midKey = this.midKey.get();
+ if (midKey != null)
+ return midKey;
+
+ if (midLeafBlockOffset >= 0) {
+ if (blockReader == null) {
+ throw new IOException("Have to read the middle leaf block but " +
+ "no block reader available");
+ }
+ HFileBlock midLeafBlock = blockReader.readBlockData(
+ midLeafBlockOffset, midLeafBlockOnDiskSize, -1, true);
+ ByteBuffer b = midLeafBlock.getBufferWithoutHeader();
+ int numDataBlocks = b.getInt();
+ int keyRelOffset = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 1));
+ int keyLen = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 2)) -
+ keyRelOffset;
+ int keyOffset = b.arrayOffset() +
+ Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset +
+ SECONDARY_INDEX_ENTRY_OVERHEAD;
+ midKey = Arrays.copyOfRange(b.array(), keyOffset, keyOffset + keyLen);
+ } else {
+ // The middle of the root-level index.
+ midKey = blockKeys[(rootCount - 1) / 2];
+ }
+
+ this.midKey.set(midKey);
+ return midKey;
+ }
+
+ /**
+ * @param i from 0 to {@link #getRootBlockCount() - 1}
+ */
+ public byte[] getRootBlockKey(int i) {
+ return blockKeys[i];
+ }
+
+ /**
+ * @param i from 0 to {@link #getRootBlockCount() - 1}
+ */
+ public long getRootBlockOffset(int i) {
+ return blockOffsets[i];
+ }
+
+ /**
+ * @param i zero-based index of a root-level block
+ * @return the on-disk size of the root-level block for version 2, or the
+ * uncompressed size for version 1
+ */
+ public int getRootBlockDataSize(int i) {
+ return blockDataSizes[i];
+ }
+
+ /**
+ * @return the number of root-level blocks in this block index
+ */
+ public int getRootBlockCount() {
+ return rootCount;
+ }
+
+ /**
+ * Finds the root-level index block containing the given key.
+ *
+ * @param key
+ * Key to find
+ * @return Index of the block containing <code>key</code> (between 0 and the
+ * number of blocks - 1), or -1 if this file does not contain the
+ * requested key.
+ */
+ public int rootBlockContainingKey(final byte[] key, int offset,
+ int length) {
+ int pos = Bytes.binarySearch(blockKeys, key, offset, length,
+ comparator);
+ // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see
+ // binarySearch's javadoc.
+
+ if (pos >= 0) {
+ // This means this is an exact match with an element of blockKeys.
+ assert pos < blockKeys.length;
+ return pos;
+ }
+
+ // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
+ // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
+ // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
+ // key < blockKeys[0], meaning the file does not contain the given key.
+
+ int i = -pos - 1;
+ assert 0 <= i && i <= blockKeys.length;
+ return i - 1;
+ }
+
+ /**
+ * Adds a new entry in the root block index. Only used when reading.
+ *
+ * @param key Last key in the block
+ * @param offset file offset where the block is stored
+ * @param dataSize the uncompressed data size
+ */
+ private void add(final byte[] key, final long offset, final int dataSize) {
+ blockOffsets[rootCount] = offset;
+ blockKeys[rootCount] = key;
+ blockDataSizes[rootCount] = dataSize;
+
+ rootCount++;
+ rootByteSize += SECONDARY_INDEX_ENTRY_OVERHEAD + key.length;
+ }
+
+ /**
+ * Performs a binary search over a non-root level index block. Utilizes the
+ * secondary index, which records the offsets of (offset, onDiskSize,
+ * firstKey) tuples of all entries.
+ *
+ * @param key the key we are searching for
+ * @param keyOffset the offset of the key in its byte array
+ * @param keyLength the length of the key
+ * @param nonRootIndex the non-root index block buffer, starting with the
+ * secondary index. The position is ignored.
+ * @return the index i in [0, numEntries - 1] such that keys[i] <= key <
+ * keys[i + 1], if keys is the array of all keys being searched, or
+ * -1 otherwise
+ * @throws IOException
+ */
+ static int binarySearchNonRootIndex(byte[] key, int keyOffset,
+ int keyLength, ByteBuffer nonRootIndex,
+ RawComparator<byte[]> comparator) {
+
+ int numEntries = nonRootIndex.getInt(0);
+ int low = 0;
+ int high = numEntries - 1;
+ int mid = 0;
+
+ // Entries start after the number of entries and the secondary index.
+ // The secondary index takes numEntries + 1 ints.
+ int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
+
+ // If we imagine that keys[-1] = -Infinity and
+ // keys[numEntries] = Infinity, then we are maintaining an invariant that
+ // keys[low - 1] < key < keys[high + 1] while narrowing down the range.
+
+ while (low <= high) {
+ mid = (low + high) >>> 1;
+
+ // Midkey's offset relative to the end of secondary index
+ int midKeyRelOffset = nonRootIndex.getInt(
+ Bytes.SIZEOF_INT * (mid + 1));
+
+ // The offset of the middle key in the blockIndex buffer
+ int midKeyOffset = entriesOffset // Skip secondary index
+ + midKeyRelOffset // Skip all entries until mid
+ + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size
+
+ // We subtract the two consecutive secondary index elements, which
+ // gives us the size of the whole (offset, onDiskSize, key) tuple. We
+ // then need to subtract the overhead of offset and onDiskSize.
+ int midLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (mid + 2)) -
+ midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;
+
+ // we have to compare in this order, because the comparator order
+ // has special logic when the 'left side' is a special key.
+ int cmp = comparator.compare(key, keyOffset, keyLength,
+ nonRootIndex.array(), nonRootIndex.arrayOffset() + midKeyOffset,
+ midLength);
+
+ // key lives above the midpoint
+ if (cmp > 0)
+ low = mid + 1; // Maintain the invariant that keys[low - 1] < key
+ // key lives below the midpoint
+ else if (cmp < 0)
+ high = mid - 1; // Maintain the invariant that key < keys[high + 1]
+ else
+ return mid; // exact match
+ }
+
+ // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning
+ // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break
+ // condition, low >= high + 1. Therefore, low = high + 1.
+
+ if (low != high + 1) {
+ throw new IllegalStateException("Binary search broken: low=" + low
+ + " " + "instead of " + (high + 1));
+ }
+
+ // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to
+ // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1.
+ int i = low - 1;
+
+ // Some extra validation on the result.
+ if (i < -1 || i >= numEntries) {
+ throw new IllegalStateException("Binary search broken: result is " +
+ i + " but expected to be between -1 and (numEntries - 1) = " +
+ (numEntries - 1));
+ }
+
+ return i;
+ }
+
+ /**
+ * Search for one key using the secondary index in a non-root block. In case
+ * of success, positions the provided buffer at the entry of interest, where
+ * the file offset and the on-disk-size can be read.
+ *
+ * @param nonRootBlock a non-root block without header. Initial position
+ * does not matter.
+ * @param key the byte array containing the key
+ * @param keyOffset the offset of the key in its byte array
+ * @param keyLength the length of the key
+ * @return true if the index entry containing the given key was found,
+ * false if the given key is before the first key
+ *
+ */
+ static boolean locateNonRootIndexEntry(ByteBuffer nonRootBlock, byte[] key,
+ int keyOffset, int keyLength, RawComparator<byte[]> comparator) {
+ int entryIndex = binarySearchNonRootIndex(key, keyOffset, keyLength,
+ nonRootBlock, comparator);
+
+ if (entryIndex == -1) {
+ return false;
+ }
+
+ int numEntries = nonRootBlock.getInt(0);
+
+ // The end of secondary index and the beginning of entries themselves.
+ int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
+
+ // The offset of the entry we are interested in relative to the end of
+ // the secondary index.
+ int entryRelOffset = nonRootBlock.getInt(Bytes.SIZEOF_INT
+ * (1 + entryIndex));
+
+ nonRootBlock.position(entriesOffset + entryRelOffset);
+ return true;
+ }
+
+ /**
+ * Read in the root-level index from the given input stream. Must match
+ * what was written into the root level by
+ * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
+ * offset that function returned.
+ *
+ * @param in the buffered input stream or wrapped byte input stream
+ * @param numEntries the number of root-level index entries
+ * @throws IOException
+ */
+ public void readRootIndex(DataInput in, final int numEntries)
+ throws IOException {
+ blockOffsets = new long[numEntries];
+ blockKeys = new byte[numEntries][];
+ blockDataSizes = new int[numEntries];
+
+ // If index size is zero, no index was written.
+ if (numEntries > 0) {
+ for (int i = 0; i < numEntries; ++i) {
+ long offset = in.readLong();
+ int dataSize = in.readInt();
+ byte[] key = Bytes.readByteArray(in);
+ add(key, offset, dataSize);
+ }
+ }
+ }
+
+ /**
+ * Read the root-level metadata of a multi-level block index. Based on
+ * {@link #readRootIndex(DataInput, int)}, but also reads metadata
+ * necessary to compute the mid-key in a multi-level index.
+ *
+ * @param in the buffered or byte input stream to read from
+ * @param numEntries the number of root-level index entries
+ * @throws IOException
+ */
+ public void readMultiLevelIndexRoot(DataInputStream in,
+ final int numEntries) throws IOException {
+ readRootIndex(in, numEntries);
+ if (in.available() < MID_KEY_METADATA_SIZE) {
+ // No mid-key metadata available.
+ return;
+ }
+
+ midLeafBlockOffset = in.readLong();
+ midLeafBlockOnDiskSize = in.readInt();
+ midKeyEntry = in.readInt();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("size=" + rootCount).append("\n");
+ for (int i = 0; i < rootCount; i++) {
+ sb.append("key=").append(KeyValue.keyToString(blockKeys[i]))
+ .append("\n offset=").append(blockOffsets[i])
+ .append(", dataSize=" + blockDataSizes[i]).append("\n");
+ }
+ return sb.toString();
+ }
+
+ @Override
+ public long heapSize() {
+ long heapSize = ClassSize.align(6 * ClassSize.REFERENCE +
+ 3 * Bytes.SIZEOF_INT + ClassSize.OBJECT);
+
+ // Mid-key metadata.
+ heapSize += MID_KEY_METADATA_SIZE;
+
+ // Calculating the size of blockKeys
+ if (blockKeys != null) {
+ // Adding array + references overhead
+ heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length
+ * ClassSize.REFERENCE);
+
+ // Adding bytes
+ for (byte[] key : blockKeys) {
+ heapSize += ClassSize.align(ClassSize.ARRAY + key.length);
+ }
+ }
+
+ if (blockOffsets != null) {
+ heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length
+ * Bytes.SIZEOF_LONG);
+ }
+
+ if (blockDataSizes != null) {
+ heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length
+ * Bytes.SIZEOF_INT);
+ }
+
+ return ClassSize.align(heapSize);
+ }
+
+ }
+
+ /**
+ * Writes the block index into the output stream. Generates the tree from
+ * the bottom up. The leaf level is written to disk as a sequence of inline
+ * blocks, if it is larger than a certain number of bytes. If the leaf level
+ * is not large enough, we write all entries to the root level instead.
+ *
+ * After all leaf blocks have been written, we end up with an index
+ * referencing the resulting leaf index blocks. If that index is larger than
+ * the allowed root index size, the writer will break it up into
+ * reasonable-size intermediate-level index block chunks, write those chunks
+ * out, and create another index referencing those chunks. This will be
+ * repeated until the remaining index is small enough to become the root
+ * index. However, in most practical cases we will only have leaf-level
+ * blocks and the root index, or just the root index.
+ */
+ public static class BlockIndexWriter implements InlineBlockWriter {
+ /**
+ * While the index is being written, this represents the current block
+ * index referencing all leaf blocks, with one exception. If the file is
+ * being closed and there are not enough blocks to complete even a single
+ * leaf block, no leaf blocks get written and this contains the entire
+ * block index. After all levels of the index were written by
+ * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final
+ * root-level index.
+ */
+ private BlockIndexChunk rootChunk = new BlockIndexChunk();
+
+ /**
+ * Current leaf-level chunk. New entries referencing data blocks get added
+ * to this chunk until it grows large enough to be written to disk.
+ */
+ private BlockIndexChunk curInlineChunk = new BlockIndexChunk();
+
+ /**
+ * The number of block index levels. This is one if there is only root
+ * level (even empty), two if there is a leaf level and a root level, and is
+ * higher if there are intermediate levels. This is only final after
+ * {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The
+ * initial value accounts for the root level, and will be increased to two
+ * as soon as we find out there is a leaf-level in
+ * {@link #blockWritten(long, int)}.
+ */
+ private int numLevels = 1;
+
+ private HFileBlock.Writer blockWriter;
+ private byte[] firstKey = null;
+
+ /**
+ * The total number of leaf-level entries, i.e. entries referenced by
+ * leaf-level blocks. For the data block index this is equal to the number
+ * of data blocks.
+ */
+ private long totalNumEntries;
+
+ /** Total compressed size of all index blocks. */
+ private long totalBlockOnDiskSize;
+
+ /** Total uncompressed size of all index blocks. */
+ private long totalBlockUncompressedSize;
+
+ /** The maximum size guideline of all multi-level index blocks. */
+ private int maxChunkSize;
+
+ /** Whether we require this block index to always be single-level. */
+ private boolean singleLevelOnly;
+
+ /** Block cache, or null if cache-on-write is disabled */
+ private BlockCache blockCache;
+
+ /** Name to use for computing cache keys */
+ private String nameForCaching;
+
+ /** Creates a single-level block index writer */
+ public BlockIndexWriter() {
+ this(null, null, null);
+ singleLevelOnly = true;
+ }
+
+ /**
+ * Creates a multi-level block index writer.
+ *
+ * @param blockWriter the block writer to use to write index blocks
+ */
+ public BlockIndexWriter(HFileBlock.Writer blockWriter,
+ BlockCache blockCache, String nameForCaching) {
+ if ((blockCache == null) != (nameForCaching == null)) {
+ throw new IllegalArgumentException("Block cache and file name for " +
+ "caching must be both specified or both null");
+ }
+
+ this.blockWriter = blockWriter;
+ this.blockCache = blockCache;
+ this.nameForCaching = nameForCaching;
+ this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
+ }
+
+ public void setMaxChunkSize(int maxChunkSize) {
+ if (maxChunkSize <= 0) {
+ throw new IllegalArgumentException("Invalid maximum index block size");
+ }
+ this.maxChunkSize = maxChunkSize;
+ }
+
+ /**
+ * Writes the root level and intermediate levels of the block index into
+ * the output stream, generating the tree from bottom up. Assumes that the
+ * leaf level has been inline-written to the disk if there is enough data
+ * for more than one leaf block. We iterate by breaking the current level
+ * of the block index, starting with the index of all leaf-level blocks,
+ * into chunks small enough to be written to disk, and generate its parent
+ * level, until we end up with a level small enough to become the root
+ * level.
+ *
+ * If the leaf level is not large enough, there is no inline block index
+ * anymore, so we only write that level of block index to disk as the root
+ * level.
+ *
+ * @param out FSDataOutputStream
+ * @return position at which we entered the root-level index.
+ * @throws IOException
+ */
+ public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
+ if (curInlineChunk.getNumEntries() != 0) {
+ throw new IOException("Trying to write a multi-level block index, " +
+ "but there are " + curInlineChunk.getNumEntries() + " entries in the " +
+ "last inline chunk.");
+ }
+
+ // We need to get mid-key metadata before we create intermediate
+ // indexes and overwrite the root chunk.
+ byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata()
+ : null;
+
+ while (rootChunk.getRootSize() > maxChunkSize) {
+ rootChunk = writeIntermediateLevel(out, rootChunk);
+ numLevels += 1;
+ }
+
+ // write the root level
+ long rootLevelIndexPos = out.getPos();
+
+ {
+ DataOutput blockStream = blockWriter.startWriting(BlockType.ROOT_INDEX,
+ false);
+ rootChunk.writeRoot(blockStream);
+ if (midKeyMetadata != null)
+ blockStream.write(midKeyMetadata);
+ blockWriter.writeHeaderAndData(out);
+ }
+
+ // Add root index block size
+ totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
+ totalBlockUncompressedSize +=
+ blockWriter.getUncompressedSizeWithoutHeader();
+
+ LOG.info("Wrote a " + numLevels + "-level index with root level at pos "
+ + out.getPos() + ", " + rootChunk.getNumEntries()
+ + " root-level entries, " + totalNumEntries + " total entries, "
+ + totalBlockOnDiskSize + " bytes total on-disk size, "
+ + totalBlockUncompressedSize + " bytes total uncompressed size.");
+
+ return rootLevelIndexPos;
+ }
+
+ /**
+ * Writes the block index data as a single level only. Does not do any
+ * block framing.
+ *
+ * @param out the buffered output stream to write the index to. Typically a
+ * stream writing into an {@link HFile} block.
+ * @param description a short description of the index being written. Used
+ * in a log message.
+ * @throws IOException
+ */
+ public void writeSingleLevelIndex(DataOutput out, String description)
+ throws IOException {
+ expectNumLevels(1);
+
+ if (!singleLevelOnly)
+ throw new IOException("Single-level mode is turned off");
+
+ if (rootChunk.getNumEntries() > 0)
+ throw new IOException("Root-level entries already added in " +
+ "single-level mode");
+
+ rootChunk = curInlineChunk;
+ curInlineChunk = new BlockIndexChunk();
+
+ LOG.info("Wrote a single-level " + description + " index with "
+ + rootChunk.getNumEntries() + " entries, " + rootChunk.getRootSize()
+ + " bytes");
+ rootChunk.writeRoot(out);
+ }
+
+ /**
+ * Split the current level of the block index into intermediate index
+ * blocks of permitted size and write those blocks to disk. Return the next
+ * level of the block index referencing those intermediate-level blocks.
+ *
+ * @param out
+ * @param currentLevel the current level of the block index, such as the
+ * chunk referencing all leaf-level index blocks
+ * @return the parent level block index, which becomes the root index after
+ * a few (usually zero) iterations
+ * @throws IOException
+ */
+ private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
+ BlockIndexChunk currentLevel) throws IOException {
+ // Entries referencing intermediate-level blocks we are about to create.
+ BlockIndexChunk parent = new BlockIndexChunk();
+
+ // The current intermediate-level block index chunk.
+ BlockIndexChunk curChunk = new BlockIndexChunk();
+
+ for (int i = 0; i < currentLevel.getNumEntries(); ++i) {
+ curChunk.add(currentLevel.getBlockKey(i),
+ currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i));
+
+ if (curChunk.getRootSize() >= maxChunkSize)
+ writeIntermediateBlock(out, parent, curChunk);
+ }
+
+ if (curChunk.getNumEntries() > 0) {
+ writeIntermediateBlock(out, parent, curChunk);
+ }
+
+ return parent;
+ }
+
+ private void writeIntermediateBlock(FSDataOutputStream out,
+ BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException {
+ long beginOffset = out.getPos();
+ DataOutputStream dos = blockWriter.startWriting(
+ BlockType.INTERMEDIATE_INDEX, cacheOnWrite());
+ curChunk.writeNonRoot(dos);
+ byte[] curFirstKey = curChunk.getBlockKey(0);
+ blockWriter.writeHeaderAndData(out);
+
+ if (blockCache != null) {
+ blockCache.cacheBlock(HFile.getBlockCacheKey(nameForCaching,
+ beginOffset), blockWriter.getBlockForCaching());
+ }
+
+ // Add intermediate index block size
+ totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
+ totalBlockUncompressedSize +=
+ blockWriter.getUncompressedSizeWithoutHeader();
+
+ // OFFSET is the beginning offset of the chunk of block index entries.
+ // SIZE is the total byte size of the chunk of block index entries
+ // + the secondary index size
+ // FIRST_KEY is the first key in the chunk of block index
+ // entries.
+ parent.add(curFirstKey, beginOffset,
+ blockWriter.getOnDiskSizeWithHeader());
+
+ // clear current block index chunk
+ curChunk.clear();
+ curFirstKey = null;
+ }
+
+ /**
+ * @return how many block index entries there are in the root level
+ */
+ public final int getNumRootEntries() {
+ return rootChunk.getNumEntries();
+ }
+
+ /**
+ * @return the number of levels in this block index.
+ */
+ public int getNumLevels() {
+ return numLevels;
+ }
+
+ private void expectNumLevels(int expectedNumLevels) {
+ if (numLevels != expectedNumLevels) {
+ throw new IllegalStateException("Number of block index levels is "
+ + numLevels + " but is expected to be " + expectedNumLevels);
+ }
+ }
+
+ /**
+ * Whether there is an inline block ready to be written. In general, we
+ * write a leaf-level index block as an inline block as soon as its size
+ * as serialized in the non-root format reaches a certain threshold.
+ */
+ @Override
+ public boolean shouldWriteBlock(boolean closing) {
+ if (singleLevelOnly)
+ throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
+
+ if (curInlineChunk.getNumEntries() == 0)
+ return false;
+
+ // We do have some entries in the current inline chunk.
+ if (closing) {
+ if (rootChunk.getNumEntries() == 0) {
+ // We did not add any leaf-level blocks yet. Instead of creating a
+ // leaf level with one block, move these entries to the root level.
+
+ expectNumLevels(1);
+ rootChunk = curInlineChunk;
+ curInlineChunk = new BlockIndexChunk();
+ return false;
+ }
+
+ return true;
+ } else {
+ return curInlineChunk.getNonRootSize() >= maxChunkSize;
+ }
+ }
+
+ /**
+ * Write out the current inline index block. Inline blocks are non-root
+ * blocks, so the non-root index format is used.
+ *
+ * @param out the output stream to write the inline block index to
+ */
+ @Override
+ public void writeInlineBlock(DataOutput out) throws IOException {
+ if (singleLevelOnly)
+ throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
+
+ // Write the inline block index to the output stream in the non-root
+ // index block format.
+ curInlineChunk.writeNonRoot(out);
+
+ // Save the first key of the inline block so that we can add it to the
+ // parent-level index.
+ firstKey = curInlineChunk.getBlockKey(0);
+
+ // Start a new inline index block
+ curInlineChunk.clear();
+ }
+
+ /**
+ * Called after an inline block has been written so that we can add an
+ * entry referring to that block to the parent-level index.
+ */
+ @Override
+ public void blockWritten(long offset, int onDiskSize, int uncompressedSize)
+ {
+ // Add leaf index block size
+ totalBlockOnDiskSize += onDiskSize;
+ totalBlockUncompressedSize += uncompressedSize;
+
+ if (singleLevelOnly)
+ throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
+
+ if (firstKey == null) {
+ throw new IllegalStateException("Trying to add second-level index " +
+ "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
+ " but the first key was not set in writeInlineBlock");
+ }
+
+ if (rootChunk.getNumEntries() == 0) {
+ // We are writing the first leaf block, so increase index level.
+ expectNumLevels(1);
+ numLevels = 2;
+ }
+
+ // Add another entry to the second-level index. Include the number of
+ // entries in all previous leaf-level chunks for mid-key calculation.
+ rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
+ firstKey = null;
+ }
+
+ @Override
+ public BlockType getInlineBlockType() {
+ return BlockType.LEAF_INDEX;
+ }
+
+ /**
+ * Add one index entry to the current leaf-level block. When the leaf-level
+ * block gets large enough, it will be flushed to disk as an inline block.
+ *
+ * @param firstKey the first key of the data block
+ * @param blockOffset the offset of the data block
+ * @param blockDataSize the on-disk size of the data block ({@link HFile}
+ * format version 2), or the uncompressed size of the data block (
+ * {@link HFile} format version 1).
+ */
+ public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize)
+ {
+ curInlineChunk.add(firstKey, blockOffset, blockDataSize);
+ ++totalNumEntries;
+ }
+
+ /**
+ * @throws IOException if we happened to write a multi-level index.
+ */
+ public void ensureSingeLevel() throws IOException {
+ if (numLevels > 1) {
+ throw new IOException ("Wrote a " + numLevels + "-level index with " +
+ rootChunk.getNumEntries() + " root-level entries, but " +
+ "this is expected to be a single-level block index.");
+ }
+ }
+
+ @Override
+ public boolean cacheOnWrite() {
+ return blockCache != null;
+ }
+
+ /**
+ * The total uncompressed size of the root index block, intermediate-level
+ * index blocks, and leaf-level index blocks.
+ *
+ * @return the total uncompressed size of all index blocks
+ */
+ public long getTotalUncompressedSize() {
+ return totalBlockUncompressedSize;
+ }
+
+ }
+
+ /**
+ * A single chunk of the block index in the process of writing. The data in
+ * this chunk can become a leaf-level, intermediate-level, or root index
+ * block.
+ */
+ static class BlockIndexChunk {
+
+ /** First keys of the key range corresponding to each index entry. */
+ private List<byte[]> blockKeys = new ArrayList<byte[]>();
+
+ /** Block offset in backing stream. */
+ private List<Long> blockOffsets = new ArrayList<Long>();
+
+ /** On-disk data sizes of lower-level data or index blocks. */
+ private List<Integer> onDiskDataSizes = new ArrayList<Integer>();
+
+ /**
+ * The cumulative number of sub-entries, i.e. entries in the deeper-level
+ * blocks referenced by this chunk's entries. numSubEntriesAt[i] is the
+ * number of sub-entries in the blocks corresponding to this chunk's
+ * entries #0 through #i inclusively.
+ */
+ private List<Long> numSubEntriesAt = new ArrayList<Long>();
+
+ /**
+ * The offset of the next entry to be added, relative to the end of the
+ * "secondary index" in the "non-root" format representation of this index
+ * chunk. This is the next value to be added to the secondary index.
+ */
+ private int curTotalNonRootEntrySize = 0;
+
+ /**
+ * The accumulated size of this chunk if stored in the root index format.
+ */
+ private int curTotalRootSize = 0;
+
+ /**
+ * The "secondary index" used for binary search over variable-length
+ * records in a "non-root" format block. These offsets are relative to the
+ * end of this secondary index.
+ */
+ private List<Integer> secondaryIndexOffsetMarks = new ArrayList<Integer>();
+
+ /**
+ * Adds a new entry to this block index chunk.
+ *
+ * @param firstKey the first key in the block pointed to by this entry
+ * @param blockOffset the offset of the next-level block pointed to by this
+ * entry
+ * @param onDiskDataSize the on-disk data size of the block pointed to by
+ * this entry, including the header size
+ * @param curTotalNumSubEntries if this chunk is the root index chunk under
+ * construction, this specifies the current total number of
+ * sub-entries in all leaf-level chunks, including the one
+ * corresponding to the second-level entry being added.
+ */
+ void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
+ long curTotalNumSubEntries) {
+ // Record the offset for the secondary index
+ secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
+ curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
+ + firstKey.length;
+
+ curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
+ + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;
+
+ blockKeys.add(firstKey);
+ blockOffsets.add(blockOffset);
+ onDiskDataSizes.add(onDiskDataSize);
+
+ if (curTotalNumSubEntries != -1) {
+ numSubEntriesAt.add(curTotalNumSubEntries);
+
+ // Make sure the parallel arrays are in sync.
+ if (numSubEntriesAt.size() != blockKeys.size()) {
+ throw new IllegalStateException("Only have key/value count " +
+ "stats for " + numSubEntriesAt.size() + " block index " +
+ "entries out of " + blockKeys.size());
+ }
+ }
+ }
+
+ /**
+ * The same as {@link #add(byte[], long, int, long)} but does not take the
+ * key/value into account. Used for single-level indexes.
+ *
+ * @see {@link #add(byte[], long, int, long)}
+ */
+ public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
+ add(firstKey, blockOffset, onDiskDataSize, -1);
+ }
+
+ public void clear() {
+ blockKeys.clear();
+ blockOffsets.clear();
+ onDiskDataSizes.clear();
+ secondaryIndexOffsetMarks.clear();
+ numSubEntriesAt.clear();
+ curTotalNonRootEntrySize = 0;
+ curTotalRootSize = 0;
+ }
+
+ /**
+ * Finds the entry corresponding to the deeper-level index block containing
+ * the given deeper-level entry (a "sub-entry"), assuming a global 0-based
+ * ordering of sub-entries.
+ *
+ * <p>
+ * <i> Implementation note. </i> We are looking for i such that
+ * numSubEntriesAt[i - 1] <= k < numSubEntriesAt[i], because a deeper-level
+ * block #i (0-based) contains sub-entries # numSubEntriesAt[i - 1]'th
+ * through numSubEntriesAt[i] - 1, assuming a global 0-based ordering of
+ * sub-entries. i is by definition the insertion point of k in
+ * numSubEntriesAt.
+ *
+ * @param k sub-entry index, from 0 to the total number of sub-entries - 1
+ * @return the 0-based index of the entry corresponding to the given
+ * sub-entry
+ */
+ public int getEntryBySubEntry(long k) {
+ // We define mid-key as the key corresponding to k'th sub-entry
+ // (0-based).
+
+ int i = Collections.binarySearch(numSubEntriesAt, k);
+
+ // Exact match: numSubEntriesAt[i] = k. This means chunks #0 through
+ // #i contain exactly k sub-entries, and the sub-entry #k (0-based)
+ // is in the (i + 1)'th chunk.
+ if (i >= 0)
+ return i + 1;
+
+ // Inexact match. Return the insertion point.
+ return -i - 1;
+ }
+
+ /**
+ * Used when writing the root block index of a multi-level block index.
+ * Serializes additional information that allows us to efficiently identify
+ * the mid-key.
+ *
+ * @return a few serialized fields for finding the mid-key
+ * @throws IOException if metadata for computing the mid-key could not be created
+ */
+ public byte[] getMidKeyMetadata() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(
+ MID_KEY_METADATA_SIZE);
+ DataOutputStream baosDos = new DataOutputStream(baos);
+ long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1);
+ if (totalNumSubEntries == 0) {
+ throw new IOException("No leaf-level entries, mid-key unavailable");
+ }
+ long midKeySubEntry = (totalNumSubEntries - 1) / 2;
+ int midKeyEntry = getEntryBySubEntry(midKeySubEntry);
+
+ baosDos.writeLong(blockOffsets.get(midKeyEntry));
+ baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));
+
+ long numSubEntriesBefore = midKeyEntry > 0
+ ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
+ long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
+ if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE)
+ {
+ throw new IOException("Could not identify mid-key index within the "
+ + "leaf-level block containing mid-key: out of range ("
+ + subEntryWithinEntry + ", numSubEntriesBefore="
+ + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
+ + ")");
+ }
+
+ baosDos.writeInt((int) subEntryWithinEntry);
+
+ if (baosDos.size() != MID_KEY_METADATA_SIZE) {
+ throw new IOException("Could not write mid-key metadata: size=" +
+ baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE);
+ }
+
+ return baos.toByteArray();
+ }
+
+ /**
+ * Writes the block index chunk in the non-root index block format. This
+ * format contains the number of entries, an index of integer offsets
+ * for quick binary search on variable-length records, and tuples of
+ * block offset, on-disk block size, and the first key for each entry.
+ *
+ * @param out
+ * @throws IOException
+ */
+ void writeNonRoot(DataOutput out) throws IOException {
+ // The number of entries in the block.
+ out.writeInt(blockKeys.size());
+
+ if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
+ throw new IOException("Corrupted block index chunk writer: " +
+ blockKeys.size() + " entries but " +
+ secondaryIndexOffsetMarks.size() + " secondary index items");
+ }
+
+ // For each entry, write a "secondary index" of relative offsets to the
+ // entries from the end of the secondary index. This works, because at
+ // read time we read the number of entries and know where the secondary
+ // index ends.
+ for (int currentSecondaryIndex : secondaryIndexOffsetMarks)
+ out.writeInt(currentSecondaryIndex);
+
+ // We include one other element in the secondary index to calculate the
+ // size of each entry more easily by subtracting secondary index elements.
+ out.writeInt(curTotalNonRootEntrySize);
+
+ for (int i = 0; i < blockKeys.size(); ++i) {
+ out.writeLong(blockOffsets.get(i));
+ out.writeInt(onDiskDataSizes.get(i));
+ out.write(blockKeys.get(i));
+ }
+ }
+
+ /**
+ * @return the size of this chunk if stored in the non-root index block
+ * format
+ */
+ int getNonRootSize() {
+ return Bytes.SIZEOF_INT // Number of entries
+ + Bytes.SIZEOF_INT * (blockKeys.size() + 1) // Secondary index
+ + curTotalNonRootEntrySize; // All entries
+ }
+
+ /**
+ * Writes this chunk into the given output stream in the root block index
+ * format. This format is similar to the {@link HFile} version 1 block
+ * index format, except that we store on-disk size of the block instead of
+ * its uncompressed size.
+ *
+ * @param out the data output stream to write the block index to. Typically
+ * a stream writing into an {@link HFile} block.
+ * @throws IOException
+ */
+ void writeRoot(DataOutput out) throws IOException {
+ for (int i = 0; i < blockKeys.size(); ++i) {
+ out.writeLong(blockOffsets.get(i));
+ out.writeInt(onDiskDataSizes.get(i));
+ Bytes.writeByteArray(out, blockKeys.get(i));
+ }
+ }
+
+ /**
+ * @return the size of this chunk if stored in the root index block format
+ */
+ int getRootSize() {
+ return curTotalRootSize;
+ }
+
+ /**
+ * @return the number of entries in this block index chunk
+ */
+ public int getNumEntries() {
+ return blockKeys.size();
+ }
+
+ public byte[] getBlockKey(int i) {
+ return blockKeys.get(i);
+ }
+
+ public long getBlockOffset(int i) {
+ return blockOffsets.get(i);
+ }
+
+ public int getOnDiskDataSize(int i) {
+ return onDiskDataSizes.get(i);
+ }
+
+ public long getCumulativeNumKV(int i) {
+ if (i < 0)
+ return 0;
+ return numSubEntriesAt.get(i);
+ }
+
+ }
+
+ /**
+ * @return true if the given configuration specifies that we should
+ * cache-on-write index blocks
+ */
+ public static boolean shouldCacheOnWrite(Configuration conf) {
+ return conf.getBoolean(CACHE_INDEX_BLOCKS_ON_WRITE_KEY, false);
+ }
+
+ public static int getMaxChunkSize(Configuration conf) {
+ return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
+ }
+
+}
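
To make the non-root (leaf/intermediate) index block layout concrete: writeNonRoot above emits the entry count, then numEntries + 1 secondary-index ints (each entry's offset relative to the end of the secondary index, plus one final int holding the total size of all entries), then the (long offset, int onDiskSize, key bytes) tuples themselves. The sketch below serializes two entries in that layout and then recovers the second key with the same offset arithmetic used by binarySearchNonRootIndex and locateNonRootIndexEntry; it hardcodes a lookup of entry #1 instead of running the comparator-driven binary search, and all class and variable names are illustrative rather than HBase types.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class NonRootIndexLayoutSketch {
  public static void main(String[] args) throws IOException {
    long[] blockOffsets = { 0L, 4096L };           // file offsets of deeper-level blocks
    int[] onDiskSizes = { 4096, 4096 };            // their on-disk sizes
    byte[][] firstKeys = {
        "apple".getBytes(StandardCharsets.UTF_8),
        "mango".getBytes(StandardCharsets.UTF_8) };

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);

    out.writeInt(firstKeys.length);                // 1. number of entries

    int relOffset = 0;                             // 2. secondary index: numEntries + 1 ints
    for (byte[] key : firstKeys) {
      out.writeInt(relOffset);                     //    entry offset from end of secondary index
      relOffset += 8 + 4 + key.length;             //    SECONDARY_INDEX_ENTRY_OVERHEAD + key
    }
    out.writeInt(relOffset);                       //    total size of all entries

    for (int i = 0; i < firstKeys.length; i++) {   // 3. the entries themselves
      out.writeLong(blockOffsets[i]);
      out.writeInt(onDiskSizes[i]);
      out.write(firstKeys[i]);
    }

    // Read side: locate entry #1 through the secondary index, mirroring the
    // arithmetic in binarySearchNonRootIndex.
    ByteBuffer b = ByteBuffer.wrap(baos.toByteArray());
    int numEntries = b.getInt(0);
    int entriesOffset = 4 * (numEntries + 2);              // skip count + secondary index
    int entryRelOffset = b.getInt(4 * (1 + 1));            // secondary index slot for entry #1
    int keyOffset = entriesOffset + entryRelOffset + 12;   // skip the (long, int) prefix
    int keyLength = b.getInt(4 * (1 + 2)) - entryRelOffset - 12;
    byte[] key = new byte[keyLength];
    b.position(keyOffset);
    b.get(key);
    System.out.println(new String(key, StandardCharsets.UTF_8)); // prints "mango"
  }
}
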
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1181556&r1=1181555&r2=1181556&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Oct 11 02:19:47 2011
@@ -770,18 +770,33 @@ public class HRegionServer implements HR
int storefileSizeMB = 0;
int memstoreSizeMB = (int)(r.memstoreSize.get()/1024/1024);
int storefileIndexSizeMB = 0;
+ int rootIndexSizeKB = 0;
+ int totalStaticIndexSizeKB = 0;
+ int totalStaticBloomSizeKB = 0;
synchronized (r.stores) {
stores += r.stores.size();
for (Store store: r.stores.values()) {
storefiles += store.getStorefilesCount();
+
storefileSizeMB +=
(int)(store.getStorefilesSize()/1024/1024);
+
storefileIndexSizeMB +=
(int)(store.getStorefilesIndexSize()/1024/1024);
+
+ rootIndexSizeKB +=
+ (int) (store.getStorefilesIndexSize() / 1024);
+
+ totalStaticIndexSizeKB +=
+ (int) (store.getTotalStaticIndexSize() / 1024);
+
+ totalStaticBloomSizeKB +=
+ (int) (store.getTotalStaticBloomSize() / 1024);
}
}
return new HServerLoad.RegionLoad(name, stores, storefiles,
- storefileSizeMB, memstoreSizeMB, storefileIndexSizeMB);
+ storefileSizeMB, memstoreSizeMB, storefileIndexSizeMB, rootIndexSizeKB,
+ totalStaticIndexSizeKB, totalStaticBloomSizeKB);
}
/**
@@ -987,6 +1002,8 @@ public class HRegionServer implements HR
int storefiles = 0;
long memstoreSize = 0;
long storefileIndexSize = 0;
+ long totalStaticIndexSize = 0;
+ long totalStaticBloomSize = 0;
synchronized (this.onlineRegions) {
for (Map.Entry<Integer, HRegion> e: this.onlineRegions.entrySet()) {
HRegion r = e.getValue();
@@ -997,6 +1014,8 @@ public class HRegionServer implements HR
Store store = ee.getValue();
storefiles += store.getStorefilesCount();
storefileIndexSize += store.getStorefilesIndexSize();
+ totalStaticIndexSize += store.getTotalStaticIndexSize();
+ totalStaticBloomSize += store.getTotalStaticBloomSize();
}
}
}
@@ -1004,7 +1023,14 @@ public class HRegionServer implements HR
this.metrics.stores.set(stores);
this.metrics.storefiles.set(storefiles);
this.metrics.memstoreSizeMB.set((int)(memstoreSize/(1024*1024)));
- this.metrics.storefileIndexSizeMB.set((int)(storefileIndexSize/(1024*1024)));
+ this.metrics.storefileIndexSizeMB.set(
+ (int) (storefileIndexSize / (1024 * 1024)));
+ this.metrics.rootIndexSizeKB.set(
+ (int) (storefileIndexSize / 1024));
+ this.metrics.totalStaticIndexSizeKB.set(
+ (int) (totalStaticIndexSize / 1024));
+ this.metrics.totalStaticBloomSizeKB.set(
+ (int) (totalStaticBloomSize / 1024));
this.metrics.compactionQueueSize.set(compactSplitThread.
getCompactionQueueSize());
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java?rev=1181556&r1=1181555&r2=1181556&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java Tue Oct 11 02:19:47 2011
@@ -109,11 +109,25 @@ public class RegionServerMetrics impleme
public final MetricsIntValue storefiles = new MetricsIntValue("storefiles", registry);
/**
- * Sum of all the storefile index sizes in this regionserver in MB
+ * Sum of all the storefile index sizes in this regionserver in MB. This is
+ * a legacy metric to be phased out as we fully transition to multi-level
+ * block indexes.
*/
public final MetricsIntValue storefileIndexSizeMB =
new MetricsIntValue("storefileIndexSizeMB", registry);
+ /** The total size of block index root levels in this regionserver in KB. */
+ public final MetricsIntValue rootIndexSizeKB =
+ new MetricsIntValue("rootIndexSizeKB", registry);
+
+ /** Total size of all block indexes (not necessarily loaded in memory) */
+ public final MetricsIntValue totalStaticIndexSizeKB =
+ new MetricsIntValue("totalStaticIndexSizeKB", registry);
+
+ /** Total size of all Bloom filters (not necessarily loaded in memory) */
+ public final MetricsIntValue totalStaticBloomSizeKB =
+ new MetricsIntValue("totalStaticBloomSizeKB", registry);
+
/**
* Sum of all the memstore sizes in this regionserver in MB
*/
@@ -241,6 +255,9 @@ public class RegionServerMetrics impleme
this.stores.pushMetric(this.metricsRecord);
this.storefiles.pushMetric(this.metricsRecord);
this.storefileIndexSizeMB.pushMetric(this.metricsRecord);
+ this.rootIndexSizeKB.pushMetric(this.metricsRecord);
+ this.totalStaticIndexSizeKB.pushMetric(this.metricsRecord);
+ this.totalStaticBloomSizeKB.pushMetric(this.metricsRecord);
this.memstoreSizeMB.pushMetric(this.metricsRecord);
this.regions.pushMetric(this.metricsRecord);
this.requests.pushMetric(this.metricsRecord);
@@ -371,6 +388,12 @@ public class RegionServerMetrics impleme
Integer.valueOf(this.storefiles.get()));
sb = Strings.appendKeyValue(sb, "storefileIndexSize",
Integer.valueOf(this.storefileIndexSizeMB.get()));
+ sb = Strings.appendKeyValue(sb, "rootIndexSizeKB",
+ Integer.valueOf(this.rootIndexSizeKB.get()));
+ sb = Strings.appendKeyValue(sb, "totalStaticIndexSizeKB",
+ Integer.valueOf(this.totalStaticIndexSizeKB.get()));
+ sb = Strings.appendKeyValue(sb, "totalStaticBloomSizeKB",
+ Integer.valueOf(this.totalStaticBloomSizeKB.get()));
sb = Strings.appendKeyValue(sb, "memstoreSize",
Integer.valueOf(this.memstoreSizeMB.get()));
sb = Strings.appendKeyValue(sb, "compactionQueueSize",
Added: hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFileBlockIndex.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFileBlockIndex.java?rev=1181556&view=auto
==============================================================================
--- hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFileBlockIndex.java (added)
+++ hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFileBlockIndex.java Tue Oct 11 02:19:47 2011
@@ -0,0 +1,603 @@
+/*
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.io.hfile;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.BlockIndexReader;
+import org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.BlockIndexChunk;
+import org.apache.hadoop.hbase.regionserver.StoreFile;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.ClassSize;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import static org.junit.Assert.*;
+
+@RunWith(Parameterized.class)
+public class TestHFileBlockIndex {
+
+ @Parameters
+ public static Collection<Object[]> compressionAlgorithms() {
+ return HBaseTestingUtility.COMPRESSION_ALGORITHMS_PARAMETERIZED;
+ }
+
+ public TestHFileBlockIndex(Compression.Algorithm compr) {
+ this.compr = compr;
+ }
+
+ private static final Log LOG = LogFactory.getLog(TestHFileBlockIndex.class);
+
+ private static final int NUM_DATA_BLOCKS = 1000;
+ private static final HBaseTestingUtility TEST_UTIL =
+ new HBaseTestingUtility();
+
+ private static final int SMALL_BLOCK_SIZE = 4096;
+ private static final int NUM_KV = 10000;
+
+ private static FileSystem fs;
+ private Path path;
+ private Random rand;
+ private long rootIndexOffset;
+ private int numRootEntries;
+ private int numLevels;
+ private static final List<byte[]> keys = new ArrayList<byte[]>();
+ private final Compression.Algorithm compr;
+ private byte[] firstKeyInFile;
+ private Configuration conf;
+
+ private static final int[] INDEX_CHUNK_SIZES = { 4096, 512, 384 };
+ private static final int[] EXPECTED_NUM_LEVELS = { 2, 3, 4 };
+ private static final int[] UNCOMPRESSED_INDEX_SIZES =
+ { 19187, 21813, 23086 };
+
+ static {
+ assert INDEX_CHUNK_SIZES.length == EXPECTED_NUM_LEVELS.length;
+ assert INDEX_CHUNK_SIZES.length == UNCOMPRESSED_INDEX_SIZES.length;
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ keys.clear();
+ rand = new Random(2389757);
+ firstKeyInFile = null;
+ conf = TEST_UTIL.getConfiguration();
+
+ // This test requires at least HFile format version 2.
+ conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);
+
+ fs = FileSystem.get(conf);
+ }
+
+ @Test
+ public void testBlockIndex() throws IOException {
+ path = new Path(HBaseTestingUtility.getTestDir(), "block_index_" + compr);
+ writeWholeIndex();
+ readIndex();
+ }
+
+ /**
+ * A wrapper around a block reader that caches only the result of the most
+ * recent read. Not thread-safe.
+ */
+ private static class BlockReaderWrapper implements HFileBlock.BasicReader {
+
+ private HFileBlock.BasicReader realReader;
+ private long prevOffset;
+ private long prevOnDiskSize;
+ private long prevUncompressedSize;
+ private boolean prevPread;
+ private HFileBlock prevBlock;
+
+ public int hitCount = 0;
+ public int missCount = 0;
+
+ public BlockReaderWrapper(HFileBlock.BasicReader realReader) {
+ this.realReader = realReader;
+ }
+
+ @Override
+ public HFileBlock readBlockData(long offset, long onDiskSize,
+ int uncompressedSize, boolean pread) throws IOException {
+ if (offset == prevOffset && onDiskSize == prevOnDiskSize &&
+ uncompressedSize == prevUncompressedSize && pread == prevPread) {
+ hitCount += 1;
+ return prevBlock;
+ }
+
+ missCount += 1;
+ prevBlock = realReader.readBlockData(offset, onDiskSize,
+ uncompressedSize, pread);
+ prevOffset = offset;
+ prevOnDiskSize = onDiskSize;
+ prevUncompressedSize = uncompressedSize;
+ prevPread = pread;
+
+ return prevBlock;
+ }
+ }
+
+ public void readIndex() throws IOException {
+ long fileSize = fs.getFileStatus(path).getLen();
+ LOG.info("Size of " + path + ": " + fileSize);
+
+ FSDataInputStream istream = fs.open(path);
+ HFileBlock.FSReader blockReader = new HFileBlock.FSReaderV2(istream,
+ compr, fs.getFileStatus(path).getLen());
+
+ BlockReaderWrapper brw = new BlockReaderWrapper(blockReader);
+ HFileBlockIndex.BlockIndexReader indexReader =
+ new HFileBlockIndex.BlockIndexReader(
+ Bytes.BYTES_RAWCOMPARATOR, numLevels, brw);
+
+ indexReader.readRootIndex(blockReader.blockRange(rootIndexOffset,
+ fileSize).nextBlockAsStream(BlockType.ROOT_INDEX), numRootEntries);
+
+ long prevOffset = -1;
+ int i = 0;
+ int expectedHitCount = 0;
+ int expectedMissCount = 0;
+ LOG.info("Total number of keys: " + keys.size());
+ for (byte[] key : keys) {
+ assertTrue(key != null);
+ assertTrue(indexReader != null);
+ HFileBlock b = indexReader.seekToDataBlock(key, 0, key.length, null);
+ if (Bytes.BYTES_RAWCOMPARATOR.compare(key, firstKeyInFile) < 0) {
+ assertTrue(b == null);
+ ++i;
+ continue;
+ }
+
+ String keyStr = "key #" + i + ", " + Bytes.toStringBinary(key);
+
+ assertTrue("seekToDataBlock failed for " + keyStr, b != null);
+
+ if (prevOffset == b.getOffset()) {
+ assertEquals(++expectedHitCount, brw.hitCount);
+ } else {
+ LOG.info("First key in a new block: " + keyStr + ", block offset: "
+ + b.getOffset() + ")");
+ assertTrue(b.getOffset() > prevOffset);
+ assertEquals(++expectedMissCount, brw.missCount);
+ prevOffset = b.getOffset();
+ }
+ ++i;
+ }
+
+ istream.close();
+ }
+
+ private void writeWholeIndex() throws IOException {
+ assertEquals(0, keys.size());
+ HFileBlock.Writer hbw = new HFileBlock.Writer(compr);
+ FSDataOutputStream outputStream = fs.create(path);
+ HFileBlockIndex.BlockIndexWriter biw =
+ new HFileBlockIndex.BlockIndexWriter(hbw, null, null);
+
+ for (int i = 0; i < NUM_DATA_BLOCKS; ++i) {
+ hbw.startWriting(BlockType.DATA, false).write(
+ String.valueOf(rand.nextInt(1000)).getBytes());
+ long blockOffset = outputStream.getPos();
+ hbw.writeHeaderAndData(outputStream);
+
+ byte[] firstKey = null;
+ for (int j = 0; j < 16; ++j) {
+ byte[] k = TestHFileWriterV2.randomOrderedKey(rand, i * 16 + j);
+ keys.add(k);
+ if (j == 8)
+ firstKey = k;
+ }
+ assertTrue(firstKey != null);
+ if (firstKeyInFile == null)
+ firstKeyInFile = firstKey;
+ biw.addEntry(firstKey, blockOffset, hbw.getOnDiskSizeWithHeader());
+
+ writeInlineBlocks(hbw, outputStream, biw, false);
+ }
+ writeInlineBlocks(hbw, outputStream, biw, true);
+ rootIndexOffset = biw.writeIndexBlocks(outputStream);
+ outputStream.close();
+
+ numLevels = biw.getNumLevels();
+ numRootEntries = biw.getNumRootEntries();
+
+ LOG.info("Index written: numLevels=" + numLevels + ", numRootEntries=" +
+ numRootEntries + ", rootIndexOffset=" + rootIndexOffset);
+ }
+
+ private void writeInlineBlocks(HFileBlock.Writer hbw,
+ FSDataOutputStream outputStream, HFileBlockIndex.BlockIndexWriter biw,
+ boolean isClosing) throws IOException {
+ while (biw.shouldWriteBlock(isClosing)) {
+ long offset = outputStream.getPos();
+ biw.writeInlineBlock(hbw.startWriting(biw.getInlineBlockType(), false));
+ hbw.writeHeaderAndData(outputStream);
+ biw.blockWritten(offset, hbw.getOnDiskSizeWithHeader(),
+ hbw.getUncompressedSizeWithoutHeader());
+ LOG.info("Wrote an inline index block at " + offset + ", size " +
+ hbw.getOnDiskSizeWithHeader());
+ }
+ }
+
+ private static final long getDummyFileOffset(int i) {
+ return i * 185 + 379;
+ }
+
+ private static final int getDummyOnDiskSize(int i) {
+ return i * i * 37 + i * 19 + 13;
+ }
+
+ @Test
+ public void testSecondaryIndexBinarySearch() throws IOException {
+ int numTotalKeys = 99;
+ assertTrue(numTotalKeys % 2 == 1); // Ensure no one made this even.
+
+ // We only add odd-index keys into the array that we will binary-search.
+ int numSearchedKeys = (numTotalKeys - 1) / 2;
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutputStream dos = new DataOutputStream(baos);
+
+ dos.writeInt(numSearchedKeys);
+ int curAllEntriesSize = 0;
+ int numEntriesAdded = 0;
+
+ // Only odd-index elements of this array are used to keep the secondary
+ // index entries of the corresponding keys.
+ int secondaryIndexEntries[] = new int[numTotalKeys];
+
+ for (int i = 0; i < numTotalKeys; ++i) {
+ byte[] k = TestHFileWriterV2.randomOrderedKey(rand, i * 2);
+ keys.add(k);
+ String msgPrefix = "Key #" + i + " (" + Bytes.toStringBinary(k) + "): ";
+ StringBuilder padding = new StringBuilder();
+ while (msgPrefix.length() + padding.length() < 70)
+ padding.append(' ');
+ msgPrefix += padding;
+ if (i % 2 == 1) {
+ dos.writeInt(curAllEntriesSize);
+ secondaryIndexEntries[i] = curAllEntriesSize;
+ LOG.info(msgPrefix + "secondary index entry #" + ((i - 1) / 2) +
+ ", offset " + curAllEntriesSize);
+ curAllEntriesSize += k.length
+ + HFileBlockIndex.SECONDARY_INDEX_ENTRY_OVERHEAD;
+ ++numEntriesAdded;
+ } else {
+ secondaryIndexEntries[i] = -1;
+ LOG.info(msgPrefix + "not in the searched array");
+ }
+ }
+
+ // Make sure the keys are increasing.
+ for (int i = 0; i < keys.size() - 1; ++i)
+ assertTrue(Bytes.BYTES_RAWCOMPARATOR.compare(keys.get(i),
+ keys.get(i + 1)) < 0);
+
+ dos.writeInt(curAllEntriesSize);
+ assertEquals(numSearchedKeys, numEntriesAdded);
+ int secondaryIndexOffset = dos.size();
+ assertEquals(Bytes.SIZEOF_INT * (numSearchedKeys + 2),
+ secondaryIndexOffset);
+
+ for (int i = 1; i <= numTotalKeys - 1; i += 2) {
+ assertEquals(dos.size(),
+ secondaryIndexOffset + secondaryIndexEntries[i]);
+ long dummyFileOffset = getDummyFileOffset(i);
+ int dummyOnDiskSize = getDummyOnDiskSize(i);
+ LOG.debug("Storing file offset=" + dummyFileOffset + " and onDiskSize=" +
+ dummyOnDiskSize + " at offset " + dos.size());
+ dos.writeLong(dummyFileOffset);
+ dos.writeInt(dummyOnDiskSize);
+ LOG.debug("Stored key " + ((i - 1) / 2) +" at offset " + dos.size());
+ dos.write(keys.get(i));
+ }
+
+ dos.writeInt(curAllEntriesSize);
+
+ ByteBuffer nonRootIndex = ByteBuffer.wrap(baos.toByteArray());
+ for (int i = 0; i < numTotalKeys; ++i) {
+ byte[] searchKey = keys.get(i);
+ byte[] arrayHoldingKey = new byte[searchKey.length +
+ searchKey.length / 2];
+
+ // To make things a bit more interesting, store the key we are looking
+ // for at a non-zero offset in a new array.
+ System.arraycopy(searchKey, 0, arrayHoldingKey, searchKey.length / 2,
+ searchKey.length);
+
+ int searchResult = BlockIndexReader.binarySearchNonRootIndex(
+ arrayHoldingKey, searchKey.length / 2, searchKey.length, nonRootIndex,
+ Bytes.BYTES_RAWCOMPARATOR);
+ String lookupFailureMsg = "Failed to look up key #" + i + " ("
+ + Bytes.toStringBinary(searchKey) + ")";
+
+ int expectedResult;
+ int referenceItem;
+
+ if (i % 2 == 1) {
+ // This key is in the array we search as the element (i - 1) / 2. Make
+ // sure we find it.
+ expectedResult = (i - 1) / 2;
+ referenceItem = i;
+ } else {
+ // This key is not in the searched array; it falls between two of its
+ // elements, before the first one, or after the last one. The result
+ // should be the index of the previous key in the searched array, or -1
+ // for i = 0.
+ expectedResult = i / 2 - 1;
+ referenceItem = i - 1;
+ }
+
+ assertEquals(lookupFailureMsg, expectedResult, searchResult);
+
+ // Now test that we can get the offset and the on-disk size using a
+ // higher-level API function.
+ boolean locateBlockResult =
+ BlockIndexReader.locateNonRootIndexEntry(nonRootIndex, arrayHoldingKey,
+ searchKey.length / 2, searchKey.length, Bytes.BYTES_RAWCOMPARATOR);
+
+ if (i == 0) {
+ assertFalse(locateBlockResult);
+ } else {
+ assertTrue(locateBlockResult);
+ String errorMsg = "i=" + i + ", position=" + nonRootIndex.position();
+ assertEquals(errorMsg, getDummyFileOffset(referenceItem),
+ nonRootIndex.getLong());
+ assertEquals(errorMsg, getDummyOnDiskSize(referenceItem),
+ nonRootIndex.getInt());
+ }
+ }
+
+ }
+
+ @Test
+ public void testBlockIndexChunk() throws IOException {
+ BlockIndexChunk c = new BlockIndexChunk();
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ int N = 1000;
+ int[] numSubEntriesAt = new int[N];
+ int numSubEntries = 0;
+ for (int i = 0; i < N; ++i) {
+ baos.reset();
+ DataOutputStream dos = new DataOutputStream(baos);
+ c.writeNonRoot(dos);
+ assertEquals(c.getNonRootSize(), dos.size());
+
+ baos.reset();
+ dos = new DataOutputStream(baos);
+ c.writeRoot(dos);
+ assertEquals(c.getRootSize(), dos.size());
+
+ byte[] k = TestHFileWriterV2.randomOrderedKey(rand, i);
+ numSubEntries += rand.nextInt(5) + 1;
+ numSubEntriesAt[i] = numSubEntries; // remember cumulative count for the check below
+ keys.add(k);
+ c.add(k, getDummyFileOffset(i), getDummyOnDiskSize(i), numSubEntries);
+ }
+
+ // Test the ability to look up the entry that contains a particular
+ // deeper-level index block's entry ("sub-entry"), assuming a global
+ // 0-based ordering of sub-entries. This is needed for mid-key calculation.
+ for (int i = 0; i < N; ++i) {
+ for (int j = i == 0 ? 0 : numSubEntriesAt[i - 1];
+ j < numSubEntriesAt[i];
+ ++j) {
+ assertEquals(i, c.getEntryBySubEntry(j));
+ }
+ }
+ }
+
+ /** Checks if the HeapSize calculator is within reason */
+ @Test
+ public void testHeapSizeForBlockIndex() throws IOException {
+ Class<HFileBlockIndex.BlockIndexReader> cl =
+ HFileBlockIndex.BlockIndexReader.class;
+ long expected = ClassSize.estimateBase(cl, false);
+
+ HFileBlockIndex.BlockIndexReader bi =
+ new HFileBlockIndex.BlockIndexReader(Bytes.BYTES_RAWCOMPARATOR, 1);
+ long actual = bi.heapSize();
+
+ // Since the arrays in BlockIndex(byte [][] blockKeys, long [] blockOffsets,
+ // int [] blockDataSizes) are all null, they are not going to show up in the
+ // HeapSize calculation, so we need to subtract those array costs from expected.
+ expected -= ClassSize.align(3 * ClassSize.ARRAY);
+
+ if (expected != actual) {
+ ClassSize.estimateBase(cl, true);
+ assertEquals(expected, actual);
+ }
+ }
+
+ /**
+ * Tests the block index through the HFile writer/reader APIs. Exercises
+ * setting the index block size through configuration, intermediate-level
+ * index blocks, and caching index blocks on write.
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testHFileWriterAndReader() throws IOException {
+ Path hfilePath = new Path(HBaseTestingUtility.getTestDir(),
+ "hfile_for_block_index");
+ BlockCache blockCache = StoreFile.getBlockCache(conf);
+
+ for (int testI = 0; testI < INDEX_CHUNK_SIZES.length; ++testI) {
+ int indexBlockSize = INDEX_CHUNK_SIZES[testI];
+ int expectedNumLevels = EXPECTED_NUM_LEVELS[testI];
+ LOG.info("Index block size: " + indexBlockSize + ", compression: "
+ + compr);
+ // Evict all blocks that were cached-on-write by the previous invocation.
+ blockCache.evictBlocksByPrefix(hfilePath.getName()
+ + HFile.CACHE_KEY_SEPARATOR);
+
+ conf.setInt(HFileBlockIndex.MAX_CHUNK_SIZE_KEY, indexBlockSize);
+ Set<String> keyStrSet = new HashSet<String>();
+ byte[][] keys = new byte[NUM_KV][];
+ byte[][] values = new byte[NUM_KV][];
+
+ // Write the HFile
+ {
+ HFile.Writer writer = HFile.getWriterFactory(conf).createWriter(fs,
+ hfilePath, SMALL_BLOCK_SIZE, HFile.DEFAULT_BYTES_PER_CHECKSUM,
+ compr, KeyValue.KEY_COMPARATOR);
+ Random rand = new Random(19231737);
+
+ for (int i = 0; i < NUM_KV; ++i) {
+ byte[] row = TestHFileWriterV2.randomOrderedKey(rand, i);
+
+ // Key will be interpreted by KeyValue.KEY_COMPARATOR
+ byte[] k = KeyValue.createFirstOnRow(row, 0, row.length, row, 0, 0,
+ row, 0, 0).getKey();
+
+ byte[] v = TestHFileWriterV2.randomValue(rand);
+ writer.append(k, v);
+ keys[i] = k;
+ values[i] = v;
+ keyStrSet.add(Bytes.toStringBinary(k));
+
+ if (i > 0) {
+ assertTrue(KeyValue.KEY_COMPARATOR.compare(keys[i - 1],
+ keys[i]) < 0);
+ }
+ }
+
+ writer.close();
+ }
+
+ // Read the HFile
+ HFile.Reader reader = HFile.createReader(fs, hfilePath, blockCache,
+ false, true);
+ assertEquals(expectedNumLevels,
+ reader.getTrailer().getNumDataIndexLevels());
+
+ assertTrue(Bytes.equals(keys[0], reader.getFirstKey()));
+ assertTrue(Bytes.equals(keys[NUM_KV - 1], reader.getLastKey()));
+ LOG.info("Last key: " + Bytes.toStringBinary(keys[NUM_KV - 1]));
+
+ for (boolean pread : new boolean[] { false, true }) {
+ HFileScanner scanner = reader.getScanner(true, pread);
+ for (int i = 0; i < NUM_KV; ++i) {
+ checkSeekTo(keys, scanner, i);
+ checkKeyValue("i=" + i, keys[i], values[i], scanner.getKey(),
+ scanner.getValue());
+ }
+ assertTrue(scanner.seekTo());
+ for (int i = NUM_KV - 1; i >= 0; --i) {
+ checkSeekTo(keys, scanner, i);
+ checkKeyValue("i=" + i, keys[i], values[i], scanner.getKey(),
+ scanner.getValue());
+ }
+ }
+
+ // Manually compute the mid-key and validate it.
+ HFileReaderV2 reader2 = (HFileReaderV2) reader;
+ HFileBlock.FSReader fsReader = reader2.getUncachedBlockReader();
+
+ HFileBlock.BlockIterator iter = fsReader.blockRange(0,
+ reader.getTrailer().getLoadOnOpenDataOffset());
+ HFileBlock block;
+ List<byte[]> blockKeys = new ArrayList<byte[]>();
+ while ((block = iter.nextBlock()) != null) {
+ if (block.getBlockType() != BlockType.LEAF_INDEX)
+ continue; // skip data and other non-leaf blocks
+ ByteBuffer b = block.getBufferReadOnly();
+ int n = b.getInt();
+ // One int for the number of items, and n + 1 for the secondary index.
+ int entriesOffset = Bytes.SIZEOF_INT * (n + 2);
+
+ // Get all the keys from the leaf index block.
+ for (int i = 0; i < n; ++i) {
+ int keyRelOffset = b.getInt(Bytes.SIZEOF_INT * (i + 1));
+ int nextKeyRelOffset = b.getInt(Bytes.SIZEOF_INT * (i + 2));
+ int keyLen = nextKeyRelOffset - keyRelOffset;
+ int keyOffset = b.arrayOffset() + entriesOffset + keyRelOffset +
+ HFileBlockIndex.SECONDARY_INDEX_ENTRY_OVERHEAD;
+ byte[] blockKey = Arrays.copyOfRange(b.array(), keyOffset, keyOffset
+ + keyLen);
+ String blockKeyStr = Bytes.toString(blockKey);
+ blockKeys.add(blockKey);
+
+ // If the first key of the block is not among the keys written, we
+ // are not parsing the non-root index block format correctly.
+ assertTrue("Invalid block key from leaf-level block: " + blockKeyStr,
+ keyStrSet.contains(blockKeyStr));
+ }
+ }
+
+ // Validate the mid-key.
+ assertEquals(
+ Bytes.toStringBinary(blockKeys.get((blockKeys.size() - 1) / 2)),
+ Bytes.toStringBinary(reader.midkey()));
+
+ assertEquals(UNCOMPRESSED_INDEX_SIZES[testI],
+ reader.getTrailer().getUncompressedDataIndexSize());
+
+ reader.close();
+ }
+ }
+
+ private void checkSeekTo(byte[][] keys, HFileScanner scanner, int i)
+ throws IOException {
+ assertEquals("Failed to seek to key #" + i + " ("
+ + Bytes.toStringBinary(keys[i]) + ")", 0, scanner.seekTo(keys[i]));
+ }
+
+ private void assertArrayEqualsBuffer(String msgPrefix, byte[] arr,
+ ByteBuffer buf) {
+ assertEquals(msgPrefix + ": expected " + Bytes.toStringBinary(arr)
+ + ", actual " + Bytes.toStringBinary(buf), 0, Bytes.compareTo(arr, 0,
+ arr.length, buf.array(), buf.arrayOffset(), buf.limit()));
+ }
+
+ /** Check a key/value pair after it was read by the reader */
+ private void checkKeyValue(String msgPrefix, byte[] expectedKey,
+ byte[] expectedValue, ByteBuffer keyRead, ByteBuffer valueRead) {
+ if (!msgPrefix.isEmpty())
+ msgPrefix += ". ";
+
+ assertArrayEqualsBuffer(msgPrefix + "Invalid key", expectedKey, keyRead);
+ assertArrayEqualsBuffer(msgPrefix + "Invalid value", expectedValue,
+ valueRead);
+ }
+
+}
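For reference, the non-root index block layout that testSecondaryIndexBinarySearch writes by hand and testHFileWriterAndReader re-parses is: an entry count, (count + 1) relative offsets forming the secondary index, then one (file offset, on-disk size, key) record per entry. The standalone sketch below encodes and decodes a tiny block with that layout under the same assumptions the test makes; it does not use the HBase classes, and the class and method names are made up for illustration.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

/** Sketch of the non-root index block layout exercised by the tests above. */
public class NonRootIndexLayoutSketch {
  // Per-entry overhead before the key: long file offset + int on-disk size.
  private static final int ENTRY_OVERHEAD = 8 + 4;

  static ByteBuffer encode(byte[][] keys, long[] offsets, int[] sizes)
      throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(baos);
    dos.writeInt(keys.length);                 // number of entries
    int relOffset = 0;
    for (byte[] k : keys) {                    // secondary index: entry starts
      dos.writeInt(relOffset);
      relOffset += k.length + ENTRY_OVERHEAD;
    }
    dos.writeInt(relOffset);                   // total size of all entries
    for (int i = 0; i < keys.length; ++i) {    // the entries themselves
      dos.writeLong(offsets[i]);
      dos.writeInt(sizes[i]);
      dos.write(keys[i]);
    }
    return ByteBuffer.wrap(baos.toByteArray());
  }

  /** Reads back the i-th key using only the secondary index. */
  static byte[] keyAt(ByteBuffer b, int i) {
    int n = b.getInt(0);
    int entriesOffset = 4 * (n + 2);           // count + (n + 1) offsets
    int rel = b.getInt(4 * (i + 1));
    int nextRel = b.getInt(4 * (i + 2));
    byte[] key = new byte[nextRel - rel - ENTRY_OVERHEAD];
    for (int j = 0; j < key.length; ++j) {
      key[j] = b.get(entriesOffset + rel + ENTRY_OVERHEAD + j);
    }
    return key;
  }

  public static void main(String[] args) throws IOException {
    byte[][] keys = { "aaa".getBytes(), "mmm".getBytes(), "zzz".getBytes() };
    ByteBuffer block = encode(keys, new long[] { 0, 100, 200 },
        new int[] { 100, 100, 100 });
    System.out.println(new String(keyAt(block, 1)));  // prints "mmm"
  }
}

The (count + 1) relative offsets are what let binarySearchNonRootIndex jump straight to any entry's key without scanning the variable-length records, which is what makes binary search within a single index block possible.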