You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2018/01/25 19:10:45 UTC
[1/2] kudu git commit: [java] add test to reproduce KUDU-2267
Repository: kudu
Updated Branches:
refs/heads/master f62e4cd0e -> cd7166cb7
[java] add test to reproduce KUDU-2267
Currently, if a master has never been a leader from the very start of
the cluster, it has just self-signed server TLS certificate. In that
case if a client does not have any valid Kerberos credentials but only
authn token, then the client may see negotiation failure error and is
not able to connect. This patch adds a regression test for this issue.
Change-Id: I4879749988dc884fe81cf36838819379db91ae72
Reviewed-on: http://gerrit.cloudera.org:8080/9071
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/0f0e4214
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/0f0e4214
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/0f0e4214
Branch: refs/heads/master
Commit: 0f0e42144a0d37f57394b373eb75eeb18bc28174
Parents: f62e4cd
Author: hahao <ha...@cloudera.com>
Authored: Thu Jan 18 16:06:46 2018 -0800
Committer: Hao Hao <ha...@cloudera.com>
Committed: Thu Jan 25 03:05:16 2018 +0000
----------------------------------------------------------------------
.../org/apache/kudu/client/AsyncKuduClient.java | 5 +++
.../apache/kudu/client/ConnectToCluster.java | 20 ++++++++--
.../org/apache/kudu/client/TestSecurity.java | 40 +++++++++++++++++++-
3 files changed, 59 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/0f0e4214/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
index dd282ff..de36e3d 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
@@ -745,6 +745,11 @@ public class AsyncKuduClient implements AutoCloseable {
return requestTracker;
}
+ @VisibleForTesting
+ KuduTable getMasterTable() {
+ return masterTable;
+ }
+
/**
* Creates a new {@link AsyncKuduScanner.AsyncKuduScannerBuilder} for a particular table.
* @param table the name of the table you intend to scan.
http://git-wip-us.apache.org/repos/asf/kudu/blob/0f0e4214/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java b/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java
index d69bd76..f3341a7 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/ConnectToCluster.java
@@ -86,6 +86,11 @@ final class ConnectToCluster {
return responseD;
}
+ @VisibleForTesting
+ List<Exception> getExceptionsReceived() {
+ return exceptionsReceived;
+ }
+
private static Deferred<ConnectToMasterResponsePB> connectToMaster(
final KuduTable masterTable,
final RpcProxy masterProxy,
@@ -148,11 +153,20 @@ final class ConnectToCluster {
long defaultTimeoutMs,
Connection.CredentialsPolicy credentialsPolicy) {
ConnectToCluster connector = new ConnectToCluster(masterAddresses);
+ connector.connectToMasters(masterTable, parentRpc,
+ defaultTimeoutMs, credentialsPolicy);
+ return connector.responseD;
+ }
+ @VisibleForTesting
+ void connectToMasters(KuduTable masterTable,
+ KuduRpc<?> parentRpc,
+ long defaultTimeoutMs,
+ Connection.CredentialsPolicy credentialsPolicy) {
// Try to connect to each master. The ConnectToCluster instance
// waits until it gets a good response before firing the returned
// deferred.
- for (HostAndPort hostAndPort : masterAddresses) {
+ for (HostAndPort hostAndPort : masterAddrs) {
Deferred<ConnectToMasterResponsePB> d;
RpcProxy proxy = masterTable.getAsyncClient().newMasterRpcProxy(
hostAndPort, credentialsPolicy);
@@ -164,10 +178,8 @@ final class ConnectToCluster {
Status statusIOE = Status.IOError(message);
d = Deferred.fromError(new NonRecoverableException(statusIOE));
}
- d.addCallbacks(connector.callbackForNode(hostAndPort),
- connector.errbackForNode(hostAndPort));
+ d.addCallbacks(callbackForNode(hostAndPort), errbackForNode(hostAndPort));
}
- return connector.responseD;
}
/**
http://git-wip-us.apache.org/repos/asf/kudu/blob/0f0e4214/java/kudu-client/src/test/java/org/apache/kudu/client/TestSecurity.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestSecurity.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestSecurity.java
index c9881c4..863ed03 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestSecurity.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestSecurity.java
@@ -13,14 +13,17 @@
*/
package org.apache.kudu.client;
+import static org.apache.kudu.util.AssertHelpers.assertEventuallyTrue;
import static org.junit.Assert.assertNotNull;
+import java.util.List;
+
+import org.apache.kudu.util.AssertHelpers.BooleanExpression;
+import org.apache.kudu.util.SecurityUtil;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.apache.kudu.util.SecurityUtil;
-
public class TestSecurity extends BaseKuduTest {
private static final String TABLE_NAME = "TestSecurity-table";
@@ -73,6 +76,39 @@ public class TestSecurity extends BaseKuduTest {
}
/**
+ * Regression test for KUDU-2267, client with valid token but without valid
+ * Kerberos credentials should be able to connect to all the masters.
+ */
+ @Test
+ public void testKudu2267() throws Exception {
+ byte[] authnData = client.exportAuthenticationCredentials().join();
+ assertNotNull(authnData);
+ String oldTicketCache = System.getProperty(SecurityUtil.KUDU_TICKETCACHE_PROPERTY);
+ System.clearProperty(SecurityUtil.KUDU_TICKETCACHE_PROPERTY);
+ try {
+ final KuduClient newClient = new KuduClient.KuduClientBuilder(masterAddresses).build();
+ newClient.importAuthenticationCredentials(authnData);
+
+ // Try to connect to all the masters and assert there is no
+ // authentication failures.
+ assertEventuallyTrue("Not able to connect to all the masters",
+ new BooleanExpression() {
+ @Override
+ public boolean get() throws Exception {
+ ConnectToCluster connector = new ConnectToCluster(masterHostPorts);
+ connector.connectToMasters(newClient.asyncClient.getMasterTable(), null,
+ DEFAULT_SLEEP, Connection.CredentialsPolicy.ANY_CREDENTIALS);
+ connector.getDeferred().join();
+ List<Exception> s = connector.getExceptionsReceived();
+ return s.size() == 0;
+ }
+ }, DEFAULT_SLEEP);
+ } finally {
+ System.setProperty(SecurityUtil.KUDU_TICKETCACHE_PROPERTY, oldTicketCache);
+ }
+ }
+
+ /**
* Test that a client is able to connect to masters using valid tokens
* after all masters were killed and restarted, and before a leader is
* elected. Leader election time is configured to be long enough using
[2/2] kudu git commit: design-docs: improve cfile.md
Posted by da...@apache.org.
design-docs: improve cfile.md
I've continually found the CFile abstraction difficult to place in
context with the higher-level DiskRowSet and lower-level BlockManager
abstractions. Every time I revisit the CFile code I have to re-research
these relationships and interactions. cfile.md is currently lacking in
these details, so this is an attempt to fix that.
Change-Id: I770028bba3f7a49c96f32893c285221c84be39ce
Reviewed-on: http://gerrit.cloudera.org:8080/8860
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <da...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/cd7166cb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/cd7166cb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/cd7166cb
Branch: refs/heads/master
Commit: cd7166cb778aed0a26f8e76d4023ff4054c58d76
Parents: 0f0e421
Author: Dan Burkert <da...@apache.org>
Authored: Sat Dec 16 14:12:33 2017 -0800
Committer: Dan Burkert <da...@cloudera.com>
Committed: Thu Jan 25 19:05:54 2018 +0000
----------------------------------------------------------------------
docs/design-docs/cfile.md | 398 +++++++++++++++++++++++++---------------
docs/design-docs/tablet.md | 4 +-
2 files changed, 253 insertions(+), 149 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/cd7166cb/docs/design-docs/cfile.md
----------------------------------------------------------------------
diff --git a/docs/design-docs/cfile.md b/docs/design-docs/cfile.md
index 7f4a0f4..330385f 100644
--- a/docs/design-docs/cfile.md
+++ b/docs/design-docs/cfile.md
@@ -11,208 +11,312 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
-CFile is a simple columnar format which stores multiple related B-Trees.
+# CFile
-File format
------------------
-```
-<header>
-<blocks>
-<btree root blocks>
-<footer>
-EOF
+CFile is an on-disk columnar storage format which holds data and associated
+B-Tree indexes. Within a [DiskRowSet](tablet.md) each column and DeltaFile will
+map to a single CFile. In addition, the DiskRowSet's bloomfilter will be stored
+in a CFile, and if the table has a compound primary key, then the associated
+ad-hoc index will be stored in a separate CFile.
+Despite the name, a CFile does not necessarily correspond one-to-one with a file
+in the filesystem. The mapping of CFiles to the filesystem is determined by the
+[BlockManager](../../src/kudu/fs/block_manager.h) abstraction. A CFile is
+written to a single BlockManager Block (not to be confused with cblocks, which
+are internal to CFiles, discussed below).
-Header
-------
+A CFile is composed of a header, a variable number of cblocks, and a footer.
+There are a few different types of cblocks, each with a different format: data
+cblocks, nullable data cblocks, index cblocks, and dictionary cblocks.
-<magic>: see below
-<header length>: 32-bit unsigned integer length of the header protobuf
-<header>: CFileHeaderPB protobuf
-<checksum>: An optional Crc32 checksum of the magic, length, and protobuf
+Data cblocks consist of a sequence of consecutive values. Each value is assigned
+an ordinal index, or offset, which is unique within the CFile. Thus, a cblock
+can be identified within a CFile by the range of ordinal indexes it contains;
+for instance, the first three cblocks of a CFile might have the ordinal index
+ranges `[0, 55]`, `[56, 132]`, and `[133, 511]`.
-Block
------
-<data>: see below
-<checksum>: An optional Crc32 checksum of the data
+In order to support efficient seeks to an arbitrary ordinal index within the
+CFile, a positional index may be included which contains a mapping of starting
+ordinal index to data cblock. See [CFile Index](#cfile-index) for more details.
-Footer
-------
+For CFiles which contain data in sorted order (for example, a CFile containing
+the Primary Key column), a value index may be created. The value index supports
+efficiently seeking to an arbitrary value in the CFile. See
+[CFile Index](#cfile-index) for more details.
-<checksum>: An optional Crc32 checksum of the protobuf, magic, and length
-<footer>: CFileFooterPB protobuf
-<magic>: see below
-<footer length>: 32-bit unsigned integer length of the footer protobuf
+## File Format
+The CFile header, cblocks, and footer are written consecutively to the file
+without padding.
-Magic strings
--------------
-CFile v1: the string 'kuducfil'
-CFile v2: the string 'kuducfl2'
-```
+### Header Layout
-The two versions are identical except where specifically noted in the source
-code or this document.
+| field | width (bytes) | notes |
+| --- | --- | --- |
+| magic | 8 | [see below](#versioning) |
+| length | 4 | 32-bit unsigned length of the following Protobuf message |
+| message | variable | encoded [`CFileHeaderPB`][cfile.proto] Protobuf message |
+| checksum | 4 | optional CRC-32 checksum of magic, length, and message |
-==============================
+### Footer Layout
-Data blocks:
+| field | width (bytes) | notes |
+| --- | --- | --- |
+| checksum | 4 | optional CRC-32 checksum of message, magic, and length |
+| magic | 8 | [see below](#versioning) |
+| message | variable | encoded [`CFileFooterPB`][cfile.proto] Protobuf message |
+| length | 4 | unsigned 32-bit length of the preceding Protobuf message |
-Data blocks are stored with various types of encodings.
+### Versioning
-* Prefix Encoding
+The CFile header and footer each contain a 'magic' string which indicates the
+CFile's version.
-Currently used for STRING blocks. This is based on the encoding used
-by LevelDB for its data blocks, more or less.
+| version | string |
+| --- | --- |
+| CFile v1 | `kuducfil` |
+| CFile v2 | `kuducfl2` |
-Starts with a header of four uint32s, group-varint coded:
-```
- <num elements> \
- <ordinal position> |
- <restart interval> | group varint 32
- <unused> /
-```
-Followed by prefix-compressed values. Each value is stored relative
-to the value preceding it using the following format:
-```
- shared_bytes: varint32
- unshared_bytes: varint32
- delta: char[unshared_bytes]
-```
-Periodically there will be a "restart point" which is necessary for
-faster binary searching. At a "restart point", shared_bytes is
-0 but otherwise the encoding is the same.
+CFile v2 was introduced in Kudu 1.3 in order to make a breaking change in the
+CFile format. At the same time, two sets of feature flags were introduced to the
+footer in order to make more granular forwards compatibility possible in the
+future. See commit [`f82cf6918c`] for details.
-At the end of the block is a trailer with the offsets of the
-restart points:
-```
- restart_points[num_restarts]: uint32
- num_restarts: uint32
-```
-The restart points are offsets relative to the start of the block,
-including the header.
+### Data cblock Layout
+| field | width (bytes) | notes |
+| --- | --- | --- |
+| data | variable | [encoded](#data-encodings) data values |
+| checksum | 4 | optional CRC-32 checksum of the data |
-* Group Varint Frame-Of-Reference Encoding
+If the CFile is configured to use compression then the encoded data is
+compressed before the checksum is appended.
-Used for uint32 blocks.
+### Nullable Data cblock Layout
-Starts with a header:
-```
-<num elements> \
-<min element> |
-<ordinal position> | group varint 32
-<unused> /
-```
-The ordinal position is the ordinal position of the first item in the
-file. For example, the first data block in the file has ordinal position
-0. If that block had 400 data entries, then the second data block would
-have ordinal position 400.
+Columns marked as nullable in the schema are stored in a nullable cblock, which
+includes a bitmap which tracks which rows (ordinal offsets) are null and not
+null.
-Followed by the actual data, each set of 4 integers using group-varint.
-The last group is padded with 0s.
-Each integer is relative to the min element in the header.
+| field | width (bytes) | notes |
+| --- | --- | --- |
+| value count | variable | [LEB128] encoded count of values |
+| bitmap length | variable | [LEB128] encoded length of the following null bitmap |
+| null bitmap | variable | [RLE] encoded bitmap |
+| data | variable | encoded non-null data values |
+| checksum | 4 | optional CRC-32 checksum of the value count, bitmap length, null bitmap, and data |
-==============================
+If the CFile is configured to use compression then the value count, bitmap
+length, null bitmap, and encoded data are compressed before the checksum is
+appended.
-Nullable Columns:
+### Checksums
-If a column is marked as nullable in the schema, a bitmap is used to keep track
-of the null and not null rows.
+Checksums can be optionally written and verified.
-The bitmap is added the begininning of the data block, and it uses RLE.
-```
- <num elements in the block> : vint
- <null bitmap size> : vint
- <null bitmap> : RLE encoding
- <encoded non-null values> : encoded data
-```
-Data Block Example - 4 items, the first and last are nulls.
-```
- 4 Num Elements in the block
- 1 Null Bitmap Size
- 0110 Null Bitmap
- v2 Value of row 2
- v3 Value of row 3
-```
-==============================
+When checksums for the header, data, and footer are written in the CFile, the
+`incompatible_features` bitset in the CFileFooterPB message is used. A
+"checksum" bit is set to ensure the reader knows if checksums exist.
+
+When reading a CFile the footer should be read first to find if the file
+contains checksums. If the `incompatible_features` bitset indicates checksums
+exist, the reader can optionally validate them against the read data.
+
+## Data Encodings
+
+cblock data is encoded before being stored. If compression is enabled, then the
+cblock is encoded first, and then compressed.
+
+Data cblocks are best-effort size limited to `--cfile-default-block-size` bytes, at
+which point a new cblock will be added to the CFile.
+
+### Plain Encoding
+
+The simplest encoding type is plain encoding, which stores values in their
+natural representation.
+
+Plain encoding for fixed-width (integer) types consists of the little-endian
+values, followed by a trailer containing two little-endian `uint32`s: the count
+of values, and the ordinal position of the cblock.
+
+Plain encoding for BINARY or STRING (variable-width) columns is laid out as
+follows:
+
+| ordinal position | little-endian `uint32` |
+| --- | --- |
+| num values | little-endian `uint32` |
+| offsets position | little-endian `uint32` |
+| values | sequential values with no padding |
+| offsets | [group-varint frame-of-reference](#group-varint-frame-of-reference-encoding) encoded value offsets |
+
+### Dictionary Encoding
+
+[Dictionary encoding](dictionary-encoding) may be used for BINARY or STRING
+columns. All dictionary encoded cblocks in a CFile share the same dictionary.
+If the dictionary becomes full, subsequent cblocks in the CFile switch to plain
+encoding. The dictionary is stored as a plain encoded binary cblock, and the
+data codewords are stored as [bitshuffle encoded](#bitshuffle-encoding)
+`uint32`s.
+
+### Prefix Encoding
+
+Currently used for `BINARY` and `STRING` data cblocks. This is based on the
+encoding used by LevelDB for its data blocks, more or less.
-Index blocks:
+Starts with a header of four [Group Varint] encoded `uint32`s:
-The index blocks are organized in a B-Tree. As data blocks are written,
-they are appended to the end of a leaf index block. When a leaf index
-block reaches the configured block size, it is added to another index
-block higher up the tree, and a new leaf is started. If the intermediate
-index block fills, it will start a new intermediate block and spill into
-an even higher-layer internal block.
+| field |
+| --- |
+| number of elements |
+| ordinal position |
+| restart interval |
+| unused |
+
+Followed by prefix-compressed values. Each value is stored relative to the value
+preceding it using the following format:
+
+| field | type |
+| --- | --- |
+| `shared_bytes` | `varint32` |
+| `unshared_bytes` | `varint32` |
+| `delta` | `char[unshared_bytes]` |
+
+Periodically there will be a "restart point" which is necessary for faster
+binary searching. At a "restart point", `shared_bytes` is 0 but otherwise the
+encoding is the same.
+
+At the end of the cblock is a trailer with the offsets of the restart points:
+
+| field | type |
+| --- | --- |
+| `restart_points` | `uint32[num_restarts]` |
+| `num_restarts` | `uint32` |
+
+The restart points are offsets relative to the start of the cblock, including
+the header.
+
+### Run-length Encoding
+
+Integer and bool types may be [run-length encoded](RLE), which has a simply
+layout: the number of values and ordinal position of the cblock as little-endian
+`uint32`s, followed by the run-length encoded data. The encoder will
+automatically fall back to a bit-packed (literal) encoding if the data is not
+conducive to run-length encoding.
+
+### Bitshuffle Encoding
+
+Integer types may be [bitshuffle](bitshuffle) encoded. Bitshuffle-encoded
+cblocks are automatically LZ4 compressed, so additional compression is not
+recommended.
+
+### Group Varint Frame-of-Reference Encoding
+
+Used internally for `UINT32` blocks. Kudu does not support unsigned integer
+columns, so this encoding is not used for column data.
+
+Starts with a header of four [group-varint](group-varint) encoded `uint32`s:
+
+| field |
+| --- |
+| number of elements |
+| minimum element |
+| ordinal position |
+| unused |
+
+The ordinal position is the ordinal position of the first item in the file. For
+example, the first data cblock in the file has ordinal position 0. If that cblock
+had 400 values, then the second data cblock would have ordinal position
+400.
+
+Followed by the actual data, each set of 4 integers using group-varint. The last
+group is padded with 0s. Each integer is relative to the min element in the
+header.
+
+## CFile Index
+
+CFiles may optionally include a positional (ordinal) index and value index.
+Positional indexes are used to satisfy queries such as: "seek to the data cblock
+containing the Nth entry in this CFile". Value indexes are used to satisfy
+queries such as: "seek to the data cblock containing `123` in this CFile". Value
+indexes are only present in CFiles which contain data stored in sorted order
+(e.g. the primary key column).
+
+Ordinal and value indices are organized as a [B-Tree] of index cblocks. As data
+cblocks are written, entries are appended to the end of a leaf index cblock.
+When a leaf index cblock reaches the configured index cblock size, it is added
+to another index cblock higher up the tree, and a new leaf is started. If the
+intermediate index cblock fills, it will start a new intermediate cblock and
+spill into an even higher-layer internal cblock.
For example:
+
```
[Int 0]
- ------------------------------
- | |
- [Int 1] [Int 2]
- ----------------- --------------
- | | | | |
-[Leaf 0] ... [Leaf N] [Leaf N+1] [Leaf N+2]
+ -----------------------------
+ | |
+ [Int 1] [Int 2]
+ ----------------- --------------
+ | | | | |
+[Leaf 0] ... [Leaf N] [Leaf N+1] [Leaf N+2]
```
-In this case, we wrote N leaf blocks, which filled up the node labeled
-Int 1. At this point, the writer would create Int 0 with one entry pointing
-to Int 1. Further leaf blocks (N+1 and N+2) would be written to a new
-internal node (Int 2). When the file is completed, Int 2 will spill,
-adding its entry into Int 0 as well.
+In this case, we wrote N leaf blocks, which filled up the internal node labeled
+Int 1. At this point, the writer would create Int 0 with one entry pointing to
+Int 1. Further leaf blocks (N+1 and N+2) would be written to a new internal node
+(Int 2). When the file is completed, Int 2 will spill, adding its entry into Int 0
+as well.
-Note that this strategy doesn't result in a fully balanced b-tree, but instead
+Note that this strategy doesn't result in a fully balanced B-tree, but instead
results in a 100% "fill factor" on all nodes in each level except for the last
one written.
-There are two types of indexes:
-
-- Positional indexes: map ordinal position -> data block offset
+An index cblock is encoded similarly for both types of indexes:
-These are used to satisfy queries like: "seek to the Nth entry in this file"
-
-- Value-based indexes: reponsible for mapping value -> data block offset
-
-These are only present in files which contain data stored in sorted order
-(e.g key columns). They can satisfy seeks by value.
-
-
-An index block is encoded similarly for both types of indexes:
```
-<key> <block offset> <block size>
-<key> <block offset> <block size>
+<key> <cblock offset> <cblock size>
+<key> <cblock offset> <cblock size>
...
key: vint64 for positional, otherwise varint-length-prefixed string
offset: vint64
- block size: vint32
+ cblock size: vint32
<offset to first key> (fixed32)
<offset to second key> (fixed32)
...
- These offsets are relative to the start of the block.
+ These offsets are relative to the start of the cblock.
<trailer>
A IndexBlockTrailerPB protobuf
<trailer length>
```
-The trailer protobuf includes a field which designates whether the block
-is a leaf node or internal node of the B-Tree, allowing a reader to know
-whether the pointer is to another index block or to a data block.
-==============================
+Index cblocks are written using a [post-order traversal], and the index cblocks
+may intersperse with the data cblocks.
-Checksums:
+The trailer protobuf includes a field which designates whether the cblock is a
+leaf node or internal node of the B-Tree, allowing a reader to know whether the
+pointer is to another index cblock or to a data cblock.
-Checksums can be optionally written and verified.
+## DeltaFile
+
+Every DeltaFile in a DiskRowSet is written to a CFile; in fact, a DeltaFile is
+just a wrapper around CFile which specializes it for REDO and UNDO delta data.
+The deltas associated with a row update are grouped into a RowChangeList and
+written to as a binary values to the underlying CFile. Each value is prefixed
+with a DeltaKey, which consists of the ordinal row index (within the
+DiskRowSet), and the timestamp. The CFile includes a value index so that the
+delta belonging to a specific row can be located efficiently.
+
+## BloomFile
-When checksums for the header, data, and footer are written in the CFile,
-the incompatible_features bitset in the CFile footer is used. A "checksum"
-bit is set to ensure the reader knows if checksums exist.
+BloomFile is a wrapper around CFile which stores a bloomfilter datastructure.
-When reading a CFile the footer should be read first to find if the
-file contains checksums. If the incompatible_features bitset indicates
-checksums exist, the reader can optionally validate them against the
-read data.
\ No newline at end of file
+[LEB128]: https://en.wikipedia.org/wiki/LEB128
+[RLE]: https://en.wikipedia.org/wiki/Run-length_encoding
+[`f82cf6918c`]: https://github.com/apache/kudu/commit/f82cf6918c00dff6aecad5a6b50836b7eb5db508
+[B-tree]: https://en.wikipedia.org/wiki/B-tree
+[bitshuffle]: https://github.com/kiyo-masui/bitshuffle
+[cfile.proto]: ../../src/kudu/cfile/cfile.proto
+[Group Varint]: https://static.googleusercontent.com/media/research.google.com/en//people/jeff/WSDM09-keynote.pdf
+[post-order traversal]: https://en.wikipedia.org/wiki/Tree_traversal#Post-order
http://git-wip-us.apache.org/repos/asf/kudu/blob/cd7166cb/docs/design-docs/tablet.md
----------------------------------------------------------------------
diff --git a/docs/design-docs/tablet.md b/docs/design-docs/tablet.md
index b021d10..d4656be 100644
--- a/docs/design-docs/tablet.md
+++ b/docs/design-docs/tablet.md
@@ -194,7 +194,7 @@ When the MemRowSet fills up, a Flush occurs, which persists the data to disk.
| DiskRowSet 0 | | DiskRowSet 1 | .. | DiskRowSet N |
+-------------+- +--------------+ +--------------+
```
-When the data is flushed, it is stored as a set of CFiles (see src/kudu/cfile/README).
+When the data is flushed, it is stored as a set of CFiles (see cfile.md).
Each of the rows in the data is addressable by a sequential "rowid", which is
dense, immutable, and unique within this DiskRowSet. For example, if a given
DiskRowSet contains 5 rows, then they will be assigned rowid 0 through 4, in
@@ -203,7 +203,7 @@ rows with the same rowids.
Reads may map between primary keys (user-visible) and rowids (internal) using an index
structure. In the case that the primary key is a simple key, the key structure is
-embedded within the primary key column's cfile. Otherwise, a separate index cfile
+embedded within the primary key column's CFile. Otherwise, a separate index CFile
stores the encoded compound key and provides a similar function.
NOTE: rowids are not explicitly stored with each row, but rather an implicit