Posted to commits@hbase.apache.org by st...@apache.org on 2017/11/04 17:10:04 UTC
[5/7] hbase git commit: HBASE-19179 Remove hbase-prefix-tree
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/PrefixTreeEncoder.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/PrefixTreeEncoder.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/PrefixTreeEncoder.java
deleted file mode 100644
index 8ba8828..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/PrefixTreeEncoder.java
+++ /dev/null
@@ -1,542 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.Cell;
-import org.apache.hadoop.hbase.CellUtil;
-import org.apache.hadoop.hbase.PrivateCellUtil;
-import org.apache.hadoop.hbase.KeyValueUtil;
-import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.other.ColumnNodeType;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
-import org.apache.hadoop.hbase.io.CellOutputStream;
-import org.apache.hadoop.hbase.util.ArrayUtils;
-import org.apache.hadoop.hbase.util.ByteRange;
-import org.apache.hadoop.hbase.util.SimpleMutableByteRange;
-import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
-import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
-import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-import org.apache.hadoop.io.WritableUtils;
-/**
- * This is the primary class for converting Cells into an encoded byte[]; it implements
- * CellOutputStream. As Cells are added they are completely copied into the various encoding
- * structures. This is important because usually the cells being fed in during compactions will
- * be transient.<br>
- * <br>
- * Usage:<br>
- * 1) constructor<br>
- * 2) append cells in sorted order: write(Cell cell)<br>
- * 3) flush()<br>
- */
-@InterfaceAudience.Private
-public class PrefixTreeEncoder implements CellOutputStream {
-
- /**************** static ************************/
-
- protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);
-
- //future-proof for when HBase supports multiple families in a data block.
- public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;
-
- private static final boolean USE_HASH_COLUMN_SORTER = true;
- private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
- private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
-
-
- /**************** fields *************************/
-
- protected long numResets = 0L;
-
- protected OutputStream outputStream;
-
- /*
- * Cannot change during a single block's encoding. If false, then substitute incoming Cell's
- * mvccVersion with zero and write out the block as usual.
- */
- protected boolean includeMvccVersion;
-
- /*
- * reusable ByteRanges used for communicating with the sorters/compilers
- */
- protected ByteRange rowRange;
- protected ByteRange familyRange;
- protected ByteRange qualifierRange;
- protected ByteRange tagsRange;
-
- /*
- * incoming Cell fields are copied into these arrays
- */
- protected long[] timestamps;
- protected long[] mvccVersions;
- protected byte[] typeBytes;
- protected int[] valueOffsets;
- protected int[] tagsOffsets;
- protected byte[] values;
- protected byte[] tags;
-
- protected PrefixTreeBlockMeta blockMeta;
-
- /*
- * Sub-encoders for the simple long/byte fields of a Cell. Add to these as each cell arrives and
- * compile before flushing.
- */
- protected LongEncoder timestampEncoder;
- protected LongEncoder mvccVersionEncoder;
- protected CellTypeEncoder cellTypeEncoder;
-
- /*
- * Structures used for collecting families and qualifiers, de-duplicating them, and sorting them
- * so they can be passed to the tokenizers. Unlike row keys where we can detect duplicates by
- * comparing only with the previous row key, families and qualifiers can arrive in unsorted order
- * in blocks spanning multiple rows. We must collect them all into a set to de-duplicate them.
- */
- protected ByteRangeSet familyDeduplicator;
- protected ByteRangeSet qualifierDeduplicator;
- protected ByteRangeSet tagsDeduplicator;
- /*
- * Feed sorted byte[]s into these tokenizers which will convert the byte[]s to an in-memory
- * trie structure with nodes connected by memory pointers (not serializable yet).
- */
- protected Tokenizer rowTokenizer;
- protected Tokenizer familyTokenizer;
- protected Tokenizer qualifierTokenizer;
- protected Tokenizer tagsTokenizer;
-
- /*
- * Writers take an in-memory trie, sort the nodes, calculate offsets and lengths, and write
- * all information to an output stream of bytes that can be stored on disk.
- */
- protected RowSectionWriter rowWriter;
- protected ColumnSectionWriter familyWriter;
- protected ColumnSectionWriter qualifierWriter;
- protected ColumnSectionWriter tagsWriter;
-
- /*
- * Integers used for counting cells and bytes. We keep track of the size of the Cells as if they
- * were full KeyValues because some parts of HBase like to know the "unencoded size".
- */
- protected int totalCells = 0;
- protected int totalUnencodedBytes = 0;//numBytes if the cells were KeyValues
- protected int totalValueBytes = 0;
- protected int totalTagBytes = 0;
- protected int maxValueLength = 0;
- protected int maxTagLength = 0;
- protected int totalBytes = 0;// total size of the encoded block, accumulated during compile()
-
-
- /***************** construct ***********************/
-
- public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
- // used during cell accumulation
- this.blockMeta = new PrefixTreeBlockMeta();
- this.rowRange = new SimpleMutableByteRange();
- this.familyRange = new SimpleMutableByteRange();
- this.qualifierRange = new SimpleMutableByteRange();
- this.timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
- this.mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
- this.typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
- this.valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
- this.values = new byte[VALUE_BUFFER_INIT_SIZE];
-
- // used during compilation
- this.familyDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
- : new ByteRangeTreeSet();
- this.qualifierDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
- : new ByteRangeTreeSet();
- this.timestampEncoder = new LongEncoder();
- this.mvccVersionEncoder = new LongEncoder();
- this.cellTypeEncoder = new CellTypeEncoder();
- this.rowTokenizer = new Tokenizer();
- this.familyTokenizer = new Tokenizer();
- this.qualifierTokenizer = new Tokenizer();
- this.rowWriter = new RowSectionWriter();
- this.familyWriter = new ColumnSectionWriter();
- this.qualifierWriter = new ColumnSectionWriter();
- initializeTagHelpers();
-
- reset(outputStream, includeMvccVersion);
- }
-
- public void reset(OutputStream outputStream, boolean includeMvccVersion) {
- ++numResets;
- this.includeMvccVersion = includeMvccVersion;
- this.outputStream = outputStream;
- valueOffsets[0] = 0;
- familyDeduplicator.reset();
- qualifierDeduplicator.reset();
- tagsDeduplicator.reset();
- tagsWriter.reset();
- tagsTokenizer.reset();
- rowTokenizer.reset();
- timestampEncoder.reset();
- mvccVersionEncoder.reset();
- cellTypeEncoder.reset();
- familyTokenizer.reset();
- qualifierTokenizer.reset();
- rowWriter.reset();
- familyWriter.reset();
- qualifierWriter.reset();
-
- totalCells = 0;
- totalUnencodedBytes = 0;
- totalValueBytes = 0;
- maxValueLength = 0;
- totalBytes = 0;
- }
-
- protected void initializeTagHelpers() {
- this.tagsRange = new SimpleMutableByteRange();
- this.tagsDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
- : new ByteRangeTreeSet();
- this.tagsTokenizer = new Tokenizer();
- this.tagsWriter = new ColumnSectionWriter();
- }
-
- /**
- * Check that the arrays used to hold cell fragments are large enough for the cell that is being
- * added. Since the PrefixTreeEncoder is cached between uses, these arrays may grow during the
- * first few block encodings but should stabilize quickly.
- */
- protected void ensurePerCellCapacities() {
- int currentCapacity = valueOffsets.length;
- int neededCapacity = totalCells + 2;// some things write one index ahead. +2 to be safe
- if (neededCapacity < currentCapacity) {
- return;
- }
-
- int padding = neededCapacity;//this will double the array size
- timestamps = ArrayUtils.growIfNecessary(timestamps, neededCapacity, padding);
- mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, neededCapacity, padding);
- typeBytes = ArrayUtils.growIfNecessary(typeBytes, neededCapacity, padding);
- valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, neededCapacity, padding);
- }
-
- /******************** CellOutputStream methods *************************/
-
- /**
- * Note: Unused until support is added to the scanner/heap
- * <p/>
- * The following methods are optimized versions of write(Cell cell). The result should be
- * identical, but the implementation may be able to execute them much more efficiently because
- * it does not need to compare the unchanged fields with the previous cell's.
- * <p/>
- * Consider the benefits during compaction when paired with a CellScanner that is also aware of
- * row boundaries. The CellScanner can easily use these methods instead of blindly passing Cells
- * to the write(Cell cell) method.
- * <p/>
- * The savings of skipping duplicate row detection are significant with long row keys. A
- * DataBlockEncoder may store a row key once in combination with a count of how many cells are in
- * the row. With a 100 byte row key, a 100-byte comparison is replaced by a single increment of
- * the counter, and that saving applies to every cell in the row.
- */
-
- /**
- * Add a Cell to the output stream but repeat the previous row.
- */
- //@Override
- public void writeWithRepeatRow(Cell cell) {
- ensurePerCellCapacities();//can we optimize away some of this?
-
- //save a relatively expensive row comparison, incrementing the row's counter instead
- rowTokenizer.incrementNumOccurrencesOfLatestValue();
- addFamilyPart(cell);
- addQualifierPart(cell);
- addAfterRowFamilyQualifier(cell);
- }
-
-
- @Override
- public void write(Cell cell) {
- ensurePerCellCapacities();
-
- rowTokenizer.addSorted(PrivateCellUtil.fillRowRange(cell, rowRange));
- addFamilyPart(cell);
- addQualifierPart(cell);
- addTagPart(cell);
- addAfterRowFamilyQualifier(cell);
- }
-
-
- private void addTagPart(Cell cell) {
- PrivateCellUtil.fillTagRange(cell, tagsRange);
- tagsDeduplicator.add(tagsRange);
- }
-
- /***************** internal add methods ************************/
-
- private void addAfterRowFamilyQualifier(Cell cell){
- // timestamps
- timestamps[totalCells] = cell.getTimestamp();
- timestampEncoder.add(cell.getTimestamp());
-
- // memstore timestamps
- if (includeMvccVersion) {
- mvccVersions[totalCells] = cell.getSequenceId();
- mvccVersionEncoder.add(cell.getSequenceId());
- totalUnencodedBytes += WritableUtils.getVIntSize(cell.getSequenceId());
- }else{
- //must overwrite in case there was a previous version in this array slot
- mvccVersions[totalCells] = 0L;
- if(totalCells == 0){//only need to do this for the first cell added
- mvccVersionEncoder.add(0L);
- }
- //totalUncompressedBytes += 0;//mvccVersion takes zero bytes when disabled
- }
-
- // types
- typeBytes[totalCells] = cell.getTypeByte();
- cellTypeEncoder.add(cell.getTypeByte());
-
- // values
- totalValueBytes += cell.getValueLength();
- // double the array each time we run out of space
- values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
- CellUtil.copyValueTo(cell, values, valueOffsets[totalCells]);
- if (cell.getValueLength() > maxValueLength) {
- maxValueLength = cell.getValueLength();
- }
- valueOffsets[totalCells + 1] = totalValueBytes;
-
- // general
- totalUnencodedBytes += KeyValueUtil.length(cell);
- ++totalCells;
- }
-
- private void addFamilyPart(Cell cell) {
- if (MULITPLE_FAMILIES_POSSIBLE || totalCells == 0) {
- PrivateCellUtil.fillFamilyRange(cell, familyRange);
- familyDeduplicator.add(familyRange);
- }
- }
-
- private void addQualifierPart(Cell cell) {
- PrivateCellUtil.fillQualifierRange(cell, qualifierRange);
- qualifierDeduplicator.add(qualifierRange);
- }
-
-
- /****************** compiling/flushing ********************/
-
- /**
- * Expensive method. The second half of the encoding work happens here.
- *
- * Take all the separate accumulated data structures and turn them into a single stream of bytes
- * which is written to the outputStream.
- */
- @Override
- public void flush() throws IOException {
- compile();
-
- // do the actual flushing to the output stream. Order matters.
- blockMeta.writeVariableBytesToOutputStream(outputStream);
- rowWriter.writeBytes(outputStream);
- familyWriter.writeBytes(outputStream);
- qualifierWriter.writeBytes(outputStream);
- tagsWriter.writeBytes(outputStream);
- timestampEncoder.writeBytes(outputStream);
- mvccVersionEncoder.writeBytes(outputStream);
- //CellType bytes are in the row nodes. there is no additional type section
- outputStream.write(values, 0, totalValueBytes);
- }
-
- /**
- * Now that all the cells have been added, do the work to reduce them to a series of byte[]
- * fragments that are ready to be written to the output stream.
- */
- protected void compile(){
- blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
- int lastValueOffset = valueOffsets[totalCells];
- blockMeta.setValueOffsetWidth(UFIntTool.numBytes(lastValueOffset));
- blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
- blockMeta.setNumValueBytes(totalValueBytes);
- totalBytes += totalTagBytes + totalValueBytes;
-
- //these compile methods will add to totalBytes
- compileTypes();
- compileMvccVersions();
- compileTimestamps();
- compileTags();
- compileQualifiers();
- compileFamilies();
- compileRows();
-
- int numMetaBytes = blockMeta.calculateNumMetaBytes();
- blockMeta.setNumMetaBytes(numMetaBytes);
- totalBytes += numMetaBytes;
- }
-
- /**
- * <p>
- * The following "compile" methods do any intermediate work necessary to transform the cell
- * fragments collected during the writing phase into structures that are ready to write to the
- * outputStream.
- * </p>
- * The family and qualifier treatment is almost identical, as is timestamp and mvccVersion.
- */
-
- protected void compileTypes() {
- blockMeta.setAllSameType(cellTypeEncoder.areAllSameType());
- if(cellTypeEncoder.areAllSameType()){
- blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
- }
- }
-
- protected void compileMvccVersions() {
- mvccVersionEncoder.compile();
- blockMeta.setMvccVersionFields(mvccVersionEncoder);
- int numMvccVersionBytes = mvccVersionEncoder.getOutputArrayLength();
- totalBytes += numMvccVersionBytes;
- }
-
- protected void compileTimestamps() {
- timestampEncoder.compile();
- blockMeta.setTimestampFields(timestampEncoder);
- int numTimestampBytes = timestampEncoder.getOutputArrayLength();
- totalBytes += numTimestampBytes;
- }
-
- protected void compileQualifiers() {
- blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
- qualifierDeduplicator.compile();
- qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
- qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, ColumnNodeType.QUALIFIER);
- qualifierWriter.compile();
- int numQualifierBytes = qualifierWriter.getNumBytes();
- blockMeta.setNumQualifierBytes(numQualifierBytes);
- totalBytes += numQualifierBytes;
- }
-
- protected void compileFamilies() {
- blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
- familyDeduplicator.compile();
- familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
- familyWriter.reconstruct(blockMeta, familyTokenizer, ColumnNodeType.FAMILY);
- familyWriter.compile();
- int numFamilyBytes = familyWriter.getNumBytes();
- blockMeta.setNumFamilyBytes(numFamilyBytes);
- totalBytes += numFamilyBytes;
- }
-
- protected void compileTags() {
- blockMeta.setNumUniqueTags(tagsDeduplicator.size());
- tagsDeduplicator.compile();
- tagsTokenizer.addAll(tagsDeduplicator.getSortedRanges());
- tagsWriter.reconstruct(blockMeta, tagsTokenizer, ColumnNodeType.TAGS);
- tagsWriter.compile();
- int numTagBytes = tagsWriter.getNumBytes();
- blockMeta.setNumTagsBytes(numTagBytes);
- totalBytes += numTagBytes;
- }
-
- protected void compileRows() {
- rowWriter.reconstruct(this);
- rowWriter.compile();
- int numRowBytes = rowWriter.getNumBytes();
- blockMeta.setNumRowBytes(numRowBytes);
- blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
- totalBytes += numRowBytes;
- }
-
- /********************* convenience getters ********************************/
-
- public long getValueOffset(int index) {
- return valueOffsets[index];
- }
-
- public int getValueLength(int index) {
- return (int) (valueOffsets[index + 1] - valueOffsets[index]);
- }
-
- /************************* get/set *************************************/
-
- public PrefixTreeBlockMeta getBlockMeta() {
- return blockMeta;
- }
-
- public Tokenizer getRowTokenizer() {
- return rowTokenizer;
- }
-
- public LongEncoder getTimestampEncoder() {
- return timestampEncoder;
- }
-
- public int getTotalBytes() {
- return totalBytes;
- }
-
- public long[] getTimestamps() {
- return timestamps;
- }
-
- public long[] getMvccVersions() {
- return mvccVersions;
- }
-
- public byte[] getTypeBytes() {
- return typeBytes;
- }
-
- public LongEncoder getMvccVersionEncoder() {
- return mvccVersionEncoder;
- }
-
- public ByteRangeSet getFamilySorter() {
- return familyDeduplicator;
- }
-
- public ByteRangeSet getQualifierSorter() {
- return qualifierDeduplicator;
- }
-
- public ByteRangeSet getTagSorter() {
- return tagsDeduplicator;
- }
-
- public ColumnSectionWriter getFamilyWriter() {
- return familyWriter;
- }
-
- public ColumnSectionWriter getQualifierWriter() {
- return qualifierWriter;
- }
-
- public ColumnSectionWriter getTagWriter() {
- return tagsWriter;
- }
-
- public RowSectionWriter getRowWriter() {
- return rowWriter;
- }
-
- public ByteRange getValueByteRange() {
- return new SimpleMutableByteRange(values, 0, totalValueBytes);
- }
-
-}
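
For reference, a minimal usage sketch of the contract described in the class javadoc above
(construct, append cells in sorted order, flush). It assumes a pre-sorted cell list and uses
only the API of the deleted class:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.util.List;
    import org.apache.hadoop.hbase.Cell;
    import org.apache.hadoop.hbase.codec.prefixtree.encode.PrefixTreeEncoder;

    public class PrefixTreeEncoderUsage {
      // Encode a pre-sorted list of cells into a single prefix-tree block.
      public static byte[] encodeBlock(List<Cell> sortedCells) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        PrefixTreeEncoder encoder = new PrefixTreeEncoder(out, false); // false: drop mvcc versions
        for (Cell cell : sortedCells) {
          encoder.write(cell); // each cell is fully copied into the encoding structures
        }
        encoder.flush(); // compiles the tries and writes all sections to the stream
        return out.toByteArray();
      }
    }
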
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java
deleted file mode 100644
index 13c9c9d..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.column;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.other.ColumnNodeType;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
-import org.apache.hadoop.hbase.util.ByteRange;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hbase.util.Strings;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-import org.apache.hadoop.hbase.util.vint.UVIntTool;
-
-/**
- * <p>
- * Column nodes can be family nodes, qualifier nodes, or tag nodes, as all three sections encode
- * similarly. The family, qualifier, and tag sections of the data block are each made of 1 or more
- * of these nodes.
- * </p>
- * Each node is composed of 3 sections:<br>
- * <ul>
- * <li>tokenLength: UVInt (normally 1 byte) indicating the number of token bytes
- * <li>token[]: the actual token bytes
- * <li>parentStartPosition: the offset of this node's parent from the start of the family or
- * qualifier section
- * </ul>
- */
-@InterfaceAudience.Private
-public class ColumnNodeWriter{
-
- /************* fields ****************************/
-
- protected TokenizerNode builderNode;
- protected PrefixTreeBlockMeta blockMeta;
-
- protected int tokenLength;
- protected byte[] token;
- protected int parentStartPosition;
- protected ColumnNodeType nodeType;
-
-
- /*************** construct **************************/
-
- public ColumnNodeWriter(PrefixTreeBlockMeta blockMeta, TokenizerNode builderNode,
- ColumnNodeType nodeType) {
- this.blockMeta = blockMeta;
- this.builderNode = builderNode;
- this.nodeType = nodeType;
- calculateTokenLength();
- }
-
-
- /************* methods *******************************/
-
- public boolean isRoot() {
- return parentStartPosition == 0;
- }
-
- private void calculateTokenLength() {
- tokenLength = builderNode.getTokenLength();
- token = new byte[tokenLength];
- }
-
- /**
- * This method is called before blockMeta.qualifierOffsetWidth is known, so we pass in a
- * placeholder.
- * @param offsetWidthPlaceholder the placeholder
- * @return node width
- */
- public int getWidthUsingPlaceholderForOffsetWidth(int offsetWidthPlaceholder) {
- int width = 0;
- width += UVIntTool.numBytes(tokenLength);
- width += token.length;
- width += offsetWidthPlaceholder;
- return width;
- }
-
- public void writeBytes(OutputStream os) throws IOException {
- int parentOffsetWidth;
- if (this.nodeType == ColumnNodeType.FAMILY) {
- parentOffsetWidth = blockMeta.getFamilyOffsetWidth();
- } else if (this.nodeType == ColumnNodeType.QUALIFIER) {
- parentOffsetWidth = blockMeta.getQualifierOffsetWidth();
- } else {
- parentOffsetWidth = blockMeta.getTagsOffsetWidth();
- }
- UVIntTool.writeBytes(tokenLength, os);
- os.write(token);
- UFIntTool.writeBytes(parentOffsetWidth, parentStartPosition, os);
- }
-
- public void setTokenBytes(ByteRange source) {
- source.deepCopySubRangeTo(0, tokenLength, token, 0);
- }
-
-
- /****************** standard methods ************************/
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(Strings.padFront(builderNode.getOutputArrayOffset() + "", ' ', 3) + ",");
- sb.append("[");
- sb.append(Bytes.toString(token));
- sb.append("]->");
- sb.append(parentStartPosition);
- return sb.toString();
- }
-
-
- /************************** get/set ***********************/
-
- public void setParentStartPosition(int parentStartPosition) {
- this.parentStartPosition = parentStartPosition;
- }
-
-}
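
To make the three-section node layout above concrete, here is a hand-worked byte layout for one
qualifier node (illustrative, not taken from the source), assuming the token "qual" and a 1-byte
parent offset width:

    // tokenLength (UVInt)         : 0x04             -> four token bytes follow
    // token[]                     : 'q' 'u' 'a' 'l'
    // parentStartPosition (UFInt) : 0x03             -> this node's parent starts at
    //                                                   offset 3 of the qualifier section
    //
    // A root node writes parentStartPosition 0, which is exactly what isRoot() tests.
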
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java
deleted file mode 100644
index 986bc06..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.column;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.other.ColumnNodeType;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
-import org.apache.hadoop.hbase.util.CollectionUtils;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-
-import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists;
-
-/**
- * <p>
- * Takes the tokenized family or qualifier data and flattens it into a stream of bytes. The family
- * section is written after the row section, and qualifier section after family section.
- * </p>
- * The family and qualifier tries, or "column tries", are structured differently than the row trie.
- * The trie cannot be reassembled without external data about the offsets of the leaf nodes, and
- * these external pointers are stored in the nubs and leaves of the row trie. For each cell in a
- * row, the row trie contains a list of offsets into the column sections (along with pointers to
- * timestamps and other per-cell fields). These offsets point to the last column node/token that
- * comprises the column name. To assemble the column name, the trie is traversed in reverse (right
- * to left), with the rightmost tokens pointing to the start of their "parent" node which is the
- * node to the left.
- * <p>
- * This choice was made to reduce the size of the column trie by storing the minimum amount of
- * offset data. As a result, to find a specific qualifier within a row, you must do a binary search
- * of the column nodes, reassembling each one as you search. Future versions of the PrefixTree might
- * encode the columns in both a forward and a reverse trie, converting the binary searches into
- * more efficient trie searches, which would be beneficial for wide rows.
- * </p>
- */
-@InterfaceAudience.Private
-public class ColumnSectionWriter {
-
- public static final int EXPECTED_NUBS_PLUS_LEAVES = 100;
-
- /****************** fields ****************************/
-
- private PrefixTreeBlockMeta blockMeta;
-
- private ColumnNodeType nodeType;
- private Tokenizer tokenizer;
- private int numBytes = 0;
- private ArrayList<TokenizerNode> nonLeaves;
- private ArrayList<TokenizerNode> leaves;
- private ArrayList<TokenizerNode> allNodes;
- private ArrayList<ColumnNodeWriter> columnNodeWriters;
- private List<Integer> outputArrayOffsets;
-
-
- /*********************** construct *********************/
-
- public ColumnSectionWriter() {
- this.nonLeaves = Lists.newArrayList();
- this.leaves = Lists.newArrayList();
- this.outputArrayOffsets = Lists.newArrayList();
- }
-
- public ColumnSectionWriter(PrefixTreeBlockMeta blockMeta, Tokenizer builder,
- ColumnNodeType nodeType) {
- this();// init collections
- reconstruct(blockMeta, builder, nodeType);
- }
-
- public void reconstruct(PrefixTreeBlockMeta blockMeta, Tokenizer builder,
- ColumnNodeType nodeType) {
- this.blockMeta = blockMeta;
- this.tokenizer = builder;
- this.nodeType = nodeType;
- }
-
- public void reset() {
- numBytes = 0;
- nonLeaves.clear();
- leaves.clear();
- outputArrayOffsets.clear();
- }
-
-
- /****************** methods *******************************/
-
- public ColumnSectionWriter compile() {
- if (this.nodeType == ColumnNodeType.FAMILY) {
- // do nothing. max family length fixed at Byte.MAX_VALUE
- } else if (this.nodeType == ColumnNodeType.QUALIFIER) {
- blockMeta.setMaxQualifierLength(tokenizer.getMaxElementLength());
- } else {
- blockMeta.setMaxTagsLength(tokenizer.getMaxElementLength());
- }
- compilerInternals();
- return this;
- }
-
- protected void compilerInternals() {
- tokenizer.setNodeFirstInsertionIndexes();
- tokenizer.appendNodes(nonLeaves, true, false);
-
- tokenizer.appendNodes(leaves, false, true);
-
- allNodes = Lists.newArrayListWithCapacity(nonLeaves.size() + leaves.size());
- allNodes.addAll(nonLeaves);
- allNodes.addAll(leaves);
-
- columnNodeWriters = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(allNodes));
- for (int i = 0; i < allNodes.size(); ++i) {
- TokenizerNode node = allNodes.get(i);
- columnNodeWriters.add(new ColumnNodeWriter(blockMeta, node, this.nodeType));
- }
-
- // leaf widths are known at this point, so add them up
- int totalBytesWithoutOffsets = 0;
- for (int i = allNodes.size() - 1; i >= 0; --i) {
- ColumnNodeWriter columnNodeWriter = columnNodeWriters.get(i);
- // leaves store all but their first token byte
- totalBytesWithoutOffsets += columnNodeWriter.getWidthUsingPlaceholderForOffsetWidth(0);
- }
-
- // figure out how wide our offset FInts are
- int parentOffsetWidth = 0;
- while (true) {
- ++parentOffsetWidth;
- int numBytesFinder = totalBytesWithoutOffsets + parentOffsetWidth * allNodes.size();
- if (numBytesFinder < UFIntTool.maxValueForNumBytes(parentOffsetWidth)) {
- // it fits
- numBytes = numBytesFinder;
- break;
- }
- }
- if (this.nodeType == ColumnNodeType.FAMILY) {
- blockMeta.setFamilyOffsetWidth(parentOffsetWidth);
- } else if (this.nodeType == ColumnNodeType.QUALIFIER) {
- blockMeta.setQualifierOffsetWidth(parentOffsetWidth);
- } else {
- blockMeta.setTagsOffsetWidth(parentOffsetWidth);
- }
-
- int forwardIndex = 0;
- for (int i = 0; i < allNodes.size(); ++i) {
- TokenizerNode node = allNodes.get(i);
- ColumnNodeWriter columnNodeWriter = columnNodeWriters.get(i);
- int fullNodeWidth = columnNodeWriter
- .getWidthUsingPlaceholderForOffsetWidth(parentOffsetWidth);
- node.setOutputArrayOffset(forwardIndex);
- columnNodeWriter.setTokenBytes(node.getToken());
- if (node.isRoot()) {
- columnNodeWriter.setParentStartPosition(0);
- } else {
- columnNodeWriter.setParentStartPosition(node.getParent().getOutputArrayOffset());
- }
- forwardIndex += fullNodeWidth;
- }
-
- tokenizer.appendOutputArrayOffsets(outputArrayOffsets);
- }
-
- public void writeBytes(OutputStream os) throws IOException {
- for (ColumnNodeWriter columnNodeWriter : columnNodeWriters) {
- columnNodeWriter.writeBytes(os);
- }
- }
-
-
- /************* get/set **************************/
-
- public ArrayList<ColumnNodeWriter> getColumnNodeWriters() {
- return columnNodeWriters;
- }
-
- public int getNumBytes() {
- return numBytes;
- }
-
- public int getOutputArrayOffset(int sortedIndex) {
- return outputArrayOffsets.get(sortedIndex);
- }
-
- public ArrayList<TokenizerNode> getNonLeaves() {
- return nonLeaves;
- }
-
- public ArrayList<TokenizerNode> getLeaves() {
- return leaves;
- }
-
-}
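
The right-to-left reassembly described in the class comment can be sketched without the HBase
internals. The helper below is hypothetical (the real reading code lives in the decode package,
which this commit also removes); it models each column node as a token plus a parent offset and
walks from a leaf back to the root, prepending tokens:

    // Simplified model: tokens[i] and parentOffsets[i] describe node i, with a
    // negative parent offset marking the root. Traversal is leaf -> root.
    static String reassembleColumnName(String[] tokens, int[] parentOffsets, int leafIndex) {
      StringBuilder name = new StringBuilder();
      for (int i = leafIndex; i >= 0; i = parentOffsets[i]) {
        name.insert(0, tokens[i]); // prepend, since we traverse right to left
      }
      return name.toString();
    }

    // Example: tokens = {"q", "ual", "ifier"}, parentOffsets = {-1, 0, 1};
    // reassembleColumnName(tokens, parentOffsets, 2) returns "qualifier".
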
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java
deleted file mode 100644
index 73398f6..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.other;
-
-import org.apache.yetus.audience.InterfaceAudience;
-
-/**
- * Detect if every KV has the same KeyValue.Type, in which case we don't need to store it for each
- * KV. If all types are the same during conversion to byte[], then the single "onlyType" can be
- * stored in blockMeta rather than repeated for each cell, saving 1 byte per cell.
- */
-@InterfaceAudience.Private
-public class CellTypeEncoder {
-
- /************* fields *********************/
-
- protected boolean pendingFirstType = true;
- protected boolean allSameType = true;
- protected byte onlyType;
-
-
- /************* construct *********************/
-
- public void reset() {
- pendingFirstType = true;
- allSameType = true;
- }
-
-
- /************* methods *************************/
-
- public void add(byte type) {
- if (pendingFirstType) {
- onlyType = type;
- pendingFirstType = false;
- } else if (onlyType != type) {
- allSameType = false;
- }
- }
-
-
- /**************** get/set **************************/
-
- public boolean areAllSameType() {
- return allSameType;
- }
-
- public byte getOnlyType() {
- return onlyType;
- }
-
-}
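
A hedged sketch of how this encoder is driven, mirroring the add() calls in
PrefixTreeEncoder.addAfterRowFamilyQualifier() and the blockMeta handoff in compileTypes() above:

    static void encodeTypes(Iterable<Cell> cells, PrefixTreeBlockMeta blockMeta) {
      CellTypeEncoder typeEncoder = new CellTypeEncoder();
      typeEncoder.reset();
      for (Cell cell : cells) {
        typeEncoder.add(cell.getTypeByte()); // one call per cell, in any order
      }
      blockMeta.setAllSameType(typeEncoder.areAllSameType());
      if (typeEncoder.areAllSameType()) {
        // the single type is stored once in the block meta:
        // 1 byte total instead of 1 byte per cell
        blockMeta.setAllTypes(typeEncoder.getOnlyType());
      }
    }
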
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/ColumnNodeType.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/ColumnNodeType.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/ColumnNodeType.java
deleted file mode 100644
index 090f143..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/ColumnNodeType.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hbase.codec.prefixtree.encode.other;
-
-import org.apache.yetus.audience.InterfaceAudience;
-
-/**
- * Specifies the type of column node writer.
- */
-@InterfaceAudience.Private
-public enum ColumnNodeType {
- FAMILY, QUALIFIER, TAGS;
-}
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/LongEncoder.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/LongEncoder.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/LongEncoder.java
deleted file mode 100644
index e5d153e..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/other/LongEncoder.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.other;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.HashSet;
-
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.util.ArrayUtils;
-import org.apache.hadoop.hbase.util.CollectionUtils;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-
-import org.apache.hadoop.hbase.shaded.com.google.common.base.Joiner;
-
-/**
- * Used to de-duplicate, sort, minimize/diff, and serialize timestamps and mvccVersions from a
- * collection of Cells.
- *
- * 1. add longs to a HashSet for fast de-duplication
- * 2. keep track of the min and max
- * 3. copy all values to a new long[]
- * 4. Arrays.sort the long[]
- * 5. calculate maxDelta = max - min
- * 6. determine FInt width based on maxDelta
- * 7. PrefixTreeEncoder binary searches to find index of each value
- */
-@InterfaceAudience.Private
-public class LongEncoder {
-
- /****************** fields ****************************/
-
- protected HashSet<Long> uniqueValues;
- protected long[] sortedUniqueValues;
- protected long min, max, maxDelta;
-
- protected int bytesPerDelta;
- protected int bytesPerIndex;
- protected int totalCompressedBytes;
-
-
- /****************** construct ****************************/
-
- public LongEncoder() {
- this.uniqueValues = new HashSet<>();
- }
-
- public void reset() {
- uniqueValues.clear();
- sortedUniqueValues = null;
- min = Long.MAX_VALUE;
- max = Long.MIN_VALUE;
- maxDelta = Long.MIN_VALUE;
- bytesPerIndex = 0;
- bytesPerDelta = 0;
- totalCompressedBytes = 0;
- }
-
-
- /************* methods ***************************/
-
- public void add(long timestamp) {
- uniqueValues.add(timestamp);
- }
-
- public LongEncoder compile() {
- int numUnique = uniqueValues.size();
- if (numUnique == 1) {
- min = CollectionUtils.getFirst(uniqueValues);
- sortedUniqueValues = new long[] { min };
- return this;
- }
-
- sortedUniqueValues = new long[numUnique];
- int lastIndex = -1;
- for (long value : uniqueValues) {
- sortedUniqueValues[++lastIndex] = value;
- }
- Arrays.sort(sortedUniqueValues);
- min = ArrayUtils.getFirst(sortedUniqueValues);
- max = ArrayUtils.getLast(sortedUniqueValues);
- maxDelta = max - min;
- if (maxDelta > 0) {
- bytesPerDelta = UFIntTool.numBytes(maxDelta);
- } else {
- bytesPerDelta = 0;
- }
-
- int maxIndex = numUnique - 1;
- bytesPerIndex = UFIntTool.numBytes(maxIndex);
-
- totalCompressedBytes = numUnique * bytesPerDelta;
-
- return this;
- }
-
- public long getDelta(int index) {
- if (sortedUniqueValues.length == 0) {
- return 0;
- }
- return sortedUniqueValues[index] - min;
- }
-
- public int getIndex(long value) {
- // should always find an exact match
- return Arrays.binarySearch(sortedUniqueValues, value);
- }
-
- public void writeBytes(OutputStream os) throws IOException {
- for (int i = 0; i < sortedUniqueValues.length; ++i) {
- long delta = sortedUniqueValues[i] - min;
- UFIntTool.writeBytes(bytesPerDelta, delta, os);
- }
- }
-
- //convenience method for tests
- public byte[] getByteArray() throws IOException{
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- writeBytes(baos);
- return baos.toByteArray();
- }
-
- public int getOutputArrayLength() {
- return sortedUniqueValues.length * bytesPerDelta;
- }
-
- public int getNumUniqueValues() {
- return sortedUniqueValues.length;
- }
-
-
- /******************* Object methods **********************/
-
- @Override
- public String toString() {
- if (ArrayUtils.isEmpty(sortedUniqueValues)) {
- return "[]";
- }
- return "[" + Joiner.on(",").join(ArrayUtils.toList(sortedUniqueValues)) + "]";
- }
-
-
- /******************** get/set **************************/
-
- public long getMin() {
- return min;
- }
-
- public int getBytesPerDelta() {
- return bytesPerDelta;
- }
-
- public int getBytesPerIndex() {
- return bytesPerIndex;
- }
-
- public int getTotalCompressedBytes() {
- return totalCompressedBytes;
- }
-
- public long[] getSortedUniqueTimestamps() {
- return sortedUniqueValues;
- }
-
-}
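
A worked example of the compile() scheme above, using illustrative timestamps:

    // add(1000); add(1005); add(1000); add(1250);  -> uniqueValues {1000, 1005, 1250}
    // compile():
    //   sortedUniqueValues = [1000, 1005, 1250]
    //   min = 1000, max = 1250, maxDelta = 250
    //   bytesPerDelta = UFIntTool.numBytes(250) = 1  (250 fits in a single byte)
    //   bytesPerIndex = UFIntTool.numBytes(2)   = 1  (the largest index is 2)
    // writeBytes() then emits the deltas 0, 5, 250 as 1-byte UFInts, and each cell
    // stores only a bytesPerIndex-wide index into this table (found via getIndex())
    // instead of a full 8-byte long.
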
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowNodeWriter.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowNodeWriter.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowNodeWriter.java
deleted file mode 100644
index 6e114e9..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowNodeWriter.java
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.row;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.PrefixTreeEncoder;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
-import org.apache.hadoop.hbase.util.ByteRangeUtils;
-import org.apache.hadoop.hbase.util.CollectionUtils;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-import org.apache.hadoop.hbase.util.vint.UVIntTool;
-
-/**
- * Serializes the fields comprising one node of the row trie, which can be a branch, nub, or leaf.
- * Please see the write() method for the order in which data is written.
- */
-@InterfaceAudience.Private
-public class RowNodeWriter{
- protected static final Log LOG = LogFactory.getLog(RowNodeWriter.class);
-
- /********************* fields ******************************/
-
- protected PrefixTreeEncoder prefixTreeEncoder;
- protected PrefixTreeBlockMeta blockMeta;
- protected TokenizerNode tokenizerNode;
-
- protected int tokenWidth;
- protected int fanOut;
- protected int numCells;
-
- protected int width;
-
-
- /*********************** construct *************************/
-
- public RowNodeWriter(PrefixTreeEncoder keyValueBuilder, TokenizerNode tokenizerNode) {
- reconstruct(keyValueBuilder, tokenizerNode);
- }
-
- public void reconstruct(PrefixTreeEncoder prefixTreeEncoder, TokenizerNode tokenizerNode) {
- this.prefixTreeEncoder = prefixTreeEncoder;
- reset(tokenizerNode);
- }
-
- public void reset(TokenizerNode node) {
- this.blockMeta = prefixTreeEncoder.getBlockMeta();// changes between blocks
- this.tokenizerNode = node;
- this.tokenWidth = 0;
- this.fanOut = 0;
- this.numCells = 0;
- this.width = 0;
- calculateOffsetsAndLengths();
- }
-
-
- /********************* methods ****************************/
-
- protected void calculateOffsetsAndLengths(){
- tokenWidth = tokenizerNode.getTokenLength();
- if(!tokenizerNode.isRoot()){
- --tokenWidth;//non-root tokens skip their first byte; it is stored in the parent's fan
- }
- fanOut = CollectionUtils.nullSafeSize(tokenizerNode.getChildren());
- numCells = tokenizerNode.getNumOccurrences();
- }
-
- public int calculateWidth(){
- calculateWidthOverrideOffsetWidth(blockMeta.getNextNodeOffsetWidth());
- return width;
- }
-
- public int calculateWidthOverrideOffsetWidth(int offsetWidth){
- width = 0;
- width += UVIntTool.numBytes(tokenWidth);
- width += tokenWidth;
-
- width += UVIntTool.numBytes(fanOut);
- width += fanOut;
-
- width += UVIntTool.numBytes(numCells);
-
- if(tokenizerNode.hasOccurrences()){
- int fixedBytesPerCell = blockMeta.getFamilyOffsetWidth()
- + blockMeta.getQualifierOffsetWidth()
- + blockMeta.getTagsOffsetWidth()
- + blockMeta.getTimestampIndexWidth()
- + blockMeta.getMvccVersionIndexWidth()
- + blockMeta.getKeyValueTypeWidth()
- + blockMeta.getValueOffsetWidth()
- + blockMeta.getValueLengthWidth();
- width += numCells * fixedBytesPerCell;
- }
-
- if (!tokenizerNode.isLeaf()) {
- width += fanOut * offsetWidth;
- }
-
- return width;
- }
-
-
- /*********************** writing the compiled structure to the OutputStream ***************/
-
- public void write(OutputStream os) throws IOException{
- //info about this row trie node
- writeRowToken(os);
- writeFan(os);
- writeNumCells(os);
-
- //UFInt indexes and offsets for each cell in the row (if nub or leaf)
- writeFamilyNodeOffsets(os);
- writeQualifierNodeOffsets(os);
- writeTagNodeOffsets(os);
- writeTimestampIndexes(os);
- writeMvccVersionIndexes(os);
- writeCellTypes(os);
- writeValueOffsets(os);
- writeValueLengths(os);
- //offsets to the children of this row trie node (if branch or nub)
- writeNextRowTrieNodeOffsets(os);
- }
-
-
- /**
- * Row node token, fan, and numCells. Written once at the beginning of each row node. These 3
- * fields can reproduce all the row keys that compose the block.
- */
-
- /**
- * UVInt: tokenWidth
- * bytes: token
- */
- protected void writeRowToken(OutputStream os) throws IOException {
- UVIntTool.writeBytes(tokenWidth, os);
- int tokenStartIndex = tokenizerNode.isRoot() ? 0 : 1;
- ByteRangeUtils.write(os, tokenizerNode.getToken(), tokenStartIndex);
- }
-
- /**
- * UVInt: numFanBytes/fanOut
- * bytes: each fan byte
- */
- public void writeFan(OutputStream os) throws IOException {
- UVIntTool.writeBytes(fanOut, os);
- if (fanOut <= 0) {
- return;
- }
- ArrayList<TokenizerNode> children = tokenizerNode.getChildren();
- for (int i = 0; i < children.size(); ++i) {
- TokenizerNode child = children.get(i);
- os.write(child.getToken().get(0));// first byte of each child's token
- }
- }
-
- /**
- * UVInt: numCells, the number of cells in this row, which will be 0 for branch nodes
- */
- protected void writeNumCells(OutputStream os) throws IOException {
- UVIntTool.writeBytes(numCells, os);
- }
-
-
- /**
- * The following methods write data for each cell in the row, mostly consisting of indexes or
- * offsets into the timestamp/column data structures that are written in the middle of the block.
- * We use {@link UFIntTool} to encode these indexes/offsets to allow random access during a binary
- * search of a particular column/timestamp combination.
- * <p>
- * Branch nodes will not have any data in these sections.
- * </p>
- */
-
- protected void writeFamilyNodeOffsets(OutputStream os) throws IOException {
- if (blockMeta.getFamilyOffsetWidth() <= 0) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = PrefixTreeEncoder.MULITPLE_FAMILIES_POSSIBLE ? tokenizerNode
- .getFirstInsertionIndex() + i : 0;
- int sortedIndex = prefixTreeEncoder.getFamilySorter().getSortedIndexForInsertionId(
- cellInsertionIndex);
- int indexedFamilyOffset = prefixTreeEncoder.getFamilyWriter().getOutputArrayOffset(
- sortedIndex);
- UFIntTool.writeBytes(blockMeta.getFamilyOffsetWidth(), indexedFamilyOffset, os);
- }
- }
-
- protected void writeQualifierNodeOffsets(OutputStream os) throws IOException {
- if (blockMeta.getQualifierOffsetWidth() <= 0) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- int sortedIndex = prefixTreeEncoder.getQualifierSorter().getSortedIndexForInsertionId(
- cellInsertionIndex);
- int indexedQualifierOffset = prefixTreeEncoder.getQualifierWriter().getOutputArrayOffset(
- sortedIndex);
- UFIntTool.writeBytes(blockMeta.getQualifierOffsetWidth(), indexedQualifierOffset, os);
- }
- }
-
- protected void writeTagNodeOffsets(OutputStream os) throws IOException {
- if (blockMeta.getTagsOffsetWidth() <= 0) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- int sortedIndex = prefixTreeEncoder.getTagSorter().getSortedIndexForInsertionId(
- cellInsertionIndex);
- int indexedTagOffset = prefixTreeEncoder.getTagWriter().getOutputArrayOffset(
- sortedIndex);
- UFIntTool.writeBytes(blockMeta.getTagsOffsetWidth(), indexedTagOffset, os);
- }
- }
-
- protected void writeTimestampIndexes(OutputStream os) throws IOException {
- if (blockMeta.getTimestampIndexWidth() <= 0) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- long timestamp = prefixTreeEncoder.getTimestamps()[cellInsertionIndex];
- int timestampIndex = prefixTreeEncoder.getTimestampEncoder().getIndex(timestamp);
- UFIntTool.writeBytes(blockMeta.getTimestampIndexWidth(), timestampIndex, os);
- }
- }
-
- protected void writeMvccVersionIndexes(OutputStream os) throws IOException {
- if (blockMeta.getMvccVersionIndexWidth() <= 0) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- long mvccVersion = prefixTreeEncoder.getMvccVersions()[cellInsertionIndex];
- int mvccVersionIndex = prefixTreeEncoder.getMvccVersionEncoder().getIndex(mvccVersion);
- UFIntTool.writeBytes(blockMeta.getMvccVersionIndexWidth(), mvccVersionIndex, os);
- }
- }
-
- protected void writeCellTypes(OutputStream os) throws IOException {
- if (blockMeta.isAllSameType()) {
- return;
- }
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- os.write(prefixTreeEncoder.getTypeBytes()[cellInsertionIndex]);
- }
- }
-
- protected void writeValueOffsets(OutputStream os) throws IOException {
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- long valueStartIndex = prefixTreeEncoder.getValueOffset(cellInsertionIndex);
- UFIntTool.writeBytes(blockMeta.getValueOffsetWidth(), valueStartIndex, os);
- }
- }
-
- protected void writeValueLengths(OutputStream os) throws IOException {
- for (int i = 0; i < numCells; ++i) {
- int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
- int valueLength = prefixTreeEncoder.getValueLength(cellInsertionIndex);
- UFIntTool.writeBytes(blockMeta.getValueLengthWidth(), valueLength, os);
- }
- }
-
- /**
- * If a branch or a nub, the last things we append are the UFInt offsets to the child row nodes.
- */
- protected void writeNextRowTrieNodeOffsets(OutputStream os) throws IOException {
- ArrayList<TokenizerNode> children = tokenizerNode.getChildren();
- for (int i = 0; i < children.size(); ++i) {
- TokenizerNode child = children.get(i);
- int distanceToChild = tokenizerNode.getNegativeIndex() - child.getNegativeIndex();
- UFIntTool.writeBytes(blockMeta.getNextNodeOffsetWidth(), distanceToChild, os);
- }
- }
-}
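
Summarizing the write() order above, an encoded row node lays out roughly as follows; this is a
reading of the methods in the deleted file, with field widths fixed per block by blockMeta:

    // UVInt  tokenWidth, then the token bytes                     writeRowToken()
    //        (non-root nodes omit the first token byte; it lives in the parent's fan)
    // UVInt  fanOut, then the first byte of each child's token    writeFan()
    // UVInt  numCells                                             writeNumCells()
    // per cell (nubs and leaves only):
    //   UFInt family offset, qualifier offset, tag offset,
    //   UFInt timestamp index, mvcc index,
    //   type byte (only when the block's cells have mixed types),
    //   UFInt value offset, UFInt value length
    // per child (branches and nubs only):
    //   UFInt offset to the child row node                        writeNextRowTrieNodeOffsets()
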
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowSectionWriter.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowSectionWriter.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowSectionWriter.java
deleted file mode 100644
index 3d9fa13..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/row/RowSectionWriter.java
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.row;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.PrefixTreeEncoder;
-import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
-import org.apache.hadoop.hbase.util.vint.UFIntTool;
-
-import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists;
-
-/**
- * Most of the complexity of the PrefixTree is contained in the "row section". It contains the row
- * key trie structure used to search and recreate all the row keys. Each nub and leaf in this trie
- * also contains references to offsets in the other sections of the data block that enable the
- * decoder to match a row key with its qualifier, timestamp, type, value, etc.
- * <p>
- * The row section is a concatenated collection of {@link RowNodeWriter}s. See that class for the
- * internals of each row node.
- */
-@InterfaceAudience.Private
-public class RowSectionWriter {
-
- /***************** fields **************************/
-
- protected PrefixTreeEncoder prefixTreeEncoder;
-
- protected PrefixTreeBlockMeta blockMeta;
-
- protected int numBytes;
-
- protected ArrayList<TokenizerNode> nonLeaves;
- protected ArrayList<TokenizerNode> leaves;
-
- protected ArrayList<RowNodeWriter> leafWriters;
- protected ArrayList<RowNodeWriter> nonLeafWriters;
-
- protected int numLeafWriters;
- protected int numNonLeafWriters;
-
-
- /********************* construct **********************/
-
- public RowSectionWriter() {
- this.nonLeaves = Lists.newArrayList();
- this.leaves = Lists.newArrayList();
- this.leafWriters = Lists.newArrayList();
- this.nonLeafWriters = Lists.newArrayList();
- }
-
- public RowSectionWriter(PrefixTreeEncoder prefixTreeEncoder) {
- reconstruct(prefixTreeEncoder);
- }
-
- public void reconstruct(PrefixTreeEncoder prefixTreeEncoder) {
- this.prefixTreeEncoder = prefixTreeEncoder;
- this.blockMeta = prefixTreeEncoder.getBlockMeta();
- reset();
- }
-
- public void reset() {
- numBytes = 0;
- nonLeaves.clear();
- leaves.clear();
- numLeafWriters = 0;
- numNonLeafWriters = 0;
- }
-
-
- /****************** methods *******************************/
-
- public RowSectionWriter compile() {
- blockMeta.setMaxRowLength(prefixTreeEncoder.getRowTokenizer().getMaxElementLength());
- prefixTreeEncoder.getRowTokenizer().setNodeFirstInsertionIndexes();
-
- prefixTreeEncoder.getRowTokenizer().appendNodes(nonLeaves, true, false);
- prefixTreeEncoder.getRowTokenizer().appendNodes(leaves, false, true);
-
- // track the starting position of each node in final output
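- // (negativeIndex counts bytes back from the end of the row section: leaves, which are
- // written last, get the smallest values, and a parent-to-child pointer is then just the
- // difference between the two nodes' negativeIndexes)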
- int negativeIndex = 0;
-
- // create leaf writer nodes
- // leaf widths are known at this point, so add them up
- int totalLeafBytes = 0;
- for (int i = leaves.size() - 1; i >= 0; --i) {
- TokenizerNode leaf = leaves.get(i);
- RowNodeWriter leafWriter = initializeWriter(leafWriters, numLeafWriters, leaf);
- ++numLeafWriters;
- // leaves store all but their first token byte
- int leafNodeWidth = leafWriter.calculateWidthOverrideOffsetWidth(0);
- totalLeafBytes += leafNodeWidth;
- negativeIndex += leafNodeWidth;
- leaf.setNegativeIndex(negativeIndex);
- }
-
- int totalNonLeafBytesWithoutOffsets = 0;
- int totalChildPointers = 0;
- for (int i = nonLeaves.size() - 1; i >= 0; --i) {
- TokenizerNode nonLeaf = nonLeaves.get(i);
- RowNodeWriter nonLeafWriter = initializeWriter(nonLeafWriters, numNonLeafWriters, nonLeaf);
- ++numNonLeafWriters;
- totalNonLeafBytesWithoutOffsets += nonLeafWriter.calculateWidthOverrideOffsetWidth(0);
- totalChildPointers += nonLeaf.getNumChildren();
- }
-
- // figure out how wide our offset UFInts need to be
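- // The choice is circular: wider offsets enlarge the section, which may itself demand wider
- // offsets. Trying widths 1, 2, 3... and taking the first width whose UFInt range covers the
- // resulting total always terminates. As a rough example, a ~200 byte section with 10 child
- // pointers fits in width 1, since 200 + 10 * 1 = 210 is within a single byte's 0-255 range.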
- int offsetWidth = 0;
- while (true) {
- ++offsetWidth;
- int offsetBytes = totalChildPointers * offsetWidth;
- int totalRowBytes = totalNonLeafBytesWithoutOffsets + offsetBytes + totalLeafBytes;
- if (totalRowBytes < UFIntTool.maxValueForNumBytes(offsetWidth)) {
- // it fits
- numBytes = totalRowBytes;
- break;
- }
- }
- blockMeta.setNextNodeOffsetWidth(offsetWidth);
-
- // populate negativeIndexes
- for (int i = nonLeaves.size() - 1; i >= 0; --i) {
- TokenizerNode nonLeaf = nonLeaves.get(i);
- int writerIndex = nonLeaves.size() - i - 1;
- RowNodeWriter nonLeafWriter = nonLeafWriters.get(writerIndex);
- int nodeWidth = nonLeafWriter.calculateWidth();
- negativeIndex += nodeWidth;
- nonLeaf.setNegativeIndex(negativeIndex);
- }
-
- return this;
- }
-
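- /**
- * Fetches the writer at the given index, growing the pool with a fresh RowNodeWriter when
- * the index is past the current end; either way the writer is reset against builderNode
- * before being returned.
- */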
- protected RowNodeWriter initializeWriter(List<RowNodeWriter> list, int index,
- TokenizerNode builderNode) {
- // check if there is an existing writer we can recycle
- if (index >= list.size()) {
- // there are not enough existing writers, so add a new one which will be retrieved below
- list.add(new RowNodeWriter(prefixTreeEncoder, builderNode));
- }
- RowNodeWriter rowNodeWriter = list.get(index);
- rowNodeWriter.reset(builderNode);
- return rowNodeWriter;
- }
-
-
- public void writeBytes(OutputStream os) throws IOException {
- for (int i = numNonLeafWriters - 1; i >= 0; --i) {
- RowNodeWriter nonLeafWriter = nonLeafWriters.get(i);
- nonLeafWriter.write(os);
- }
- // same loop shape as the non-leaf pass above; kept separate for clarity
- for (int i = numLeafWriters - 1; i >= 0; --i) {
- RowNodeWriter leafWriter = leafWriters.get(i);
- leafWriter.write(os);
- }
- }
-
-
- /***************** static ******************************/
-
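- /**
- * Returns only the leaf nodes or only the non-leaf nodes of the input, in reverse order.
- */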
- protected static ArrayList<TokenizerNode> filterByLeafAndReverse(
- ArrayList<TokenizerNode> ins, boolean leaves) {
- ArrayList<TokenizerNode> outs = Lists.newArrayList();
- for (int i = ins.size() - 1; i >= 0; --i) {
- TokenizerNode n = ins.get(i);
- if (n.isLeaf() == leaves) {
- outs.add(n);
- }
- }
- return outs;
- }
-
-
- /************* get/set **************************/
-
- public int getNumBytes() {
- return numBytes;
- }
-
- public ArrayList<TokenizerNode> getNonLeaves() {
- return nonLeaves;
- }
-
- public ArrayList<TokenizerNode> getLeaves() {
- return leaves;
- }
-
- public ArrayList<RowNodeWriter> getNonLeafWriters() {
- return nonLeafWriters;
- }
-
- public ArrayList<RowNodeWriter> getLeafWriters() {
- return leafWriters;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hbase/blob/f8c58930/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java
----------------------------------------------------------------------
diff --git a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java b/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java
deleted file mode 100644
index 870409c..0000000
--- a/hbase-prefix-tree/src/main/java/org/apache/hadoop/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.yetus.audience.InterfaceAudience;
-import org.apache.hadoop.hbase.util.ArrayUtils;
-import org.apache.hadoop.hbase.util.ByteRange;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hbase.util.CollectionUtils;
-
-import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists;
-
-/**
- * Data structure used in the first stage of PrefixTree encoding:
- * <ul>
- * <li>accepts a sorted stream of ByteRanges
- * <li>splits them into a set of tokens, each held by a {@link TokenizerNode}
- * <li>connects the TokenizerNodes via standard java references
- * <li>keeps a pool of TokenizerNodes and a reusable byte[] for holding all token content
- * </ul>
- * <p>
- * Mainly used for turning Cell rowKeys into a trie, but also used for family and qualifier
- * encoding.
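- * <p>
- * For example, adding the sorted elements "rowA11", "rowA22" and "rowB33" yields a root
- * node holding the token "row" with children "A" and "B33", where "A" in turn has the leaf
- * children "11" and "22" (a sketch of the prefix-splitting behavior, not an exact dump).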
- */
-@InterfaceAudience.Private
-public class Tokenizer {
-
- /***************** fields **************************/
-
- protected int numArraysAdded = 0;
- protected long lastNodeId = -1;
- protected ArrayList<TokenizerNode> nodes;
- protected int numNodes;
- protected TokenizerNode root;
- protected byte[] tokens;
- protected int tokensLength;
-
- protected int maxElementLength = 0;
- // number of levels in the tree assuming root level is 0
- protected int treeDepth = 0;
-
-
- /******************* construct *******************/
-
- public Tokenizer() {
- this.nodes = Lists.newArrayList();
- this.tokens = new byte[0];
- }
-
- public void reset() {
- numArraysAdded = 0;
- lastNodeId = -1;
- numNodes = 0;
- tokensLength = 0;
- root = null;
- maxElementLength = 0;
- treeDepth = 0;
- }
-
-
- /***************** building *************************/
-
- public void addAll(ArrayList<ByteRange> sortedByteRanges) {
- for (int i = 0; i < sortedByteRanges.size(); ++i) {
- ByteRange byteRange = sortedByteRanges.get(i);
- addSorted(byteRange);
- }
- }
-
- public void addSorted(final ByteRange bytes) {
- ++numArraysAdded;
- if (bytes.getLength() > maxElementLength) {
- maxElementLength = bytes.getLength();
- }
- if (root == null) {
- // nodeDepth of firstNode (non-root) is 1
- root = addNode(null, 1, 0, bytes, 0);
- } else {
- root.addSorted(bytes);
- }
- }
-
- public void incrementNumOccurrencesOfLatestValue() {
- CollectionUtils.getLast(nodes).incrementNumOccurrences(1);
- }
-
- protected long nextNodeId() {
- return ++lastNodeId;
- }
-
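- /**
- * Creates a node for the remaining bytes of the given token. TokenizerNodes are pooled:
- * when the Tokenizer is reset and refilled, previously allocated nodes are reconstructed
- * in place rather than reallocated, keeping garbage collection pressure low.
- */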
- protected TokenizerNode addNode(TokenizerNode parent, int nodeDepth, int tokenStartOffset,
- final ByteRange token, int inputTokenOffset) {
- int inputTokenLength = token.getLength() - inputTokenOffset;
- int tokenOffset = appendTokenAndRepointByteRange(token, inputTokenOffset);
- TokenizerNode node = null;
- if (nodes.size() <= numNodes) {
- node = new TokenizerNode(this, parent, nodeDepth, tokenStartOffset, tokenOffset,
- inputTokenLength);
- nodes.add(node);
- } else {
- node = nodes.get(numNodes);
- node.reset();
- node.reconstruct(this, parent, nodeDepth, tokenStartOffset, tokenOffset, inputTokenLength);
- }
- ++numNodes;
- return node;
- }
-
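- /**
- * Copies the token's remaining bytes into the shared tokens array, growing the array to
- * twice the required size when it runs out of room (amortized O(1) per byte), and returns
- * the offset at which the bytes were written.
- */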
- protected int appendTokenAndRepointByteRange(final ByteRange token, int inputTokenOffset) {
- int newOffset = tokensLength;
- int inputTokenLength = token.getLength() - inputTokenOffset;
- int newMinimum = tokensLength + inputTokenLength;
- tokens = ArrayUtils.growIfNecessary(tokens, newMinimum, 2 * newMinimum);
- token.deepCopySubRangeTo(inputTokenOffset, inputTokenLength, tokens, tokensLength);
- tokensLength += inputTokenLength;
- return newOffset;
- }
-
- protected void submitMaxNodeDepthCandidate(int nodeDepth) {
- if (nodeDepth > treeDepth) {
- treeDepth = nodeDepth;
- }
- }
-
-
- /********************* read ********************/
-
- public int getNumAdded() {
- return numArraysAdded;
- }
-
- // for debugging
- public ArrayList<TokenizerNode> getNodes(boolean includeNonLeaves, boolean includeLeaves) {
- ArrayList<TokenizerNode> outputNodes = Lists.newArrayList();
- root.appendNodesToExternalList(outputNodes, includeNonLeaves, includeLeaves);
- return outputNodes;
- }
-
- public void appendNodes(List<TokenizerNode> appendTo, boolean includeNonLeaves,
- boolean includeLeaves) {
- root.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
- }
-
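- /**
- * Rebuilds the complete list of input arrays from the trie, emitting one byte[] per
- * recorded occurrence of each node, in trie-walk order.
- */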
- public List<byte[]> getArrays() {
- List<TokenizerNode> treeNodes = new ArrayList<>();
- root.appendNodesToExternalList(treeNodes, true, true);
- List<byte[]> byteArrays = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(treeNodes));
- for (int i = 0; i < treeNodes.size(); ++i) {
- TokenizerNode node = treeNodes.get(i);
- for (int j = 0; j < node.getNumOccurrences(); ++j) {
- byte[] byteArray = node.getNewByteArray();
- byteArrays.add(byteArray);
- }
- }
- return byteArrays;
- }
-
- // currently unused, but working and possibly useful in the future
- public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
- int keyLength) {
- root.getNode(resultHolder, key, keyOffset, keyLength);
- }
-
-
- /********************** write ***************************/
-
- public Tokenizer setNodeFirstInsertionIndexes() {
- root.setInsertionIndexes(0);
- return this;
- }
-
- public Tokenizer appendOutputArrayOffsets(List<Integer> offsets) {
- root.appendOutputArrayOffsets(offsets);
- return this;
- }
-
-
- /********************* print/debug ********************/
-
- protected static final boolean INCLUDE_FULL_TREE_IN_TO_STRING = false;
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(getStructuralString());
- if (INCLUDE_FULL_TREE_IN_TO_STRING) {
- for (byte[] bytes : getArrays()) {
- if (sb.length() > 0) {
- sb.append("\n");
- }
- sb.append(Bytes.toString(bytes));
- }
- }
- return sb.toString();
- }
-
- public String getStructuralString() {
- List<TokenizerNode> nodes = getNodes(true, true);
- StringBuilder sb = new StringBuilder();
- for (TokenizerNode node : nodes) {
- String line = node.getPaddedTokenAndOccurrenceString();
- sb.append(line + "\n");
- }
- return sb.toString();
- }
-
-
- /****************** get/set ************************/
-
- public TokenizerNode getRoot() {
- return root;
- }
-
- public int getMaxElementLength() {
- return maxElementLength;
- }
-
- public int getTreeDepth() {
- return treeDepth;
- }
-
-}