You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/06/16 18:30:00 UTC
[3/4] orc git commit: ORC-194. Split TreeWriters out of WriterImpl.
http://git-wip-us.apache.org/repos/asf/orc/blob/ded204a4/java/core/src/java/org/apache/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index cfdddad..a5d65dd 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -20,10 +20,7 @@ package org.apache.orc.impl;
import java.io.IOException;
import java.nio.ByteBuffer;
-import java.nio.charset.StandardCharsets;
-import java.sql.Timestamp;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
@@ -34,43 +31,25 @@ import io.airlift.compress.lz4.Lz4Compressor;
import io.airlift.compress.lz4.Lz4Decompressor;
import io.airlift.compress.lzo.LzoCompressor;
import io.airlift.compress.lzo.LzoDecompressor;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.orc.BinaryColumnStatistics;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.MemoryManager;
-import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.PhysicalWriter;
-import org.apache.orc.StringColumnStatistics;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
-import org.apache.orc.util.BloomFilter;
-import org.apache.orc.util.BloomFilterIO;
-import org.apache.orc.util.BloomFilterUtf8;
+import org.apache.orc.impl.writer.TreeWriter;
+import org.apache.orc.impl.writer.WriterContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
import com.google.protobuf.ByteString;
@@ -108,7 +87,6 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final PhysicalWriter physicalWriter;
private final OrcFile.WriterVersion writerVersion;
- private int columnCount;
private long rowCount = 0;
private long rowsInStripe = 0;
private long rawDataSize = 0;
@@ -133,7 +111,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
private final boolean[] bloomFilterColumns;
private final double bloomFilterFpp;
private final OrcFile.BloomFilterVersion bloomFilterVersion;
- private boolean writeTimeZone;
+ private final boolean writeTimeZone;
public WriterImpl(FileSystem fs,
Path path,
@@ -155,6 +133,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
} else {
callbackContext = null;
}
+ writeTimeZone = hasTimestamp(schema);
this.adjustedStripeSize = opts.getStripeSize();
this.version = opts.getVersion();
this.encodingStrategy = opts.getEncodingStrategy();
@@ -181,7 +160,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
this.physicalWriter = opts.getPhysicalWriter() == null ?
new PhysicalFsWriter(fs, path, opts) : opts.getPhysicalWriter();
physicalWriter.writeHeader();
- treeWriter = createTreeWriter(schema, new StreamFactory(), false);
+ treeWriter = TreeWriter.Factory.create(schema, new StreamFactory(), false);
if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
throw new IllegalArgumentException("Row stride must be at least " +
MIN_ROW_INDEX_STRIDE);
@@ -278,2524 +257,142 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
return false;
}
- private static class RowIndexPositionRecorder implements PositionRecorder {
- private final OrcProto.RowIndexEntry.Builder builder;
- RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder) {
- this.builder = builder;
- }
-
- @Override
- public void addPosition(long position) {
- builder.addPositions(position);
- }
- }
-
- CompressionCodec getCustomizedCodec(OrcProto.Stream.Kind kind) {
- // TODO: modify may create a new codec here. We want to end() it when the stream is closed,
- // but at this point there's no close() for the stream.
- CompressionCodec result = physicalWriter.getCompressionCodec();
- if (result != null) {
- switch (kind) {
- case BLOOM_FILTER:
- case DATA:
- case DICTIONARY_DATA:
- case BLOOM_FILTER_UTF8:
- if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
- result = result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
- CompressionCodec.Modifier.TEXT));
- } else {
- result = result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
- CompressionCodec.Modifier.TEXT));
- }
- break;
- case LENGTH:
- case DICTIONARY_COUNT:
- case PRESENT:
- case ROW_INDEX:
- case SECONDARY:
- // easily compressed using the fastest modes
- result = result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
- CompressionCodec.Modifier.BINARY));
- break;
- default:
- LOG.info("Missing ORC compression modifiers for " + kind);
- break;
- }
- }
- return result;
- }
-
- /**
- * Interface from the Writer to the TreeWriters. This limits the visibility
- * that the TreeWriters have into the Writer.
- */
- private class StreamFactory {
- /**
- * Create a stream to store part of a column.
- * @param column the column id for the stream
- * @param kind the kind of stream
- * @return The output outStream that the section needs to be written to.
- */
- public OutStream createStream(int column,
- OrcProto.Stream.Kind kind
- ) throws IOException {
- final StreamName name = new StreamName(column, kind);
- CompressionCodec codec = getCustomizedCodec(kind);
-
- return new OutStream(physicalWriter.toString(), bufferSize, codec,
- physicalWriter.createDataStream(name));
- }
-
- /**
- * Get the next column id.
- * @return a number from 0 to the number of columns - 1
- */
- public int getNextColumnId() {
- return columnCount++;
- }
-
- /**
- * Get the stride rate of the row index.
- */
- public int getRowIndexStride() {
- return rowIndexStride;
- }
-
- /**
- * Should be building the row index.
- * @return true if we are building the index
- */
- public boolean buildIndex() {
- return buildIndex;
- }
-
- /**
- * Is the ORC file compressed?
- * @return are the streams compressed
- */
- public boolean isCompressed() {
- return physicalWriter.getCompressionCodec() != null;
- }
-
- /**
- * Get the encoding strategy to use.
- * @return encoding strategy
- */
- public OrcFile.EncodingStrategy getEncodingStrategy() {
- return encodingStrategy;
- }
-
- /**
- * Get the bloom filter columns
- * @return bloom filter columns
- */
- public boolean[] getBloomFilterColumns() {
- return bloomFilterColumns;
- }
-
- /**
- * Get bloom filter false positive percentage.
- * @return fpp
- */
- public double getBloomFilterFPP() {
- return bloomFilterFpp;
- }
-
- /**
- * Get the writer's configuration.
- * @return configuration
- */
- public Configuration getConfiguration() {
- return conf;
- }
-
- /**
- * Get the version of the file to write.
- */
- public OrcFile.Version getVersion() {
- return version;
- }
-
- public void useWriterTimeZone(boolean val) {
- writeTimeZone = val;
- }
-
- public boolean hasWriterTimeZone() {
- return writeTimeZone;
- }
-
- public OrcFile.BloomFilterVersion getBloomFilterVersion() {
- return bloomFilterVersion;
- }
-
- public void writeIndex(StreamName name,
- OrcProto.RowIndex.Builder index) throws IOException {
- physicalWriter.writeIndex(name, index, getCustomizedCodec(name.getKind()));
- }
-
- public void writeBloomFilter(StreamName name,
- OrcProto.BloomFilterIndex.Builder bloom
- ) throws IOException {
- physicalWriter.writeBloomFilter(name, bloom,
- getCustomizedCodec(name.getKind()));
- }
- }
-
- /**
- * The writers for the specific writers of each type. This provides
- * the generic API that they must all implement.
- */
- interface TreeWriter {
-
- /**
- * Estimate the memory currently used to buffer the stripe.
- * @return the number of bytes
- */
- long estimateMemory();
-
- /**
- * Estimate the memory used if the file was read into Hive's Writable
- * types. This is used as an estimate for the query optimizer.
- * @return the number of bytes
- */
- long getRawDataSize();
-
- /**
- * Write a VectorizedRowBath to the file. This is called by the WriterImpl
- * at the top level.
- * @param batch the list of all of the columns
- * @param offset the first row from the batch to write
- * @param length the number of rows to write
- */
- void writeRootBatch(VectorizedRowBatch batch, int offset,
- int length) throws IOException;
-
- /**
- * Write a ColumnVector to the file. This is called recursively by
- * writeRootBatch.
- * @param vector the data to write
- * @param offset the first value offset to write.
- * @param length the number of values to write
- */
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException;
-
- /**
- * Create a row index entry at the current point in the stripe.
- */
- void createRowIndexEntry() throws IOException;
-
- /**
- * Write the stripe out to the file.
- * @param stripeFooter the stripe footer that contains the information about the
- * layout of the stripe. The TreeWriterBase is required to update
- * the footer with its information.
- * @param stats the stripe statistics information
- * @param requiredIndexEntries the number of index entries that are
- * required. this is to check to make sure the
- * row index is well formed.
- */
- void writeStripe(OrcProto.StripeFooter.Builder stripeFooter,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException;
-
- /**
- * During a stripe append, we need to update the file statistics.
- * @param stripeStatistics the statistics for the new stripe
- */
- void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics);
-
- /**
- * Add the file statistics to the file footer.
- * @param footer the file footer builder
- */
- void writeFileStatistics(OrcProto.Footer.Builder footer);
- }
-
- /**
- * The parent class of all of the writers for each column. Each column
- * is written by an instance of this class. The compound types (struct,
- * list, map, and union) have children tree writers that write the children
- * types.
- */
- private abstract static class TreeWriterBase implements TreeWriter {
- protected final int id;
- protected final BitFieldWriter isPresent;
- private final boolean isCompressed;
- protected final ColumnStatisticsImpl indexStatistics;
- protected final ColumnStatisticsImpl stripeColStatistics;
- protected final ColumnStatisticsImpl fileStatistics;
- protected final RowIndexPositionRecorder rowIndexPosition;
- private final OrcProto.RowIndex.Builder rowIndex;
- private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
- protected final BloomFilter bloomFilter;
- protected final BloomFilterUtf8 bloomFilterUtf8;
- protected final boolean createBloomFilter;
- private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
- private final OrcProto.BloomFilterIndex.Builder bloomFilterIndexUtf8;
- protected final OrcProto.BloomFilter.Builder bloomFilterEntry;
- private boolean foundNulls;
- private OutStream isPresentOutStream;
- private final StreamFactory streamFactory;
-
- /**
- * Create a tree writer.
- * @param columnId the column id of the column to write
- * @param schema the row schema
- * @param streamFactory limited access to the Writer's data.
- * @param nullable can the value be null?
- */
- TreeWriterBase(int columnId,
- TypeDescription schema,
- StreamFactory streamFactory,
- boolean nullable) throws IOException {
- this.streamFactory = streamFactory;
- this.isCompressed = streamFactory.isCompressed();
- this.id = columnId;
- if (nullable) {
- isPresentOutStream = streamFactory.createStream(id,
- OrcProto.Stream.Kind.PRESENT);
- isPresent = new BitFieldWriter(isPresentOutStream, 1);
- } else {
- isPresent = null;
- }
- this.foundNulls = false;
- createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
- indexStatistics = ColumnStatisticsImpl.create(schema);
- stripeColStatistics = ColumnStatisticsImpl.create(schema);
- fileStatistics = ColumnStatisticsImpl.create(schema);
- if (streamFactory.buildIndex()) {
- rowIndex = OrcProto.RowIndex.newBuilder();
- rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
- rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
- } else {
- rowIndex = null;
- rowIndexEntry = null;
- rowIndexPosition = null;
- }
- if (createBloomFilter) {
- bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
- if (streamFactory.getBloomFilterVersion() == OrcFile.BloomFilterVersion.ORIGINAL) {
- bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(),
- streamFactory.getBloomFilterFPP());
- bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
- } else {
- bloomFilter = null;
- bloomFilterIndex = null;
- }
- bloomFilterUtf8 = new BloomFilterUtf8(streamFactory.getRowIndexStride(),
- streamFactory.getBloomFilterFPP());
- bloomFilterIndexUtf8 = OrcProto.BloomFilterIndex.newBuilder();
- } else {
- bloomFilterEntry = null;
- bloomFilterIndex = null;
- bloomFilterIndexUtf8 = null;
- bloomFilter = null;
- bloomFilterUtf8 = null;
- }
- }
-
- protected OrcProto.RowIndex.Builder getRowIndex() {
- return rowIndex;
- }
-
- protected ColumnStatisticsImpl getStripeStatistics() {
- return stripeColStatistics;
- }
-
- protected OrcProto.RowIndexEntry.Builder getRowIndexEntry() {
- return rowIndexEntry;
- }
-
- IntegerWriter createIntegerWriter(PositionedOutputStream output,
- boolean signed, boolean isDirectV2,
- StreamFactory writer) {
- if (isDirectV2) {
- boolean alignedBitpacking = false;
- if (writer.getEncodingStrategy().equals(OrcFile.EncodingStrategy.SPEED)) {
- alignedBitpacking = true;
- }
- return new RunLengthIntegerWriterV2(output, signed, alignedBitpacking);
- } else {
- return new RunLengthIntegerWriter(output, signed);
- }
- }
-
- boolean isNewWriteFormat(StreamFactory writer) {
- return writer.getVersion() != OrcFile.Version.V_0_11;
- }
-
- /**
- * Handle the top level object write.
- *
- * This default method is used for all types except structs, which are the
- * typical case. VectorizedRowBatch assumes the top level object is a
- * struct, so we use the first column for all other types.
- * @param batch the batch to write from
- * @param offset the row to start on
- * @param length the number of rows to write
- */
- public void writeRootBatch(VectorizedRowBatch batch, int offset,
- int length) throws IOException {
- writeBatch(batch.cols[0], offset, length);
- }
-
- /**
- * Write the values from the given vector from offset for length elements.
- * @param vector the vector to write from
- * @param offset the first value from the vector to write
- * @param length the number of values from the vector to write
- */
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- if (vector.noNulls) {
- indexStatistics.increment(length);
- if (isPresent != null) {
- for (int i = 0; i < length; ++i) {
- isPresent.write(1);
- }
- }
- } else {
- if (vector.isRepeating) {
- boolean isNull = vector.isNull[0];
- if (isPresent != null) {
- for (int i = 0; i < length; ++i) {
- isPresent.write(isNull ? 0 : 1);
- }
- }
- if (isNull) {
- foundNulls = true;
- indexStatistics.setNull();
- } else {
- indexStatistics.increment(length);
- }
- } else {
- // count the number of non-null values
- int nonNullCount = 0;
- for(int i = 0; i < length; ++i) {
- boolean isNull = vector.isNull[i + offset];
- if (!isNull) {
- nonNullCount += 1;
- }
- if (isPresent != null) {
- isPresent.write(isNull ? 0 : 1);
- }
- }
- indexStatistics.increment(nonNullCount);
- if (nonNullCount != length) {
- foundNulls = true;
- indexStatistics.setNull();
- }
- }
- }
- }
-
- private void removeIsPresentPositions() {
- for(int i=0; i < rowIndex.getEntryCount(); ++i) {
- OrcProto.RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
- List<Long> positions = entry.getPositionsList();
- // bit streams use 3 positions if uncompressed, 4 if compressed
- positions = positions.subList(isCompressed ? 4 : 3, positions.size());
- entry.clearPositions();
- entry.addAllPositions(positions);
- }
- }
-
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- if (isPresent != null) {
- isPresent.flush();
-
- // if no nulls are found in a stream, then suppress the stream
- if(!foundNulls) {
- isPresentOutStream.suppress();
- // since isPresent bitstream is suppressed, update the index to
- // remove the positions of the isPresent stream
- if (rowIndex != null) {
- removeIsPresentPositions();
- }
- }
- }
-
- // merge stripe-level column statistics to file statistics and write it to
- // stripe statistics
- fileStatistics.merge(stripeColStatistics);
- stats.addColStats(stripeColStatistics.serialize());
- stripeColStatistics.reset();
-
- // reset the flag for next stripe
- foundNulls = false;
-
- builder.addColumns(getEncoding());
- if (streamFactory.hasWriterTimeZone()) {
- builder.setWriterTimezone(TimeZone.getDefault().getID());
- }
- if (rowIndex != null) {
- if (rowIndex.getEntryCount() != requiredIndexEntries) {
- throw new IllegalArgumentException("Column has wrong number of " +
- "index entries found: " + rowIndex.getEntryCount() + " expected: " +
- requiredIndexEntries);
- }
- streamFactory.writeIndex(new StreamName(id, OrcProto.Stream.Kind.ROW_INDEX), rowIndex);
- rowIndex.clear();
- rowIndexEntry.clear();
- }
-
- // write the bloom filter to out stream
- if (bloomFilterIndex != null) {
- streamFactory.writeBloomFilter(new StreamName(id,
- OrcProto.Stream.Kind.BLOOM_FILTER), bloomFilterIndex);
- bloomFilterIndex.clear();
- }
- // write the bloom filter to out stream
- if (bloomFilterIndexUtf8 != null) {
- streamFactory.writeBloomFilter(new StreamName(id,
- OrcProto.Stream.Kind.BLOOM_FILTER_UTF8), bloomFilterIndexUtf8);
- bloomFilterIndexUtf8.clear();
- }
- }
-
- /**
- * Get the encoding for this column.
- * @return the information about the encoding of this column
- */
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder builder =
- OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- if (createBloomFilter) {
- builder.setBloomEncoding(BloomFilterIO.Encoding.CURRENT.getId());
- }
- return builder;
- }
-
- /**
- * Create a row index entry with the previous location and the current
- * index statistics. Also merges the index statistics into the file
- * statistics before they are cleared. Finally, it records the start of the
- * next index and ensures all of the children columns also create an entry.
- */
- public void createRowIndexEntry() throws IOException {
- stripeColStatistics.merge(indexStatistics);
- rowIndexEntry.setStatistics(indexStatistics.serialize());
- indexStatistics.reset();
- rowIndex.addEntry(rowIndexEntry);
- rowIndexEntry.clear();
- addBloomFilterEntry();
- recordPosition(rowIndexPosition);
- }
-
- void addBloomFilterEntry() {
- if (createBloomFilter) {
- if (bloomFilter != null) {
- BloomFilterIO.serialize(bloomFilterEntry, bloomFilter);
- bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
- bloomFilter.reset();
- }
- if (bloomFilterUtf8 != null) {
- BloomFilterIO.serialize(bloomFilterEntry, bloomFilterUtf8);
- bloomFilterIndexUtf8.addBloomFilter(bloomFilterEntry.build());
- bloomFilterUtf8.reset();
- }
- }
- }
-
- @Override
- public void updateFileStatistics(OrcProto.StripeStatistics stats) {
- fileStatistics.merge(ColumnStatisticsImpl.deserialize(stats.getColStats(id)));
- }
-
- /**
- * Record the current position in each of this column's streams.
- * @param recorder where should the locations be recorded
- */
- void recordPosition(PositionRecorder recorder) throws IOException {
- if (isPresent != null) {
- isPresent.getPosition(recorder);
- }
- }
-
- /**
- * Estimate how much memory the writer is consuming excluding the streams.
- * @return the number of bytes.
- */
- public long estimateMemory() {
- long result = 0;
- if (isPresent != null) {
- result = isPresentOutStream.getBufferSize();
- }
- return result;
- }
-
- @Override
- public void writeFileStatistics(OrcProto.Footer.Builder footer) {
- footer.addStatistics(fileStatistics.serialize());
- }
- }
-
- private static class BooleanTreeWriter extends TreeWriterBase {
- private final BitFieldWriter writer;
-
- BooleanTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- PositionedOutputStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.writer = new BitFieldWriter(out, 1);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int value = vec.vector[0] == 0 ? 0 : 1;
- indexStatistics.updateBoolean(value != 0, length);
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int value = vec.vector[i + offset] == 0 ? 0 : 1;
- writer.write(value);
- indexStatistics.updateBoolean(value != 0, 1);
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- writer.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + writer.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- long num = fileStatistics.getNumberOfValues();
- return num * JavaDataModel.get().primitive1();
- }
- }
-
- private static class ByteTreeWriter extends TreeWriterBase {
- private final RunLengthByteWriter writer;
-
- ByteTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.writer = new RunLengthByteWriter(writer.createStream(id,
- OrcProto.Stream.Kind.DATA));
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- byte value = (byte) vec.vector[0];
- indexStatistics.updateInteger(value, length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- byte value = (byte) vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateInteger(value, 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- writer.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + writer.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- long num = fileStatistics.getNumberOfValues();
- return num * JavaDataModel.get().primitive1();
- }
- }
-
- private static class IntegerTreeWriter extends TreeWriterBase {
- private final IntegerWriter writer;
- private boolean isDirectV2 = true;
- private final boolean isLong;
-
- IntegerTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.writer = createIntegerWriter(out, true, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- this.isLong = schema.getCategory() == TypeDescription.Category.LONG;
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- long value = vec.vector[0];
- indexStatistics.updateInteger(value, length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- long value = vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateInteger(value, 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- writer.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + writer.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- JavaDataModel jdm = JavaDataModel.get();
- long num = fileStatistics.getNumberOfValues();
- return num * (isLong ? jdm.primitive2() : jdm.primitive1());
- }
- }
-
- private static class FloatTreeWriter extends TreeWriterBase {
- private final PositionedOutputStream stream;
- private final SerializationUtils utils;
-
- FloatTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.utils = new SerializationUtils();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DoubleColumnVector vec = (DoubleColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- float value = (float) vec.vector[0];
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addDouble(value);
- }
- bloomFilterUtf8.addDouble(value);
- }
- for(int i=0; i < length; ++i) {
- utils.writeFloat(stream, value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- float value = (float) vec.vector[i + offset];
- utils.writeFloat(stream, value);
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addDouble(value);
- }
- bloomFilterUtf8.addDouble(value);
- }
- }
- }
- }
- }
-
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- stream.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + stream.getBufferSize();
- }
-
- @Override
- public long getRawDataSize() {
- long num = fileStatistics.getNumberOfValues();
- return num * JavaDataModel.get().primitive1();
- }
- }
-
- private static class DoubleTreeWriter extends TreeWriterBase {
- private final PositionedOutputStream stream;
- private final SerializationUtils utils;
-
- DoubleTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.utils = new SerializationUtils();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DoubleColumnVector vec = (DoubleColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- double value = vec.vector[0];
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addDouble(value);
- }
- bloomFilterUtf8.addDouble(value);
- }
- for(int i=0; i < length; ++i) {
- utils.writeDouble(stream, value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- double value = vec.vector[i + offset];
- utils.writeDouble(stream, value);
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addDouble(value);
- }
- bloomFilterUtf8.addDouble(value);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- stream.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + stream.getBufferSize();
- }
-
- @Override
- public long getRawDataSize() {
- long num = fileStatistics.getNumberOfValues();
- return num * JavaDataModel.get().primitive2();
- }
- }
-
- private static abstract class StringBaseTreeWriter extends TreeWriterBase {
- private static final int INITIAL_DICTIONARY_SIZE = 4096;
- private final OutStream stringOutput;
- protected final IntegerWriter lengthOutput;
- private final IntegerWriter rowOutput;
- protected final StringRedBlackTree dictionary =
- new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
- protected final DynamicIntArray rows = new DynamicIntArray();
- protected final PositionedOutputStream directStreamOutput;
- private final List<OrcProto.RowIndexEntry> savedRowIndex =
- new ArrayList<>();
- private final boolean buildIndex;
- private final List<Long> rowIndexValueCount = new ArrayList<>();
- // If the number of keys in a dictionary is greater than this fraction of
- //the total number of non-null rows, turn off dictionary encoding
- private final double dictionaryKeySizeThreshold;
- protected boolean useDictionaryEncoding = true;
- private boolean isDirectV2 = true;
- private boolean doneDictionaryCheck;
- private final boolean strideDictionaryCheck;
-
- StringBaseTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
- stringOutput = writer.createStream(id,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- lengthOutput = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- rowOutput = createIntegerWriter(directStreamOutput, false, isDirectV2,
- writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- rowIndexValueCount.add(0L);
- buildIndex = writer.buildIndex();
- Configuration conf = writer.getConfiguration();
- dictionaryKeySizeThreshold =
- OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
- strideDictionaryCheck =
- OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
- doneDictionaryCheck = false;
- }
-
- private void checkDictionaryEncoding() {
- if (!doneDictionaryCheck) {
- // Set the flag indicating whether or not to use dictionary encoding
- // based on whether or not the fraction of distinct keys over number of
- // non-null rows is less than the configured threshold
- float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f;
- useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold;
- doneDictionaryCheck = true;
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- // if rows in stripe is less than dictionaryCheckAfterRows, dictionary
- // checking would not have happened. So do it again here.
- checkDictionaryEncoding();
-
- if (useDictionaryEncoding) {
- flushDictionary();
- } else {
- // flushout any left over entries from dictionary
- if (rows.size() > 0) {
- flushDictionary();
- }
-
- // suppress the stream for every stripe if dictionary is disabled
- stringOutput.suppress();
- }
-
- // we need to build the rowindex before calling super, since it
- // writes it out.
- super.writeStripe(builder, stats, requiredIndexEntries);
- if (useDictionaryEncoding) {
- stringOutput.flush();
- lengthOutput.flush();
- rowOutput.flush();
- } else {
- directStreamOutput.flush();
- lengthOutput.flush();
- }
- // reset all of the fields to be ready for the next stripe.
- dictionary.clear();
- savedRowIndex.clear();
- rowIndexValueCount.clear();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- rowIndexValueCount.add(0L);
-
- if (!useDictionaryEncoding) {
- // record the start positions of first index stride of next stripe i.e
- // beginning of the direct streams when dictionary is disabled
- recordDirectStreamPosition();
- }
- }
-
- private void flushDictionary() throws IOException {
- final int[] dumpOrder = new int[dictionary.size()];
-
- if (useDictionaryEncoding) {
- // Write the dictionary by traversing the red-black tree writing out
- // the bytes and lengths; and creating the map from the original order
- // to the final sorted order.
-
- dictionary.visit(new StringRedBlackTree.Visitor() {
- private int currentId = 0;
- @Override
- public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
- context.writeBytes(stringOutput);
- lengthOutput.write(context.getLength());
- dumpOrder[context.getOriginalPosition()] = currentId++;
- }
- });
- } else {
- // for direct encoding, we don't want the dictionary data stream
- stringOutput.suppress();
- }
- int length = rows.size();
- int rowIndexEntry = 0;
- OrcProto.RowIndex.Builder rowIndex = getRowIndex();
- Text text = new Text();
- // write the values translated into the dump order.
- for(int i = 0; i <= length; ++i) {
- // now that we are writing out the row values, we can finalize the
- // row index
- if (buildIndex) {
- while (i == rowIndexValueCount.get(rowIndexEntry) &&
- rowIndexEntry < savedRowIndex.size()) {
- OrcProto.RowIndexEntry.Builder base =
- savedRowIndex.get(rowIndexEntry++).toBuilder();
- if (useDictionaryEncoding) {
- rowOutput.getPosition(new RowIndexPositionRecorder(base));
- } else {
- PositionRecorder posn = new RowIndexPositionRecorder(base);
- directStreamOutput.getPosition(posn);
- lengthOutput.getPosition(posn);
- }
- rowIndex.addEntry(base.build());
- }
- }
- if (i != length) {
- if (useDictionaryEncoding) {
- rowOutput.write(dumpOrder[rows.get(i)]);
- } else {
- dictionary.getText(text, rows.get(i));
- directStreamOutput.write(text.getBytes(), 0, text.getLength());
- lengthOutput.write(text.getLength());
- }
- }
- }
- rows.clear();
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (useDictionaryEncoding) {
- result.setDictionarySize(dictionary.size());
- if(isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY);
- }
- } else {
- if(isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- }
- return result;
- }
-
- /**
- * This method doesn't call the super method, because unlike most of the
- * other TreeWriters, this one can't record the position in the streams
- * until the stripe is being flushed. Therefore it saves all of the entries
- * and augments them with the final information as the stripe is written.
- */
- @Override
- public void createRowIndexEntry() throws IOException {
- getStripeStatistics().merge(indexStatistics);
- OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
- rowIndexEntry.setStatistics(indexStatistics.serialize());
- indexStatistics.reset();
- OrcProto.RowIndexEntry base = rowIndexEntry.build();
- savedRowIndex.add(base);
- rowIndexEntry.clear();
- addBloomFilterEntry();
- recordPosition(rowIndexPosition);
- rowIndexValueCount.add((long) rows.size());
- if (strideDictionaryCheck) {
- checkDictionaryEncoding();
- }
- if (!useDictionaryEncoding) {
- if (rows.size() > 0) {
- flushDictionary();
- // just record the start positions of next index stride
- recordDirectStreamPosition();
- } else {
- // record the start positions of next index stride
- recordDirectStreamPosition();
- getRowIndex().addEntry(base);
- }
- }
- }
-
- private void recordDirectStreamPosition() throws IOException {
- if (rowIndexPosition != null) {
- directStreamOutput.getPosition(rowIndexPosition);
- lengthOutput.getPosition(rowIndexPosition);
- }
- }
-
- @Override
- public long estimateMemory() {
- long parent = super.estimateMemory();
- if (useDictionaryEncoding) {
- return parent + dictionary.getSizeInBytes() + rows.getSizeInBytes();
- } else {
- return parent + lengthOutput.estimateMemory() +
- directStreamOutput.getBufferSize();
- }
- }
-
- @Override
- public long getRawDataSize() {
- // ORC strings are converted to java Strings. so use JavaDataModel to
- // compute the overall size of strings
- StringColumnStatistics scs = (StringColumnStatistics) fileStatistics;
- long numVals = fileStatistics.getNumberOfValues();
- if (numVals == 0) {
- return 0;
- } else {
- int avgSize = (int) (scs.getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgSize);
- }
- }
- }
-
- private static class StringTreeWriter extends StringBaseTreeWriter {
- StringTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- if (useDictionaryEncoding) {
- int id = dictionary.add(vec.vector[0], vec.start[0], vec.length[0]);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(vec.vector[0], vec.start[0],
- vec.length[0]);
- lengthOutput.write(vec.length[0]);
- }
- }
- indexStatistics.updateString(vec.vector[0], vec.start[0],
- vec.length[0], length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[0], vec.start[0],
- vec.length[0], StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]));
- } else {
- directStreamOutput.write(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- lengthOutput.write(vec.length[offset + i]);
- }
- indexStatistics.updateString(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i], 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i],
- StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- }
- }
- }
- }
- }
-
- /**
- * Under the covers, char is written to ORC the same way as string.
- */
- private static class CharTreeWriter extends StringBaseTreeWriter {
- private final int itemLength;
- private final byte[] padding;
-
- CharTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- itemLength = schema.getMaxLength();
- padding = new byte[itemLength];
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- byte[] ptr;
- int ptrOffset;
- if (vec.length[0] >= itemLength) {
- ptr = vec.vector[0];
- ptrOffset = vec.start[0];
- } else {
- ptr = padding;
- ptrOffset = 0;
- System.arraycopy(vec.vector[0], vec.start[0], ptr, 0,
- vec.length[0]);
- Arrays.fill(ptr, vec.length[0], itemLength, (byte) ' ');
- }
- if (useDictionaryEncoding) {
- int id = dictionary.add(ptr, ptrOffset, itemLength);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(ptr, ptrOffset, itemLength);
- lengthOutput.write(itemLength);
- }
- }
- indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[0], vec.start[0],
- vec.length[0], StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- byte[] ptr;
- int ptrOffset;
- if (vec.length[offset + i] >= itemLength) {
- ptr = vec.vector[offset + i];
- ptrOffset = vec.start[offset + i];
- } else {
- // it is the wrong length, so copy it
- ptr = padding;
- ptrOffset = 0;
- System.arraycopy(vec.vector[offset + i], vec.start[offset + i],
- ptr, 0, vec.length[offset + i]);
- Arrays.fill(ptr, vec.length[offset + i], itemLength, (byte) ' ');
- }
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(ptr, ptrOffset, itemLength));
- } else {
- directStreamOutput.write(ptr, ptrOffset, itemLength);
- lengthOutput.write(itemLength);
- }
- indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i],
- StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- }
- }
- }
- }
- }
-
- /**
- * Under the covers, varchar is written to ORC the same way as string.
- */
- private static class VarcharTreeWriter extends StringBaseTreeWriter {
- private final int maxLength;
-
- VarcharTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- maxLength = schema.getMaxLength();
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int itemLength = Math.min(vec.length[0], maxLength);
- if (useDictionaryEncoding) {
- int id = dictionary.add(vec.vector[0], vec.start[0], itemLength);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(vec.vector[0], vec.start[0],
- itemLength);
- lengthOutput.write(itemLength);
- }
- }
- indexStatistics.updateString(vec.vector[0], vec.start[0],
- itemLength, length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[0],
- vec.start[0], itemLength,
- StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[0],
- vec.start[0], itemLength);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int itemLength = Math.min(vec.length[offset + i], maxLength);
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(vec.vector[offset + i],
- vec.start[offset + i], itemLength));
- } else {
- directStreamOutput.write(vec.vector[offset + i],
- vec.start[offset + i], itemLength);
- lengthOutput.write(itemLength);
- }
- indexStatistics.updateString(vec.vector[offset + i],
- vec.start[offset + i], itemLength, 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- // translate from UTF-8 to the default charset
- bloomFilter.addString(new String(vec.vector[offset + i],
- vec.start[offset + i], itemLength,
- StandardCharsets.UTF_8));
- }
- bloomFilterUtf8.addBytes(vec.vector[offset + i],
- vec.start[offset + i], itemLength);
- }
- }
- }
- }
- }
- }
-
- private static class BinaryTreeWriter extends TreeWriterBase {
- private final PositionedOutputStream stream;
- private final IntegerWriter length;
- private boolean isDirectV2 = true;
-
- BinaryTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.length = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- for(int i=0; i < length; ++i) {
- stream.write(vec.vector[0], vec.start[0],
- vec.length[0]);
- this.length.write(vec.length[0]);
- }
- indexStatistics.updateBinary(vec.vector[0], vec.start[0],
- vec.length[0], length);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- stream.write(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- this.length.write(vec.length[offset + i]);
- indexStatistics.updateBinary(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i], 1);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- bloomFilterUtf8.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- }
- }
- }
- }
-
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- stream.flush();
- length.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- length.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + stream.getBufferSize() +
- length.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- // get total length of binary blob
- BinaryColumnStatistics bcs = (BinaryColumnStatistics) fileStatistics;
- return bcs.getSum();
- }
- }
-
- public static final int MILLIS_PER_SECOND = 1000;
- public static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";
-
- private static class TimestampTreeWriter extends TreeWriterBase {
- private final IntegerWriter seconds;
- private final IntegerWriter nanos;
- private final boolean isDirectV2;
- private final TimeZone localTimezone;
- private final long baseEpochSecsLocalTz;
-
- TimestampTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.seconds = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
- this.nanos = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- this.localTimezone = TimeZone.getDefault();
- // for unit tests to set different time zones
- this.baseEpochSecsLocalTz = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
- writer.useWriterTimeZone(true);
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- TimestampColumnVector vec = (TimestampColumnVector) vector;
- Timestamp val;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- val = vec.asScratchTimestamp(0);
- long millis = val.getTime();
- long utc = SerializationUtils.convertToUtc(localTimezone, millis);
- indexStatistics.updateTimestamp(utc);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(millis);
- }
- bloomFilterUtf8.addLong(utc);
- }
- final long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz;
- final long nano = formatNanos(val.getNanos());
- for(int i=0; i < length; ++i) {
- seconds.write(secs);
- nanos.write(nano);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- val = vec.asScratchTimestamp(i + offset);
- long millis = val.getTime();
- long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz;
- long utc = SerializationUtils.convertToUtc(localTimezone, millis);
- seconds.write(secs);
- nanos.write(formatNanos(val.getNanos()));
- indexStatistics.updateTimestamp(utc);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(millis);
- }
- bloomFilterUtf8.addLong(utc);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- seconds.flush();
- nanos.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- private static long formatNanos(int nanos) {
- if (nanos == 0) {
- return 0;
- } else if (nanos % 100 != 0) {
- return ((long) nanos) << 3;
- } else {
- nanos /= 100;
- int trailingZeros = 1;
- while (nanos % 10 == 0 && trailingZeros < 7) {
- nanos /= 10;
- trailingZeros += 1;
- }
- return ((long) nanos) << 3 | trailingZeros;
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- seconds.getPosition(recorder);
- nanos.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + seconds.estimateMemory() +
- nanos.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- return fileStatistics.getNumberOfValues() *
- JavaDataModel.get().lengthOfTimestamp();
- }
- }
-
- private static class DateTreeWriter extends TreeWriterBase {
- private final IntegerWriter writer;
- private final boolean isDirectV2;
-
- DateTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.writer = createIntegerWriter(out, true, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int value = (int) vec.vector[0];
- indexStatistics.updateDate(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int value = (int) vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateDate(value);
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(value);
- }
- bloomFilterUtf8.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- writer.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + writer.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- return fileStatistics.getNumberOfValues() *
- JavaDataModel.get().lengthOfDate();
- }
- }
-
- private static class DecimalTreeWriter extends TreeWriterBase {
- private final PositionedOutputStream valueStream;
-
- // These scratch buffers allow us to serialize decimals much faster.
- private final long[] scratchLongs;
- private final byte[] scratchBuffer;
-
- private final IntegerWriter scaleStream;
- private final boolean isDirectV2;
-
- DecimalTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA);
- scratchLongs = new long[HiveDecimal.SCRATCH_LONGS_LEN];
- scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
- this.scaleStream = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DecimalColumnVector vec = (DecimalColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- HiveDecimalWritable value = vec.vector[0];
- indexStatistics.updateDecimal(value);
- if (createBloomFilter) {
- String str = value.toString(scratchBuffer);
- if (bloomFilter != null) {
- bloomFilter.addString(str);
- }
- bloomFilterUtf8.addString(str);
- }
- for(int i=0; i < length; ++i) {
- value.serializationUtilsWrite(valueStream,
- scratchLongs);
- scaleStream.write(value.scale());
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- HiveDecimalWritable value = vec.vector[i + offset];
- value.serializationUtilsWrite(valueStream, scratchLongs);
- scaleStream.write(value.scale());
- indexStatistics.updateDecimal(value);
- if (createBloomFilter) {
- String str = value.toString(scratchBuffer);
- if (bloomFilter != null) {
- bloomFilter.addString(str);
- }
- bloomFilterUtf8.addString(str);
- }
- }
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- valueStream.flush();
- scaleStream.flush();
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- valueStream.getPosition(recorder);
- scaleStream.getPosition(recorder);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + valueStream.getBufferSize() +
- scaleStream.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- return fileStatistics.getNumberOfValues() *
- JavaDataModel.get().lengthOfDecimal();
- }
- }
-
- private static class StructTreeWriter extends TreeWriterBase {
- final TreeWriter[] childrenWriters;
-
- StructTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- List<TypeDescription> children = schema.getChildren();
- childrenWriters = new TreeWriterBase[children.size()];
- for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i] = createTreeWriter(
- children.get(i), writer,
- true);
- }
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void writeRootBatch(VectorizedRowBatch batch, int offset,
- int length) throws IOException {
- // update the statistics for the root column
- indexStatistics.increment(length);
- // I'm assuming that the root column isn't nullable so that I don't need
- // to update isPresent.
- for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i].writeBatch(batch.cols[i], offset, length);
- }
- }
-
- private static void writeFields(StructColumnVector vector,
- TreeWriter[] childrenWriters,
- int offset, int length) throws IOException {
- for(int field=0; field < childrenWriters.length; ++field) {
- childrenWriters[field].writeBatch(vector.fields[field], offset, length);
- }
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- StructColumnVector vec = (StructColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- writeFields(vec, childrenWriters, offset, length);
- }
- } else if (vector.noNulls) {
- writeFields(vec, childrenWriters, offset, length);
- } else {
- // write the records in runs
- int currentRun = 0;
- boolean started = false;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- if (!started) {
- started = true;
- currentRun = i;
- }
- } else if (started) {
- started = false;
- writeFields(vec, childrenWriters, offset + currentRun,
- i - currentRun);
- }
- }
- if (started) {
- writeFields(vec, childrenWriters, offset + currentRun,
- length - currentRun);
- }
- }
- }
-
- @Override
- public void createRowIndexEntry() throws IOException {
- super.createRowIndexEntry();
- for(TreeWriter child: childrenWriters) {
- child.createRowIndexEntry();
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- for(TreeWriter child: childrenWriters) {
- child.writeStripe(builder, stats, requiredIndexEntries);
- }
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- public void updateFileStatistics(OrcProto.StripeStatistics stats) {
- super.updateFileStatistics(stats);
- for(TreeWriter child: childrenWriters) {
- child.updateFileStatistics(stats);
- }
- }
-
- @Override
- public long estimateMemory() {
- long result = 0;
- for(TreeWriter writer: childrenWriters) {
- result += writer.estimateMemory();
- }
- return super.estimateMemory() + result;
- }
-
- @Override
- public long getRawDataSize() {
- long result = 0;
- for(TreeWriter writer: childrenWriters) {
- result += writer.getRawDataSize();
- }
- return result;
- }
-
- @Override
- public void writeFileStatistics(OrcProto.Footer.Builder footer) {
- super.writeFileStatistics(footer);
- for(TreeWriter child: childrenWriters) {
- child.writeFileStatistics(footer);
- }
- }
- }
-
- private static class ListTreeWriter extends TreeWriterBase {
- private final IntegerWriter lengths;
- private final boolean isDirectV2;
- private final TreeWriter childWriter;
-
- ListTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- childWriter =
- createTreeWriter(schema.getChildren().get(0), writer, true);
- lengths = createIntegerWriter(writer.createStream(columnId,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void createRowIndexEntry() throws IOException {
- super.createRowIndexEntry();
- childWriter.createRowIndexEntry();
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- ListColumnVector vec = (ListColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int childOffset = (int) vec.offsets[0];
- int childLength = (int) vec.lengths[0];
- for(int i=0; i < length; ++i) {
- lengths.write(childLength);
- childWriter.writeBatch(vec.child, childOffset, childLength);
- }
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(childLength);
- }
- bloomFilterUtf8.addLong(childLength);
- }
- }
- } else {
- // write the elements in runs
- int currentOffset = 0;
- int currentLength = 0;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- int nextLength = (int) vec.lengths[offset + i];
- int nextOffset = (int) vec.offsets[offset + i];
- lengths.write(nextLength);
- if (currentLength == 0) {
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else if (currentOffset + currentLength != nextOffset) {
- childWriter.writeBatch(vec.child, currentOffset,
- currentLength);
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else {
- currentLength += nextLength;
- }
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(nextLength);
- }
- bloomFilterUtf8.addLong(nextLength);
- }
+ CompressionCodec getCustomizedCodec(OrcProto.Stream.Kind kind) {
+ // TODO: modify may create a new codec here. We want to end() it when the stream is closed,
+ // but at this point there's no close() for the stream.
+ CompressionCodec result = physicalWriter.getCompressionCodec();
+ if (result != null) {
+ switch (kind) {
+ case BLOOM_FILTER:
+ case DATA:
+ case DICTIONARY_DATA:
+ case BLOOM_FILTER_UTF8:
+ if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
+ result = result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
+ CompressionCodec.Modifier.TEXT));
+ } else {
+ result = result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
+ CompressionCodec.Modifier.TEXT));
}
- }
- if (currentLength != 0) {
- childWriter.writeBatch(vec.child, currentOffset,
- currentLength);
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- lengths.flush();
- childWriter.writeStripe(builder, stats, requiredIndexEntries);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
+ break;
+ case LENGTH:
+ case DICTIONARY_COUNT:
+ case PRESENT:
+ case ROW_INDEX:
+ case SECONDARY:
+ // easily compressed using the fastest modes
+ result = result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
+ CompressionCodec.Modifier.BINARY));
+ break;
+ default:
+ LOG.info("Missing ORC compression modifiers for " + kind);
+ break;
}
}
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- lengths.getPosition(recorder);
- }
-
- @Override
- public void updateFileStatistics(OrcProto.StripeStatistics stats) {
- super.updateFileStatistics(stats);
- childWriter.updateFileStatistics(stats);
- }
-
- @Override
- public long estimateMemory() {
- return super.estimateMemory() + lengths.estimateMemory() +
- childWriter.estimateMemory();
- }
-
- @Override
- public long getRawDataSize() {
- return childWriter.getRawDataSize();
- }
-
- @Override
- public void writeFileStatistics(OrcProto.Footer.Builder footer) {
- super.writeFileStatistics(footer);
- childWriter.writeFileStatistics(footer);
- }
+ return result;
}
- private static class MapTreeWriter extends TreeWriterBase {
- private final IntegerWriter lengths;
- private final boolean isDirectV2;
- private final TreeWriter keyWriter;
- private final TreeWriter valueWriter;
-
- MapTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- List<TypeDescription> children = schema.getChildren();
- keyWriter = createTreeWriter(children.get(0), writer, true);
- valueWriter = createTreeWriter(children.get(1), writer, true);
- lengths = createIntegerWriter(writer.createStream(columnId,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- OrcProto.ColumnEncoding.Builder getEncoding() {
- OrcProto.ColumnEncoding.Builder result = super.getEncoding();
- if (isDirectV2) {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
- } else {
- result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
- }
- return result;
- }
-
- @Override
- public void createRowIndexEntry() throws IOException {
- super.createRowIndexEntry();
- keyWriter.createRowIndexEntry();
- valueWriter.createRowIndexEntry();
- }
-
- @Override
- public void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- MapColumnVector vec = (MapColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int childOffset = (int) vec.offsets[0];
- int childLength = (int) vec.lengths[0];
- for(int i=0; i < length; ++i) {
- lengths.write(childLength);
- keyWriter.writeBatch(vec.keys, childOffset, childLength);
- valueWriter.writeBatch(vec.values, childOffset, childLength);
- }
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(childLength);
- }
- bloomFilterUtf8.addLong(childLength);
- }
- }
- } else {
- // write the elements in runs
- int currentOffset = 0;
- int currentLength = 0;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- int nextLength = (int) vec.lengths[offset + i];
- int nextOffset = (int) vec.offsets[offset + i];
- lengths.write(nextLength);
- if (currentLength == 0) {
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else if (currentOffset + currentLength != nextOffset) {
- keyWriter.writeBatch(vec.keys, currentOffset,
- currentLength);
- valueWriter.writeBatch(vec.values, currentOffset,
- currentLength);
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else {
- currentLength += nextLength;
- }
- if (createBloomFilter) {
- if (bloomFilter != null) {
- bloomFilter.addLong(nextLength);
- }
- bloomFilterUtf8.addLong(nextLength);
- }
- }
- }
- if (currentLength != 0) {
- keyWriter.writeBatch(vec.keys, currentOffset,
- currentLength);
- valueWriter.writeBatch(vec.values, currentOffset,
- currentLength);
- }
- }
- }
-
- @Override
- public void writeStripe(OrcProto.StripeFooter.Builder builder,
- OrcProto.StripeStatistics.Builder stats,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, stats, requiredIndexEntries);
- lengths.flush();
- keyWriter.writeStripe(builder, stats, requiredIndexEntries);
- valueWriter.writeStripe(builder, stats, requiredIndexEntries);
- if (rowIndexPosition != null) {
- recordPosition(rowIndexPosition);
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- lengths.getPosition(recorder);
- }
-
- @Override
- public void updateFileStatistics(OrcProto.StripeStatistics stats) {
- super.updateFileStatistics(stats);
- keyWriter.updateFileStatistics(stats);
- valueWriter.updateFileStatistics(stats);
- }
+ /**
+ * Interface from the Writer to the TreeWriters. This limits the visibility
+ * that the TreeWriters have into the Writer.
+ */
+ private class StreamFactory implements WriterContext {
+ /**
+ * Create a stream to store part of a column.
+ * @param column the column id for the stream
+ * @param kind the kind of stream
+ * @return The output outStream that the section needs to be written to.
+ */
+ public OutStream createStream(int column,
+ OrcProto.Stream.Kind kind
+ ) throws IOException {
+ final StreamName name = new StreamName(column, kind);
+ CompressionCodec codec = getCustomizedCodec(kind);
- @Ov
<TRUNCATED>