Posted to commits@hive.apache.org by om...@apache.org on 2015/12/12 00:28:10 UTC
[13/16] hive git commit: HIVE-11890. Create ORC submodule. (omalley
reviewed by prasanthj)
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
new file mode 100644
index 0000000..fc945e4
--- /dev/null
+++ b/orc/src/java/org/apache/orc/TypeDescription.java
@@ -0,0 +1,540 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * This is the description of the types in an ORC file.
+ */
+public class TypeDescription {
+ private static final int MAX_PRECISION = 38;
+ private static final int MAX_SCALE = 38;
+ private static final int DEFAULT_PRECISION = 38;
+ private static final int DEFAULT_SCALE = 10;
+ private static final int DEFAULT_LENGTH = 256;
+ public enum Category {
+ BOOLEAN("boolean", true),
+ BYTE("tinyint", true),
+ SHORT("smallint", true),
+ INT("int", true),
+ LONG("bigint", true),
+ FLOAT("float", true),
+ DOUBLE("double", true),
+ STRING("string", true),
+ DATE("date", true),
+ TIMESTAMP("timestamp", true),
+ BINARY("binary", true),
+ DECIMAL("decimal", true),
+ VARCHAR("varchar", true),
+ CHAR("char", true),
+ LIST("array", false),
+ MAP("map", false),
+ STRUCT("struct", false),
+ UNION("union", false);
+
+ Category(String name, boolean isPrimitive) {
+ this.name = name;
+ this.isPrimitive = isPrimitive;
+ }
+
+ final boolean isPrimitive;
+ final String name;
+
+ public boolean isPrimitive() {
+ return isPrimitive;
+ }
+
+ public String getName() {
+ return name;
+ }
+ }
+
+ public static TypeDescription createBoolean() {
+ return new TypeDescription(Category.BOOLEAN);
+ }
+
+ public static TypeDescription createByte() {
+ return new TypeDescription(Category.BYTE);
+ }
+
+ public static TypeDescription createShort() {
+ return new TypeDescription(Category.SHORT);
+ }
+
+ public static TypeDescription createInt() {
+ return new TypeDescription(Category.INT);
+ }
+
+ public static TypeDescription createLong() {
+ return new TypeDescription(Category.LONG);
+ }
+
+ public static TypeDescription createFloat() {
+ return new TypeDescription(Category.FLOAT);
+ }
+
+ public static TypeDescription createDouble() {
+ return new TypeDescription(Category.DOUBLE);
+ }
+
+ public static TypeDescription createString() {
+ return new TypeDescription(Category.STRING);
+ }
+
+ public static TypeDescription createDate() {
+ return new TypeDescription(Category.DATE);
+ }
+
+ public static TypeDescription createTimestamp() {
+ return new TypeDescription(Category.TIMESTAMP);
+ }
+
+ public static TypeDescription createBinary() {
+ return new TypeDescription(Category.BINARY);
+ }
+
+ public static TypeDescription createDecimal() {
+ return new TypeDescription(Category.DECIMAL);
+ }
+
+ /**
+ * For decimal types, set the precision.
+ * @param precision the new precision
+ * @return this
+ */
+ public TypeDescription withPrecision(int precision) {
+ if (category != Category.DECIMAL) {
+ throw new IllegalArgumentException("precision is only allowed on decimal"+
+ " and not " + category.name);
+ } else if (precision < 1 || precision > MAX_PRECISION || scale > precision) {
+ throw new IllegalArgumentException("precision " + precision +
+ " is out of range 1 .. " + MAX_PRECISION +
+ " or is smaller than scale " + scale);
+ }
+ this.precision = precision;
+ return this;
+ }
+
+ /**
+ * For decimal types, set the scale.
+ * @param scale the new scale
+ * @return this
+ */
+ public TypeDescription withScale(int scale) {
+ if (category != Category.DECIMAL) {
+ throw new IllegalArgumentException("scale is only allowed on decimal"+
+ " and not " + category.name);
+ } else if (scale < 0 || scale > MAX_SCALE || scale > precision) {
+ throw new IllegalArgumentException("scale " + scale +
+ " is out of range 0 .. " + MAX_SCALE +
+ " or is greater than precision " + precision);
+ }
+ this.scale = scale;
+ return this;
+ }
+
+ public static TypeDescription createVarchar() {
+ return new TypeDescription(Category.VARCHAR);
+ }
+
+ public static TypeDescription createChar() {
+ return new TypeDescription(Category.CHAR);
+ }
+
+ /**
+ * Set the maximum length for char and varchar types.
+ * @param maxLength the maximum value
+ * @return this
+ */
+ public TypeDescription withMaxLength(int maxLength) {
+ if (category != Category.VARCHAR && category != Category.CHAR) {
+ throw new IllegalArgumentException("maxLength is only allowed on char" +
+ " and varchar and not " + category.name);
+ }
+ this.maxLength = maxLength;
+ return this;
+ }
+
+ public static TypeDescription createList(TypeDescription childType) {
+ TypeDescription result = new TypeDescription(Category.LIST);
+ result.children.add(childType);
+ childType.parent = result;
+ return result;
+ }
+
+ public static TypeDescription createMap(TypeDescription keyType,
+ TypeDescription valueType) {
+ TypeDescription result = new TypeDescription(Category.MAP);
+ result.children.add(keyType);
+ result.children.add(valueType);
+ keyType.parent = result;
+ valueType.parent = result;
+ return result;
+ }
+
+ public static TypeDescription createUnion() {
+ return new TypeDescription(Category.UNION);
+ }
+
+ public static TypeDescription createStruct() {
+ return new TypeDescription(Category.STRUCT);
+ }
+
+ /**
+ * Add a child to a union type.
+ * @param child a new child type to add
+ * @return the union type.
+ */
+ public TypeDescription addUnionChild(TypeDescription child) {
+ if (category != Category.UNION) {
+ throw new IllegalArgumentException("Can only add types to union type" +
+ " and not " + category);
+ }
+ children.add(child);
+ child.parent = this;
+ return this;
+ }
+
+ /**
+ * Add a field to a struct type as it is built.
+ * @param field the field name
+ * @param fieldType the type of the field
+ * @return the struct type
+ */
+ public TypeDescription addField(String field, TypeDescription fieldType) {
+ if (category != Category.STRUCT) {
+ throw new IllegalArgumentException("Can only add fields to struct type" +
+ " and not " + category);
+ }
+ fieldNames.add(field);
+ children.add(fieldType);
+ fieldType.parent = this;
+ return this;
+ }
+
+ /**
+ * Get the id for this type.
+ * The first call will cause all of the ids in the tree to be assigned, so
+ * it should not be called before the type is completely built.
+ * @return the sequential id
+ */
+ public int getId() {
+ // if the id hasn't been assigned, assign all of the ids from the root
+ if (id == -1) {
+ TypeDescription root = this;
+ while (root.parent != null) {
+ root = root.parent;
+ }
+ root.assignIds(0);
+ }
+ return id;
+ }
+
+ /**
+ * Get the maximum id assigned to this type or its children.
+ * The first call will cause all of the ids in the tree to be assigned, so
+ * it should not be called before the type is completely built.
+ * @return the maximum id assigned under this type
+ */
+ public int getMaximumId() {
+ // if the id hasn't been assigned, assign all of the ids from the root
+ if (maxId == -1) {
+ TypeDescription root = this;
+ while (root.parent != null) {
+ root = root.parent;
+ }
+ root.assignIds(0);
+ }
+ return maxId;
+ }
+
+ private ColumnVector createColumn() {
+ switch (category) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case TIMESTAMP:
+ case DATE:
+ return new LongColumnVector();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleColumnVector();
+ case DECIMAL:
+ return new DecimalColumnVector(precision, scale);
+ case STRING:
+ case BINARY:
+ case CHAR:
+ case VARCHAR:
+ return new BytesColumnVector();
+ case STRUCT: {
+ ColumnVector[] fieldVector = new ColumnVector[children.size()];
+ for(int i=0; i < fieldVector.length; ++i) {
+ fieldVector[i] = children.get(i).createColumn();
+ }
+ return new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ fieldVector);
+ }
+ case UNION: {
+ ColumnVector[] fieldVector = new ColumnVector[children.size()];
+ for(int i=0; i < fieldVector.length; ++i) {
+ fieldVector[i] = children.get(i).createColumn();
+ }
+ return new UnionColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ fieldVector);
+ }
+ case LIST:
+ return new ListColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ children.get(0).createColumn());
+ case MAP:
+ return new MapColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+ children.get(0).createColumn(), children.get(1).createColumn());
+ default:
+ throw new IllegalArgumentException("Unknown type " + category);
+ }
+ }
+
+ public VectorizedRowBatch createRowBatch() {
+ VectorizedRowBatch result;
+ if (category == Category.STRUCT) {
+ result = new VectorizedRowBatch(children.size(),
+ VectorizedRowBatch.DEFAULT_SIZE);
+ for(int i=0; i < result.cols.length; ++i) {
+ result.cols[i] = children.get(i).createColumn();
+ }
+ } else {
+ result = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE);
+ result.cols[0] = createColumn();
+ }
+ result.reset();
+ return result;
+ }
+
+ /**
+ * Get the kind of this type.
+ * @return the category for this type.
+ */
+ public Category getCategory() {
+ return category;
+ }
+
+ /**
+ * Get the maximum length of the type. Only used for char and varchar types.
+ * @return the maximum length of the string type
+ */
+ public int getMaxLength() {
+ return maxLength;
+ }
+
+ /**
+ * Get the precision of the decimal type.
+ * @return the number of digits for the precision.
+ */
+ public int getPrecision() {
+ return precision;
+ }
+
+ /**
+ * Get the scale of the decimal type.
+ * @return the number of digits for the scale.
+ */
+ public int getScale() {
+ return scale;
+ }
+
+ /**
+ * For struct types, get the list of field names.
+ * @return the list of field names.
+ */
+ public List<String> getFieldNames() {
+ return Collections.unmodifiableList(fieldNames);
+ }
+
+ /**
+ * Get the subtypes of this type.
+ * @return the list of children types
+ */
+ public List<TypeDescription> getChildren() {
+ return children == null ? null : Collections.unmodifiableList(children);
+ }
+
+ /**
+ * Assign ids to all of the nodes under this one.
+ * @param startId the lowest id to assign
+ * @return the next available id
+ */
+ private int assignIds(int startId) {
+ id = startId++;
+ if (children != null) {
+ for (TypeDescription child : children) {
+ startId = child.assignIds(startId);
+ }
+ }
+ maxId = startId - 1;
+ return startId;
+ }
+
+ private TypeDescription(Category category) {
+ this.category = category;
+ if (category.isPrimitive) {
+ children = null;
+ } else {
+ children = new ArrayList<>();
+ }
+ if (category == Category.STRUCT) {
+ fieldNames = new ArrayList<>();
+ } else {
+ fieldNames = null;
+ }
+ }
+
+ private int id = -1;
+ private int maxId = -1;
+ private TypeDescription parent;
+ private final Category category;
+ private final List<TypeDescription> children;
+ private final List<String> fieldNames;
+ private int maxLength = DEFAULT_LENGTH;
+ private int precision = DEFAULT_PRECISION;
+ private int scale = DEFAULT_SCALE;
+
+ public void printToBuffer(StringBuilder buffer) {
+ buffer.append(category.name);
+ switch (category) {
+ case DECIMAL:
+ buffer.append('(');
+ buffer.append(precision);
+ buffer.append(',');
+ buffer.append(scale);
+ buffer.append(')');
+ break;
+ case CHAR:
+ case VARCHAR:
+ buffer.append('(');
+ buffer.append(maxLength);
+ buffer.append(')');
+ break;
+ case LIST:
+ case MAP:
+ case UNION:
+ buffer.append('<');
+ for(int i=0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer.append(',');
+ }
+ children.get(i).printToBuffer(buffer);
+ }
+ buffer.append('>');
+ break;
+ case STRUCT:
+ buffer.append('<');
+ for(int i=0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer.append(',');
+ }
+ buffer.append(fieldNames.get(i));
+ buffer.append(':');
+ children.get(i).printToBuffer(buffer);
+ }
+ buffer.append('>');
+ break;
+ default:
+ break;
+ }
+ }
+
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ printToBuffer(buffer);
+ return buffer.toString();
+ }
+
+ private void printJsonToBuffer(String prefix, StringBuilder buffer,
+ int indent) {
+ for(int i=0; i < indent; ++i) {
+ buffer.append(' ');
+ }
+ buffer.append(prefix);
+ buffer.append("{\"category\": \"");
+ buffer.append(category.name);
+ buffer.append("\", \"id\": ");
+ buffer.append(getId());
+ buffer.append(", \"max\": ");
+ buffer.append(maxId);
+ switch (category) {
+ case DECIMAL:
+ buffer.append(", \"precision\": ");
+ buffer.append(precision);
+ buffer.append(", \"scale\": ");
+ buffer.append(scale);
+ break;
+ case CHAR:
+ case VARCHAR:
+ buffer.append(", \"length\": ");
+ buffer.append(maxLength);
+ break;
+ case LIST:
+ case MAP:
+ case UNION:
+ buffer.append(", \"children\": [");
+ for(int i=0; i < children.size(); ++i) {
+ buffer.append('\n');
+ children.get(i).printJsonToBuffer("", buffer, indent + 2);
+ if (i != children.size() - 1) {
+ buffer.append(',');
+ }
+ }
+ buffer.append("]");
+ break;
+ case STRUCT:
+ buffer.append(", \"fields\": [");
+ for(int i=0; i < children.size(); ++i) {
+ buffer.append('\n');
+ children.get(i).printJsonToBuffer("\"" + fieldNames.get(i) + "\": ",
+ buffer, indent + 2);
+ if (i != children.size() - 1) {
+ buffer.append(',');
+ }
+ }
+ buffer.append(']');
+ break;
+ default:
+ break;
+ }
+ buffer.append('}');
+ }
+
+ public String toJson() {
+ StringBuilder buffer = new StringBuilder();
+ printJsonToBuffer("", buffer, 0);
+ return buffer.toString();
+ }
+}
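As a usage sketch of the fluent builder above (field names and values here are illustrative, not part of the patch): build a schema, then create a matching VectorizedRowBatch.

    TypeDescription schema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong())
        .addField("name", TypeDescription.createString())
        .addField("price", TypeDescription.createDecimal()
            .withPrecision(18).withScale(2));

    // prints: struct<id:bigint,name:string,price:decimal(18,2)>
    System.out.println(schema);

    // one column vector per struct field: LongColumnVector,
    // BytesColumnVector, DecimalColumnVector
    VectorizedRowBatch batch = schema.createRowBatch();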
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/Writer.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/Writer.java b/orc/src/java/org/apache/orc/Writer.java
new file mode 100644
index 0000000..4492062
--- /dev/null
+++ b/orc/src/java/org/apache/orc/Writer.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+
+/**
+ * The interface for writing ORC files.
+ */
+public interface Writer {
+
+ /**
+ * Get the schema for this writer
+ * @return the file schema
+ */
+ TypeDescription getSchema();
+
+ /**
+ * Add arbitrary metadata to the ORC file. This may be called at any point
+ * until the Writer is closed. If the same key is passed a second time, the
+ * second value will replace the first.
+ * @param key a key to label the data with.
+ * @param value the contents of the metadata.
+ */
+ void addUserMetadata(String key, ByteBuffer value);
+
+ /**
+ * Add a row batch to the ORC file.
+ * @param batch the rows to add
+ */
+ void addRowBatch(VectorizedRowBatch batch) throws IOException;
+
+ /**
+ * Flush all of the buffers and close the file. No methods on this writer
+ * should be called afterwards.
+ * @throws IOException
+ */
+ void close() throws IOException;
+
+ /**
+ * Return the deserialized data size. The raw data size is computed when
+ * the file footer is written, so the value is only available after the
+ * writer has been closed.
+ *
+ * @return raw data size
+ */
+ long getRawDataSize();
+
+ /**
+ * Return the number of rows in the file. The row count is updated when the
+ * stripes are flushed, so for an accurate count this method should be
+ * called after closing the writer.
+ *
+ * @return row count
+ */
+ long getNumberOfRows();
+
+ /**
+ * Write an intermediate footer on the file such that if the file is
+ * truncated to the returned offset, it would be a valid ORC file.
+ * @return the offset that would be a valid end location for an ORC file
+ */
+ long writeIntermediateFooter() throws IOException;
+
+ /**
+ * Fast stripe append to an ORC file. This interface is used for fast merging
+ * of ORC files. When merging, the file to be merged should pass the stripe
+ * in binary form along with the stripe information and stripe statistics.
+ * After appending the last stripe of a file, use appendUserMetadata() to
+ * append any user metadata.
+ * @param stripe - stripe as byte array
+ * @param offset - offset within byte array
+ * @param length - length of stripe within byte array
+ * @param stripeInfo - stripe information
+ * @param stripeStatistics - stripe statistics (Protobuf objects can be
+ * merged directly)
+ * @throws IOException
+ */
+ public void appendStripe(byte[] stripe, int offset, int length,
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics) throws IOException;
+
+ /**
+ * When fast stripe append is used for merging ORC stripes, after appending
+ * the last stripe from a file, this interface must be used to merge any
+ * user metadata.
+ * @param userMetadata - user metadata
+ */
+ public void appendUserMetadata(List<OrcProto.UserMetadataItem> userMetadata);
+}
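A hedged sketch of the write loop against this interface. Obtaining the Writer itself is the job of a factory outside this file (e.g. an OrcFile-style createWriter, which is an assumption here, not shown in this diff), and the writer's schema is assumed to be struct<x:bigint>.

    static void writeLongs(Writer writer, long rowCount) throws IOException {
      VectorizedRowBatch batch = writer.getSchema().createRowBatch();
      LongColumnVector x = (LongColumnVector) batch.cols[0];
      for (long row = 0; row < rowCount; ++row) {
        x.vector[batch.size++] = row;
        if (batch.size == batch.getMaxSize()) {
          writer.addRowBatch(batch);      // flush a full batch
          batch.reset();
        }
      }
      if (batch.size > 0) {
        writer.addRowBatch(batch);        // flush the final partial batch
      }
      writer.close();  // row count and raw data size become final here
    }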
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/BitFieldReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BitFieldReader.java b/orc/src/java/org/apache/orc/impl/BitFieldReader.java
new file mode 100644
index 0000000..8d9d3cb
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/BitFieldReader.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.orc.impl.InStream;
+import org.apache.orc.impl.PositionProvider;
+import org.apache.orc.impl.RunLengthByteReader;
+
+public class BitFieldReader {
+ private final RunLengthByteReader input;
+ /** The number of bits in one item. Non-test code always uses 1. */
+ private final int bitSize;
+ private int current;
+ private int bitsLeft;
+ private final int mask;
+
+ public BitFieldReader(InStream input,
+ int bitSize) throws IOException {
+ this.input = new RunLengthByteReader(input);
+ this.bitSize = bitSize;
+ mask = (1 << bitSize) - 1;
+ }
+
+ public void setInStream(InStream inStream) {
+ this.input.setInStream(inStream);
+ }
+
+ private void readByte() throws IOException {
+ if (input.hasNext()) {
+ current = 0xff & input.next();
+ bitsLeft = 8;
+ } else {
+ throw new EOFException("Read past end of bit field from " + this);
+ }
+ }
+
+ public int next() throws IOException {
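+ // values are packed MSB-first: drain the bits left over in the current
+ // byte, refilling as needed, then take the remaining high bits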
+ int result = 0;
+ int bitsLeftToRead = bitSize;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= current & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ readByte();
+ }
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= bitsLeftToRead;
+ result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ return result & mask;
+ }
+
+ /**
+ * Unlike integer readers, where runs are encoded explicitly, here we have to read ahead
+ * to figure out whether we have a run. Given that runs in booleans are likely, it's worth it.
+ * However, it means we'd need to keep track of how many bytes we read, and next/nextVector
+ * won't work anymore once this is called. That is trivial to fix, but the two are never
+ * interspersed in practice.
+ */
+ private boolean lastRunValue;
+ private int lastRunLength = -1;
+ private void readNextRun(int maxRunLength) throws IOException {
+ assert bitSize == 1;
+ if (lastRunLength > 0) return; // last run is not exhausted yet
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ // First take care of the partial bits.
+ boolean hasVal = false;
+ int runLength = 0;
+ if (bitsLeft != 8) {
+ int partialBitsMask = (1 << bitsLeft) - 1;
+ int partialBits = current & partialBitsMask;
+ if (partialBits == partialBitsMask || partialBits == 0) {
+ lastRunValue = (partialBits == partialBitsMask);
+ if (maxRunLength <= bitsLeft) {
+ lastRunLength = maxRunLength;
+ return;
+ }
+ maxRunLength -= bitsLeft;
+ hasVal = true;
+ runLength = bitsLeft;
+ bitsLeft = 0;
+ } else {
+ // There's no run in partial bits. Return whatever we have.
+ int prefixBitsCount = 32 - bitsLeft;
+ runLength = Integer.numberOfLeadingZeros(partialBits) - prefixBitsCount;
+ lastRunValue = (runLength > 0);
+ lastRunLength = Math.min(maxRunLength, lastRunValue ? runLength :
+ (Integer.numberOfLeadingZeros(~(partialBits | ~partialBitsMask)) - prefixBitsCount));
+ return;
+ }
+ assert bitsLeft == 0;
+ readByte();
+ }
+ if (!hasVal) {
+ lastRunValue = ((current >> 7) == 1);
+ hasVal = true;
+ }
+ // Read full bytes until the run ends.
+ assert bitsLeft == 8;
+ while (maxRunLength >= 8
+ && ((lastRunValue && (current == 0xff)) || (!lastRunValue && (current == 0)))) {
+ runLength += 8;
+ maxRunLength -= 8;
+ readByte();
+ }
+ if (maxRunLength > 0) {
+ int extraBits = Integer.numberOfLeadingZeros(
+ lastRunValue ? (~(current | ~255)) : current) - 24;
+ bitsLeft -= extraBits;
+ runLength += extraBits;
+ }
+ lastRunLength = runLength;
+ }
+
+ public void nextVector(LongColumnVector previous,
+ long previousLen) throws IOException {
+ previous.isRepeating = true;
+ for (int i = 0; i < previousLen; i++) {
+ if (!previous.isNull[i]) {
+ previous.vector[i] = next();
+ } else {
+ // The default value for nulls for int types in vectorized
+ // processing is 1, so use that when the value is null
+ previous.vector[i] = 1;
+ }
+
+ // The default value for nulls in vectorization for int types is 1,
+ // and since a non-null value can also be 1, we need to check isNull as well
+ // when determining the isRepeating flag.
+ if (previous.isRepeating
+ && i > 0
+ && ((previous.vector[i - 1] != previous.vector[i]) || (previous.isNull[i - 1] != previous.isNull[i]))) {
+ previous.isRepeating = false;
+ }
+ }
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ input.seek(index);
+ int consumed = (int) index.getNext();
+ if (consumed > 8) {
+ throw new IllegalArgumentException("Seek past end of byte at " +
+ consumed + " in " + input);
+ } else if (consumed != 0) {
+ readByte();
+ bitsLeft = 8 - consumed;
+ } else {
+ bitsLeft = 0;
+ }
+ }
+
+ public void skip(long items) throws IOException {
+ long totalBits = bitSize * items;
+ if (bitsLeft >= totalBits) {
+ bitsLeft -= totalBits;
+ } else {
+ totalBits -= bitsLeft;
+ input.skip(totalBits / 8);
+ current = input.next();
+ bitsLeft = (int) (8 - (totalBits % 8));
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "bit reader current: " + current + " bits left: " + bitsLeft +
+ " bit size: " + bitSize + " from " + input;
+ }
+
+ boolean hasFullByte() {
+ return bitsLeft == 8 || bitsLeft == 0;
+ }
+
+ int peekOneBit() throws IOException {
+ assert bitSize == 1;
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ return (current >>> (bitsLeft - 1)) & 1;
+ }
+
+ int peekFullByte() throws IOException {
+ assert bitSize == 1;
+ assert bitsLeft == 8 || bitsLeft == 0;
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ return current;
+ }
+
+ void skipInCurrentByte(int bits) throws IOException {
+ assert bitsLeft >= bits;
+ bitsLeft -= bits;
+ }
+}
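The extraction in next() is plain MSB-first bit unpacking. The same logic over a raw byte array, with the RunLengthByteReader swapped for a plain array (an illustrative sketch, not part of the patch):

    static int[] unpackBits(byte[] data, int bitSize, int count) {
      int[] out = new int[count];
      int mask = (1 << bitSize) - 1;
      int current = 0, bitsLeft = 0, pos = 0;
      for (int i = 0; i < count; ++i) {
        int result = 0, bitsLeftToRead = bitSize;
        while (bitsLeftToRead > bitsLeft) {
          // take every bit remaining in the current byte, then refill
          result <<= bitsLeft;
          result |= current & ((1 << bitsLeft) - 1);
          bitsLeftToRead -= bitsLeft;
          current = 0xff & data[pos++];
          bitsLeft = 8;
        }
        if (bitsLeftToRead > 0) {
          // take the high bits of the refilled byte that are still needed
          result <<= bitsLeftToRead;
          bitsLeft -= bitsLeftToRead;
          result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
        }
        out[i] = result & mask;
      }
      return out;
    }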
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BitFieldWriter.java b/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
new file mode 100644
index 0000000..aa5f886
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import org.apache.orc.impl.PositionRecorder;
+import org.apache.orc.impl.PositionedOutputStream;
+import org.apache.orc.impl.RunLengthByteWriter;
+
+import java.io.IOException;
+
+public class BitFieldWriter {
+ private RunLengthByteWriter output;
+ private final int bitSize;
+ private byte current = 0;
+ private int bitsLeft = 8;
+
+ public BitFieldWriter(PositionedOutputStream output,
+ int bitSize) throws IOException {
+ this.output = new RunLengthByteWriter(output);
+ this.bitSize = bitSize;
+ }
+
+ private void writeByte() throws IOException {
+ output.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+
+ public void flush() throws IOException {
+ if (bitsLeft != 8) {
+ writeByte();
+ }
+ output.flush();
+ }
+
+ public void write(int value) throws IOException {
+ int bitsToWrite = bitSize;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= value >>> (bitsToWrite - bitsLeft);
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (1 << bitsToWrite) - 1;
+ writeByte();
+ }
+ bitsLeft -= bitsToWrite;
+ current |= value << bitsLeft;
+ if (bitsLeft == 0) {
+ writeByte();
+ }
+ }
+
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ output.getPosition(recorder);
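+ // beyond the byte stream position, also record how many bits of the
+ // current byte have been consumed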
+ recorder.addPosition(8 - bitsLeft);
+ }
+}
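The matching pack side, with the RunLengthByteWriter swapped for a ByteArrayOutputStream (an illustrative sketch; the unpackBits sketch after BitFieldReader reverses it, e.g. unpackBits(packBits(vals, 3), 3, vals.length)):

    static byte[] packBits(int[] values, int bitSize) {
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      int current = 0, bitsLeft = 8;
      for (int value : values) {
        int bitsToWrite = bitSize;
        while (bitsToWrite > bitsLeft) {
          // fill the rest of the current byte with the value's top bits
          current |= value >>> (bitsToWrite - bitsLeft);
          bitsToWrite -= bitsLeft;
          value &= (1 << bitsToWrite) - 1;   // keep only the unwritten low bits
          out.write(current);
          current = 0;
          bitsLeft = 8;
        }
        bitsLeft -= bitsToWrite;
        current |= value << bitsLeft;
        if (bitsLeft == 0) {
          out.write(current);
          current = 0;
          bitsLeft = 8;
        }
      }
      if (bitsLeft != 8) {
        out.write(current);                  // flush the partial final byte
      }
      return out.toByteArray();
    }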
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/BufferChunk.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BufferChunk.java b/orc/src/java/org/apache/orc/impl/BufferChunk.java
new file mode 100644
index 0000000..da43b96
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/BufferChunk.java
@@ -0,0 +1,85 @@
+package org.apache.orc.impl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.ByteBuffer;
+
+/**
+ * The sections of a stripe that we have read.
+ * This might not match diskRange: one disk range can map to multiple buffer
+ * chunks, depending on DFS block boundaries.
+ */
+public class BufferChunk extends DiskRangeList {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(BufferChunk.class);
+ final ByteBuffer chunk;
+
+ public BufferChunk(ByteBuffer chunk, long offset) {
+ super(offset, offset + chunk.remaining());
+ this.chunk = chunk;
+ }
+
+ public ByteBuffer getChunk() {
+ return chunk;
+ }
+
+ @Override
+ public boolean hasData() {
+ return chunk != null;
+ }
+
+ @Override
+ public final String toString() {
+ boolean makesSense = chunk.remaining() == (end - offset);
+ return "data range [" + offset + ", " + end + "), size: " + chunk.remaining()
+ + (makesSense ? "" : "(!)") + " type: " +
+ (chunk.isDirect() ? "direct" : "array-backed");
+ }
+
+ @Override
+ public DiskRange sliceAndShift(long offset, long end, long shiftBy) {
+ assert offset <= end && offset >= this.offset && end <= this.end;
+ assert offset + shiftBy >= 0;
+ ByteBuffer sliceBuf = chunk.slice();
+ int newPos = (int) (offset - this.offset);
+ int newLimit = newPos + (int) (end - offset);
+ try {
+ sliceBuf.position(newPos);
+ sliceBuf.limit(newLimit);
+ } catch (Throwable t) {
+ LOG.error("Failed to slice buffer chunk with range" + " [" + this.offset + ", " + this.end
+ + "), position: " + chunk.position() + " limit: " + chunk.limit() + ", "
+ + (chunk.isDirect() ? "direct" : "array") + "; to [" + offset + ", " + end + ") "
+ + t.getClass());
+ throw new RuntimeException(t);
+ }
+ return new BufferChunk(sliceBuf, offset + shiftBy);
+ }
+
+ @Override
+ public ByteBuffer getData() {
+ return chunk;
+ }
+}
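sliceAndShift relies on ByteBuffer.slice() semantics: position 0 of the slice corresponds to the parent buffer's current position, so the new position and limit are computed relative to the chunk's start offset. A small illustrative use (the offsets are made up):

    ByteBuffer data = ByteBuffer.wrap(new byte[100]);    // backs range [1000, 1100)
    BufferChunk bc = new BufferChunk(data, 1000);
    // expose bytes [1020, 1050) of the file, shifted down by 1000
    DiskRange sub = bc.sliceAndShift(1020, 1050, -1000);
    // sub covers [20, 50); sub.getData().remaining() == 30; no bytes copied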
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
new file mode 100644
index 0000000..745ed9a
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -0,0 +1,1097 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.sql.Date;
+import java.sql.Timestamp;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.orc.BinaryColumnStatistics;
+import org.apache.orc.BooleanColumnStatistics;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.DateColumnStatistics;
+import org.apache.orc.DecimalColumnStatistics;
+import org.apache.orc.DoubleColumnStatistics;
+import org.apache.orc.IntegerColumnStatistics;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StringColumnStatistics;
+import org.apache.orc.TimestampColumnStatistics;
+import org.apache.orc.TypeDescription;
+
+public class ColumnStatisticsImpl implements ColumnStatistics {
+
+ private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl
+ implements BooleanColumnStatistics {
+ private long trueCount = 0;
+
+ BooleanStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.BucketStatistics bkt = stats.getBucketStatistics();
+ trueCount = bkt.getCount(0);
+ }
+
+ BooleanStatisticsImpl() {
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ trueCount = 0;
+ }
+
+ @Override
+ public void updateBoolean(boolean value, int repetitions) {
+ if (value) {
+ trueCount += repetitions;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof BooleanStatisticsImpl) {
+ BooleanStatisticsImpl bkt = (BooleanStatisticsImpl) other;
+ trueCount += bkt.trueCount;
+ } else {
+ if (isStatsExists() && trueCount != 0) {
+ throw new IllegalArgumentException("Incompatible merging of boolean column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.BucketStatistics.Builder bucket =
+ OrcProto.BucketStatistics.newBuilder();
+ bucket.addCount(trueCount);
+ builder.setBucketStatistics(bucket);
+ return builder;
+ }
+
+ @Override
+ public long getFalseCount() {
+ return getNumberOfValues() - trueCount;
+ }
+
+ @Override
+ public long getTrueCount() {
+ return trueCount;
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + " true: " + trueCount;
+ }
+ }
+
+ private static final class IntegerStatisticsImpl extends ColumnStatisticsImpl
+ implements IntegerColumnStatistics {
+
+ private long minimum = Long.MAX_VALUE;
+ private long maximum = Long.MIN_VALUE;
+ private long sum = 0;
+ private boolean hasMinimum = false;
+ private boolean overflow = false;
+
+ IntegerStatisticsImpl() {
+ }
+
+ IntegerStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.IntegerStatistics intStat = stats.getIntStatistics();
+ if (intStat.hasMinimum()) {
+ hasMinimum = true;
+ minimum = intStat.getMinimum();
+ }
+ if (intStat.hasMaximum()) {
+ maximum = intStat.getMaximum();
+ }
+ if (intStat.hasSum()) {
+ sum = intStat.getSum();
+ } else {
+ overflow = true;
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ hasMinimum = false;
+ minimum = Long.MAX_VALUE;
+ maximum = Long.MIN_VALUE;
+ sum = 0;
+ overflow = false;
+ }
+
+ @Override
+ public void updateInteger(long value, int repetitions) {
+ if (!hasMinimum) {
+ hasMinimum = true;
+ minimum = value;
+ maximum = value;
+ } else if (value < minimum) {
+ minimum = value;
+ } else if (value > maximum) {
+ maximum = value;
+ }
+ if (!overflow) {
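+ // sign-based overflow check: if value and the old sum share a sign
+ // but the new sum does not, the addition overflowed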
+ boolean wasPositive = sum >= 0;
+ sum += value * repetitions;
+ if ((value >= 0) == wasPositive) {
+ overflow = (sum >= 0) != wasPositive;
+ }
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof IntegerStatisticsImpl) {
+ IntegerStatisticsImpl otherInt = (IntegerStatisticsImpl) other;
+ if (!hasMinimum) {
+ hasMinimum = otherInt.hasMinimum;
+ minimum = otherInt.minimum;
+ maximum = otherInt.maximum;
+ } else if (otherInt.hasMinimum) {
+ if (otherInt.minimum < minimum) {
+ minimum = otherInt.minimum;
+ }
+ if (otherInt.maximum > maximum) {
+ maximum = otherInt.maximum;
+ }
+ }
+
+ overflow |= otherInt.overflow;
+ if (!overflow) {
+ boolean wasPositive = sum >= 0;
+ sum += otherInt.sum;
+ if ((otherInt.sum >= 0) == wasPositive) {
+ overflow = (sum >= 0) != wasPositive;
+ }
+ }
+ } else {
+ if (isStatsExists() && hasMinimum) {
+ throw new IllegalArgumentException("Incompatible merging of integer column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.IntegerStatistics.Builder intb =
+ OrcProto.IntegerStatistics.newBuilder();
+ if (hasMinimum) {
+ intb.setMinimum(minimum);
+ intb.setMaximum(maximum);
+ }
+ if (!overflow) {
+ intb.setSum(sum);
+ }
+ builder.setIntStatistics(intb);
+ return builder;
+ }
+
+ @Override
+ public long getMinimum() {
+ return minimum;
+ }
+
+ @Override
+ public long getMaximum() {
+ return maximum;
+ }
+
+ @Override
+ public boolean isSumDefined() {
+ return !overflow;
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (hasMinimum) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ }
+ if (!overflow) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl
+ implements DoubleColumnStatistics {
+ private boolean hasMinimum = false;
+ private double minimum = Double.MAX_VALUE;
+ private double maximum = Double.MIN_VALUE;
+ private double sum = 0;
+
+ DoubleStatisticsImpl() {
+ }
+
+ DoubleStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DoubleStatistics dbl = stats.getDoubleStatistics();
+ if (dbl.hasMinimum()) {
+ hasMinimum = true;
+ minimum = dbl.getMinimum();
+ }
+ if (dbl.hasMaximum()) {
+ maximum = dbl.getMaximum();
+ }
+ if (dbl.hasSum()) {
+ sum = dbl.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ hasMinimum = false;
+ minimum = Double.MAX_VALUE;
+ maximum = Double.MIN_VALUE;
+ sum = 0;
+ }
+
+ @Override
+ public void updateDouble(double value) {
+ if (!hasMinimum) {
+ hasMinimum = true;
+ minimum = value;
+ maximum = value;
+ } else if (value < minimum) {
+ minimum = value;
+ } else if (value > maximum) {
+ maximum = value;
+ }
+ sum += value;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DoubleStatisticsImpl) {
+ DoubleStatisticsImpl dbl = (DoubleStatisticsImpl) other;
+ if (!hasMinimum) {
+ hasMinimum = dbl.hasMinimum;
+ minimum = dbl.minimum;
+ maximum = dbl.maximum;
+ } else if (dbl.hasMinimum) {
+ if (dbl.minimum < minimum) {
+ minimum = dbl.minimum;
+ }
+ if (dbl.maximum > maximum) {
+ maximum = dbl.maximum;
+ }
+ }
+ sum += dbl.sum;
+ } else {
+ if (isStatsExists() && hasMinimum) {
+ throw new IllegalArgumentException("Incompatible merging of double column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.DoubleStatistics.Builder dbl =
+ OrcProto.DoubleStatistics.newBuilder();
+ if (hasMinimum) {
+ dbl.setMinimum(minimum);
+ dbl.setMaximum(maximum);
+ }
+ dbl.setSum(sum);
+ builder.setDoubleStatistics(dbl);
+ return builder;
+ }
+
+ @Override
+ public double getMinimum() {
+ return minimum;
+ }
+
+ @Override
+ public double getMaximum() {
+ return maximum;
+ }
+
+ @Override
+ public double getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (hasMinimum) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ }
+ buf.append(" sum: ");
+ buf.append(sum);
+ return buf.toString();
+ }
+ }
+
+ protected static final class StringStatisticsImpl extends ColumnStatisticsImpl
+ implements StringColumnStatistics {
+ private Text minimum = null;
+ private Text maximum = null;
+ private long sum = 0;
+
+ StringStatisticsImpl() {
+ }
+
+ StringStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.StringStatistics str = stats.getStringStatistics();
+ if (str.hasMaximum()) {
+ maximum = new Text(str.getMaximum());
+ }
+ if (str.hasMinimum()) {
+ minimum = new Text(str.getMinimum());
+ }
+ if(str.hasSum()) {
+ sum = str.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ sum = 0;
+ }
+
+ @Override
+ public void updateString(Text value) {
+ if (minimum == null) {
+ maximum = minimum = new Text(value);
+ } else if (minimum.compareTo(value) > 0) {
+ minimum = new Text(value);
+ } else if (maximum.compareTo(value) < 0) {
+ maximum = new Text(value);
+ }
+ sum += value.getLength();
+ }
+
+ @Override
+ public void updateString(byte[] bytes, int offset, int length,
+ int repetitions) {
+ if (minimum == null) {
+ maximum = minimum = new Text();
+ maximum.set(bytes, offset, length);
+ } else if (WritableComparator.compareBytes(minimum.getBytes(), 0,
+ minimum.getLength(), bytes, offset, length) > 0) {
+ minimum = new Text();
+ minimum.set(bytes, offset, length);
+ } else if (WritableComparator.compareBytes(maximum.getBytes(), 0,
+ maximum.getLength(), bytes, offset, length) < 0) {
+ maximum = new Text();
+ maximum.set(bytes, offset, length);
+ }
+ sum += length * repetitions;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof StringStatisticsImpl) {
+ StringStatisticsImpl str = (StringStatisticsImpl) other;
+ if (minimum == null) {
+ if (str.minimum != null) {
+ maximum = new Text(str.getMaximum());
+ minimum = new Text(str.getMinimum());
+ } else {
+ /* both are empty */
+ maximum = minimum = null;
+ }
+ } else if (str.minimum != null) {
+ if (minimum.compareTo(str.minimum) > 0) {
+ minimum = new Text(str.getMinimum());
+ }
+ if (maximum.compareTo(str.maximum) < 0) {
+ maximum = new Text(str.getMaximum());
+ }
+ }
+ sum += str.sum;
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of string column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.StringStatistics.Builder str =
+ OrcProto.StringStatistics.newBuilder();
+ if (getNumberOfValues() != 0) {
+ str.setMinimum(getMinimum());
+ str.setMaximum(getMaximum());
+ str.setSum(sum);
+ }
+ result.setStringStatistics(str);
+ return result;
+ }
+
+ @Override
+ public String getMinimum() {
+ return minimum == null ? null : minimum.toString();
+ }
+
+ @Override
+ public String getMaximum() {
+ return maximum == null ? null : maximum.toString();
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements
+ BinaryColumnStatistics {
+
+ private long sum = 0;
+
+ BinaryStatisticsImpl() {
+ }
+
+ BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics();
+ if (binStats.hasSum()) {
+ sum = binStats.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ sum = 0;
+ }
+
+ @Override
+ public void updateBinary(BytesWritable value) {
+ sum += value.getLength();
+ }
+
+ @Override
+ public void updateBinary(byte[] bytes, int offset, int length,
+ int repetitions) {
+ sum += length * repetitions;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof BinaryColumnStatistics) {
+ BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other;
+ sum += bin.sum;
+ } else {
+ if (isStatsExists() && sum != 0) {
+ throw new IllegalArgumentException("Incompatible merging of binary column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder();
+ bin.setSum(sum);
+ result.setBinaryStatistics(bin);
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl
+ implements DecimalColumnStatistics {
+ private HiveDecimal minimum = null;
+ private HiveDecimal maximum = null;
+ private HiveDecimal sum = HiveDecimal.ZERO;
+
+ DecimalStatisticsImpl() {
+ }
+
+ DecimalStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DecimalStatistics dec = stats.getDecimalStatistics();
+ if (dec.hasMaximum()) {
+ maximum = HiveDecimal.create(dec.getMaximum());
+ }
+ if (dec.hasMinimum()) {
+ minimum = HiveDecimal.create(dec.getMinimum());
+ }
+ if (dec.hasSum()) {
+ sum = HiveDecimal.create(dec.getSum());
+ } else {
+ sum = null;
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ sum = HiveDecimal.ZERO;
+ }
+
+ @Override
+ public void updateDecimal(HiveDecimal value) {
+ if (minimum == null) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum.compareTo(value) > 0) {
+ minimum = value;
+ } else if (maximum.compareTo(value) < 0) {
+ maximum = value;
+ }
+ if (sum != null) {
+ sum = sum.add(value);
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DecimalStatisticsImpl) {
+ DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = dec.minimum;
+ maximum = dec.maximum;
+ sum = dec.sum;
+ } else if (dec.minimum != null) {
+ if (minimum.compareTo(dec.minimum) > 0) {
+ minimum = dec.minimum;
+ }
+ if (maximum.compareTo(dec.maximum) < 0) {
+ maximum = dec.maximum;
+ }
+ if (sum == null || dec.sum == null) {
+ sum = null;
+ } else {
+ sum = sum.add(dec.sum);
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of decimal column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.DecimalStatistics.Builder dec =
+ OrcProto.DecimalStatistics.newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ dec.setMinimum(minimum.toString());
+ dec.setMaximum(maximum.toString());
+ }
+ if (sum != null) {
+ dec.setSum(sum.toString());
+ }
+ result.setDecimalStatistics(dec);
+ return result;
+ }
+
+ @Override
+ public HiveDecimal getMinimum() {
+ return minimum;
+ }
+
+ @Override
+ public HiveDecimal getMaximum() {
+ return maximum;
+ }
+
+ @Override
+ public HiveDecimal getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ if (sum != null) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DateStatisticsImpl extends ColumnStatisticsImpl
+ implements DateColumnStatistics {
+ private Integer minimum = null;
+ private Integer maximum = null;
+
+ DateStatisticsImpl() {
+ }
+
+ DateStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DateStatistics dateStats = stats.getDateStatistics();
+ // min,max values serialized/deserialized as int (days since epoch)
+ if (dateStats.hasMaximum()) {
+ maximum = dateStats.getMaximum();
+ }
+ if (dateStats.hasMinimum()) {
+ minimum = dateStats.getMinimum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ }
+
+ @Override
+ public void updateDate(DateWritable value) {
+ if (minimum == null) {
+ minimum = value.getDays();
+ maximum = value.getDays();
+ } else if (minimum > value.getDays()) {
+ minimum = value.getDays();
+ } else if (maximum < value.getDays()) {
+ maximum = value.getDays();
+ }
+ }
+
+ @Override
+ public void updateDate(int value) {
+ if (minimum == null) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum > value) {
+ minimum = value;
+ } else if (maximum < value) {
+ maximum = value;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DateStatisticsImpl) {
+ DateStatisticsImpl dateStats = (DateStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = dateStats.minimum;
+ maximum = dateStats.maximum;
+ } else if (dateStats.minimum != null) {
+ if (minimum > dateStats.minimum) {
+ minimum = dateStats.minimum;
+ }
+ if (maximum < dateStats.maximum) {
+ maximum = dateStats.maximum;
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of date column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.DateStatistics.Builder dateStats =
+ OrcProto.DateStatistics.newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ dateStats.setMinimum(minimum);
+ dateStats.setMaximum(maximum);
+ }
+ result.setDateStatistics(dateStats);
+ return result;
+ }
+
+ private transient final DateWritable minDate = new DateWritable();
+ private transient final DateWritable maxDate = new DateWritable();
+
+ @Override
+ public Date getMinimum() {
+ if (minimum == null) {
+ return null;
+ }
+ minDate.set(minimum);
+ return minDate.get();
+ }
+
+ @Override
+ public Date getMaximum() {
+ if (maximum == null) {
+ return null;
+ }
+ maxDate.set(maximum);
+ return maxDate.get();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl
+ implements TimestampColumnStatistics {
+ private Long minimum = null;
+ private Long maximum = null;
+
+ TimestampStatisticsImpl() {
+ }
+
+ TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics();
+ // min,max values serialized/deserialized as long (milliseconds since epoch)
+ if (timestampStats.hasMaximum()) {
+ maximum = timestampStats.getMaximum();
+ }
+ if (timestampStats.hasMinimum()) {
+ minimum = timestampStats.getMinimum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ }
+
+ @Override
+ public void updateTimestamp(Timestamp value) {
+ if (minimum == null) {
+ minimum = value.getTime();
+ maximum = value.getTime();
+ } else if (minimum > value.getTime()) {
+ minimum = value.getTime();
+ } else if (maximum < value.getTime()) {
+ maximum = value.getTime();
+ }
+ }
+
+ @Override
+ public void updateTimestamp(long value) {
+ if (minimum == null) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum > value) {
+ minimum = value;
+ } else if (maximum < value) {
+ maximum = value;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof TimestampStatisticsImpl) {
+ TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = timestampStats.minimum;
+ maximum = timestampStats.maximum;
+ } else if (timestampStats.minimum != null) {
+ if (minimum > timestampStats.minimum) {
+ minimum = timestampStats.minimum;
+ }
+ if (maximum < timestampStats.maximum) {
+ maximum = timestampStats.maximum;
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of timestamp column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics
+ .newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ timestampStats.setMinimum(minimum);
+ timestampStats.setMaximum(maximum);
+ }
+ result.setTimestampStatistics(timestampStats);
+ return result;
+ }
+
+ @Override
+ public Timestamp getMinimum() {
+ return minimum == null ? null : new Timestamp(minimum);
+ }
+
+ @Override
+ public Timestamp getMaximum() {
+ return maximum == null ? null : new Timestamp(maximum);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ }
+ return buf.toString();
+ }
+ }
+
+ private long count = 0;
+ private boolean hasNull = false;
+
+ ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ if (stats.hasNumberOfValues()) {
+ count = stats.getNumberOfValues();
+ }
+
+ if (stats.hasHasNull()) {
+ hasNull = stats.getHasNull();
+ } else {
+ hasNull = true;
+ }
+ }
+
+ ColumnStatisticsImpl() {
+ }
+
+ public void increment() {
+ count += 1;
+ }
+
+ public void increment(int count) {
+ this.count += count;
+ }
+
+ public void setNull() {
+ hasNull = true;
+ }
+
+ public void updateBoolean(boolean value, int repetitions) {
+ throw new UnsupportedOperationException("Can't update boolean");
+ }
+
+ public void updateInteger(long value, int repetitions) {
+ throw new UnsupportedOperationException("Can't update integer");
+ }
+
+ public void updateDouble(double value) {
+ throw new UnsupportedOperationException("Can't update double");
+ }
+
+ public void updateString(Text value) {
+ throw new UnsupportedOperationException("Can't update string");
+ }
+
+ public void updateString(byte[] bytes, int offset, int length,
+ int repetitions) {
+ throw new UnsupportedOperationException("Can't update string");
+ }
+
+ public void updateBinary(BytesWritable value) {
+ throw new UnsupportedOperationException("Can't update binary");
+ }
+
+ public void updateBinary(byte[] bytes, int offset, int length,
+ int repetitions) {
+ throw new UnsupportedOperationException("Can't update binary");
+ }
+
+ public void updateDecimal(HiveDecimal value) {
+ throw new UnsupportedOperationException("Can't update decimal");
+ }
+
+ public void updateDate(DateWritable value) {
+ throw new UnsupportedOperationException("Can't update date");
+ }
+
+ public void updateDate(int value) {
+ throw new UnsupportedOperationException("Can't update date");
+ }
+
+ public void updateTimestamp(Timestamp value) {
+ throw new UnsupportedOperationException("Can't update timestamp");
+ }
+
+ public void updateTimestamp(long value) {
+ throw new UnsupportedOperationException("Can't update timestamp");
+ }
+
+ public boolean isStatsExists() {
+ return count > 0 || hasNull;
+ }
+
+ public void merge(ColumnStatisticsImpl stats) {
+ count += stats.count;
+ hasNull |= stats.hasNull;
+ }
+
+ public void reset() {
+ count = 0;
+ hasNull = false;
+ }
+
+ @Override
+ public long getNumberOfValues() {
+ return count;
+ }
+
+ @Override
+ public boolean hasNull() {
+ return hasNull;
+ }
+
+ @Override
+ public String toString() {
+ return "count: " + count + " hasNull: " + hasNull;
+ }
+
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder =
+ OrcProto.ColumnStatistics.newBuilder();
+ builder.setNumberOfValues(count);
+ builder.setHasNull(hasNull);
+ return builder;
+ }
+
+ public static ColumnStatisticsImpl create(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return new BooleanStatisticsImpl();
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new IntegerStatisticsImpl();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleStatisticsImpl();
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringStatisticsImpl();
+ case DECIMAL:
+ return new DecimalStatisticsImpl();
+ case DATE:
+ return new DateStatisticsImpl();
+ case TIMESTAMP:
+ return new TimestampStatisticsImpl();
+ case BINARY:
+ return new BinaryStatisticsImpl();
+ default:
+ return new ColumnStatisticsImpl();
+ }
+ }
+
+ public static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) {
+ if (stats.hasBucketStatistics()) {
+ return new BooleanStatisticsImpl(stats);
+ } else if (stats.hasIntStatistics()) {
+ return new IntegerStatisticsImpl(stats);
+ } else if (stats.hasDoubleStatistics()) {
+ return new DoubleStatisticsImpl(stats);
+ } else if (stats.hasStringStatistics()) {
+ return new StringStatisticsImpl(stats);
+ } else if (stats.hasDecimalStatistics()) {
+ return new DecimalStatisticsImpl(stats);
+ } else if (stats.hasDateStatistics()) {
+ return new DateStatisticsImpl(stats);
+ } else if (stats.hasTimestampStatistics()) {
+ return new TimestampStatisticsImpl(stats);
+ } else if (stats.hasBinaryStatistics()) {
+ return new BinaryStatisticsImpl(stats);
+ } else {
+ return new ColumnStatisticsImpl(stats);
+ }
+ }
+}
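
Taken together, create(), the updateXxx() overrides, merge(), and serialize() form the writer-side statistics flow: create one impl per column from the schema, feed it values, merge stripe-level stats into file-level stats, and serialize to protobuf. A minimal sketch of that flow, assuming a TypeDescription.createTimestamp() factory analogous to the createBoolean()/createByte() factories (the values are illustrative, not part of this patch):

    // Hedged usage sketch -- createTimestamp() is assumed by analogy
    // with the other factory methods; the timestamps are made up.
    TypeDescription schema = TypeDescription.createTimestamp();
    ColumnStatisticsImpl stripe1 = ColumnStatisticsImpl.create(schema);
    ColumnStatisticsImpl stripe2 = ColumnStatisticsImpl.create(schema);
    stripe1.increment();
    stripe1.updateTimestamp(Timestamp.valueOf("2015-12-11 16:28:10"));
    stripe2.increment();
    stripe2.updateTimestamp(Timestamp.valueOf("2015-12-12 00:28:10"));
    stripe1.merge(stripe2);   // file-level stats: count 2, min/max span both
    OrcProto.ColumnStatistics.Builder pb = stripe1.serialize();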
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java b/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
new file mode 100644
index 0000000..7e0110d
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import org.apache.orc.CompressionCodec;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public interface DirectDecompressionCodec extends CompressionCodec {
+ public boolean isAvailable();
+ public void directDecompress(ByteBuffer in, ByteBuffer out) throws IOException;
+}
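
Implementations of this interface layer a zero-copy native path on top of the regular codec; callers are expected to probe isAvailable() first and fall back to generic stream-based decompression otherwise. A hedged sketch of that calling pattern (the helper method and fallback are illustrative assumptions, not part of this patch):

    static void decompress(CompressionCodec codec, ByteBuffer in,
                           ByteBuffer out) throws IOException {
      if (codec instanceof DirectDecompressionCodec &&
          ((DirectDecompressionCodec) codec).isAvailable()) {
        // native direct-buffer path, no intermediate byte[] copies
        ((DirectDecompressionCodec) codec).directDecompress(in, out);
      } else {
        // fall back to the regular CompressionCodec path
      }
    }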
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DynamicByteArray.java b/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
new file mode 100644
index 0000000..986c2ac
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
@@ -0,0 +1,303 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A class that is a growable array of bytes. Growth is managed in terms of
+ * chunks that are allocated when needed.
+ */
+public final class DynamicByteArray {
+ static final int DEFAULT_CHUNKSIZE = 32 * 1024;
+ static final int DEFAULT_NUM_CHUNKS = 128;
+
+ private final int chunkSize; // our allocation sizes
+ private byte[][] data; // the real data
+ private int length; // max set element index +1
+ private int initializedChunks = 0; // the number of chunks created
+
+ public DynamicByteArray() {
+ this(DEFAULT_NUM_CHUNKS, DEFAULT_CHUNKSIZE);
+ }
+
+ public DynamicByteArray(int numChunks, int chunkSize) {
+ if (chunkSize == 0) {
+ throw new IllegalArgumentException("bad chunksize");
+ }
+ this.chunkSize = chunkSize;
+ data = new byte[numChunks][];
+ }
+
+ /**
+ * Ensure that the chunk for the given chunk index has been allocated.
+ */
+ private void grow(int chunkIndex) {
+ if (chunkIndex >= initializedChunks) {
+ if (chunkIndex >= data.length) {
+ int newSize = Math.max(chunkIndex + 1, 2 * data.length);
+ byte[][] newChunk = new byte[newSize][];
+ System.arraycopy(data, 0, newChunk, 0, data.length);
+ data = newChunk;
+ }
+ for(int i=initializedChunks; i <= chunkIndex; ++i) {
+ data[i] = new byte[chunkSize];
+ }
+ initializedChunks = chunkIndex + 1;
+ }
+ }
+
+ public byte get(int index) {
+ if (index >= length) {
+ throw new IndexOutOfBoundsException("Index " + index +
+ " is outside of 0.." +
+ (length - 1));
+ }
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ return data[i][j];
+ }
+
+ public void set(int index, byte value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] = value;
+ }
+
+ public int add(byte value) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow(i);
+ data[i][j] = value;
+ int result = length;
+ length += 1;
+ return result;
+ }
+
+ /**
+ * Copy a slice of a byte array into our buffer.
+ * @param value the array to copy from
+ * @param valueOffset the first location to copy from value
+ * @param valueLength the number of bytes to copy from value
+ * @return the offset of the start of the value
+ */
+ public int add(byte[] value, int valueOffset, int valueLength) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow((length + valueLength) / chunkSize);
+ int remaining = valueLength;
+ while (remaining > 0) {
+ int size = Math.min(remaining, chunkSize - j);
+ System.arraycopy(value, valueOffset, data[i], j, size);
+ remaining -= size;
+ valueOffset += size;
+ i += 1;
+ j = 0;
+ }
+ int result = length;
+ length += valueLength;
+ return result;
+ }
+
+ /**
+ * Read the entire stream into this array.
+ * @param in the stream to read from
+ * @throws IOException
+ */
+ public void readAll(InputStream in) throws IOException {
+ int currentChunk = length / chunkSize;
+ int currentOffset = length % chunkSize;
+ grow(currentChunk);
+ int currentLength = in.read(data[currentChunk], currentOffset,
+ chunkSize - currentOffset);
+ while (currentLength > 0) {
+ length += currentLength;
+ currentOffset = length % chunkSize;
+ if (currentOffset == 0) {
+ currentChunk = length / chunkSize;
+ grow(currentChunk);
+ }
+ currentLength = in.read(data[currentChunk], currentOffset,
+ chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Byte compare a set of bytes against the bytes in this dynamic array.
+ * @param other source of the other bytes
+ * @param otherOffset start offset in the other array
+ * @param otherLength number of bytes in the other array
+ * @param ourOffset the offset in our array
+ * @param ourLength the number of bytes in our array
+ * @return negative if the other bytes sort before ours, 0 if equal, positive if they sort after
+ */
+ public int compare(byte[] other, int otherOffset, int otherLength,
+ int ourOffset, int ourLength) {
+ int currentChunk = ourOffset / chunkSize;
+ int currentOffset = ourOffset % chunkSize;
+ int maxLength = Math.min(otherLength, ourLength);
+ while (maxLength > 0 &&
+ other[otherOffset] == data[currentChunk][currentOffset]) {
+ otherOffset += 1;
+ currentOffset += 1;
+ if (currentOffset == chunkSize) {
+ currentChunk += 1;
+ currentOffset = 0;
+ }
+ maxLength -= 1;
+ }
+ if (maxLength == 0) {
+ return otherLength - ourLength;
+ }
+ int otherByte = 0xff & other[otherOffset];
+ int ourByte = 0xff & data[currentChunk][currentOffset];
+ return otherByte > ourByte ? 1 : -1;
+ }
+
+ /**
+ * Get the size of the array.
+ * @return the number of bytes in the array
+ */
+ public int size() {
+ return length;
+ }
+
+ /**
+ * Clear the array to its original pristine state.
+ */
+ public void clear() {
+ length = 0;
+ for(int i=0; i < data.length; ++i) {
+ data[i] = null;
+ }
+ initializedChunks = 0;
+ }
+
+ /**
+ * Set a text value from the bytes in this dynamic array.
+ * @param result the value to set
+ * @param offset the start of the bytes to copy
+ * @param length the number of bytes to copy
+ */
+ public void setText(Text result, int offset, int length) {
+ result.clear();
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ while (length > 0) {
+ result.append(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(length, chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Write out a range of this dynamic array to an output stream.
+ * @param out the stream to write to
+ * @param offset the first offset to write
+ * @param length the number of bytes to write
+ * @throws IOException
+ */
+ public void write(OutputStream out, int offset,
+ int length) throws IOException {
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ while (length > 0) {
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ out.write(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ }
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder(length * 3);
+
+ sb.append('{');
+ for (int i = 0; i < length; i++) {
+ if (i != 0) {
+ sb.append(',');
+ }
+ sb.append(Integer.toHexString(get(i)));
+ }
+ sb.append('}');
+
+ return sb.toString();
+ }
+
+ public void setByteBuffer(ByteBuffer result, int offset, int length) {
+ result.clear();
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ while (length > 0) {
+ result.put(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(length, chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Gets all the bytes of the array.
+ *
+ * @return Bytes of the array
+ */
+ public byte[] get() {
+ byte[] result = null;
+ if (length > 0) {
+ int currentChunk = 0;
+ int currentOffset = 0;
+ int currentLength = Math.min(length, chunkSize);
+ int destOffset = 0;
+ result = new byte[length];
+ int totalLength = length;
+ while (totalLength > 0) {
+ System.arraycopy(data[currentChunk], currentOffset, result, destOffset, currentLength);
+ destOffset += currentLength;
+ totalLength -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(totalLength, chunkSize - currentOffset);
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Get the number of bytes currently allocated for the buffers.
+ */
+ public long getSizeInBytes() {
+ return (long) initializedChunks * chunkSize;
+ }
+}
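
Because add() returns the offset where the bytes landed and existing chunks are never moved, the class works as an append-only byte pool: remember the offset, read the bytes back later. A hedged sketch of that pattern (the string value is illustrative):

    DynamicByteArray pool = new DynamicByteArray();
    byte[] word = "orc".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    int offset = pool.add(word, 0, word.length);  // offset is stable forever
    Text text = new Text();
    pool.setText(text, offset, word.length);      // text now holds "orc"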
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DynamicIntArray.java b/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
new file mode 100644
index 0000000..3b2884b
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+/**
+ * Dynamic int array that uses primitive types and chunks to avoid copying
+ * large numbers of integers when it resizes.
+ *
+ * The motivation for this class is memory optimization, i.e. space-efficient
+ * storage of potentially huge arrays without good a-priori size guesses.
+ *
+ * The API of this class sits between a primitive array and an AbstractList. It's
+ * not a Collection implementation because it handles primitive types, but the
+ * API could be extended to support iterators and the like.
+ *
+ * NOTE: Like standard Collection implementations/arrays, this class is not
+ * synchronized.
+ */
+public final class DynamicIntArray {
+ static final int DEFAULT_CHUNKSIZE = 8 * 1024;
+ static final int INIT_CHUNKS = 128;
+
+ private final int chunkSize; // our allocation size
+ private int[][] data; // the real data
+ private int length; // max set element index +1
+ private int initializedChunks = 0; // the number of created chunks
+
+ public DynamicIntArray() {
+ this(DEFAULT_CHUNKSIZE);
+ }
+
+ public DynamicIntArray(int chunkSize) {
+ this.chunkSize = chunkSize;
+
+ data = new int[INIT_CHUNKS][];
+ }
+
+ /**
+ * Ensure that the chunk for the given chunk index has been allocated.
+ */
+ private void grow(int chunkIndex) {
+ if (chunkIndex >= initializedChunks) {
+ if (chunkIndex >= data.length) {
+ int newSize = Math.max(chunkIndex + 1, 2 * data.length);
+ int[][] newChunk = new int[newSize][];
+ System.arraycopy(data, 0, newChunk, 0, data.length);
+ data = newChunk;
+ }
+ for (int i=initializedChunks; i <= chunkIndex; ++i) {
+ data[i] = new int[chunkSize];
+ }
+ initializedChunks = chunkIndex + 1;
+ }
+ }
+
+ public int get(int index) {
+ if (index >= length) {
+ throw new IndexOutOfBoundsException("Index " + index +
+ " is outside of 0.." +
+ (length - 1));
+ }
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ return data[i][j];
+ }
+
+ public void set(int index, int value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] = value;
+ }
+
+ public void increment(int index, int value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] += value;
+ }
+
+ public void add(int value) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow(i);
+ data[i][j] = value;
+ length += 1;
+ }
+
+ public int size() {
+ return length;
+ }
+
+ public void clear() {
+ length = 0;
+ for(int i=0; i < data.length; ++i) {
+ data[i] = null;
+ }
+ initializedChunks = 0;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder(length * 4);
+
+ sb.append('{');
+ for (int i = 0; i < length; i++) {
+ if (i != 0) {
+ sb.append(',');
+ }
+ sb.append(get(i));
+ }
+ sb.append('}');
+
+ return sb.toString();
+ }
+
+ public int getSizeInBytes() {
+ return 4 * initializedChunks * chunkSize;
+ }
+}
+
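
A short sketch of the chunked growth: appends past the current capacity allocate new chunks lazily and never copy the existing ones, so getSizeInBytes() reports whole allocated chunks rather than live elements (the sizes below are illustrative):

    DynamicIntArray counts = new DynamicIntArray(1024); // 1K-int chunks
    for (int i = 0; i < 5000; ++i) {
      counts.add(i);              // lazily initializes chunks 0..4
    }
    counts.increment(42, 7);      // in-place bump, no chunk copying
    assert counts.get(42) == 49;
    assert counts.getSizeInBytes() == 4 * 5 * 1024; // 5 chunks of 1024 ints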
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/HadoopShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims.java b/orc/src/java/org/apache/orc/impl/HadoopShims.java
new file mode 100644
index 0000000..2980d71
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/HadoopShims.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import org.apache.hadoop.util.VersionInfo;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public interface HadoopShims {
+
+ enum DirectCompressionType {
+ NONE,
+ ZLIB_NOHEADER,
+ ZLIB,
+ SNAPPY,
+ }
+
+ interface DirectDecompressor {
+ void decompress(ByteBuffer input, ByteBuffer output) throws IOException;
+ }
+
+ /**
+ * Get a direct decompressor, if one is available for the given codec.
+ * @param codec the kind of compression to decompress
+ * @return a direct decompressor, or null if none is available
+ */
+ DirectDecompressor getDirectDecompressor(DirectCompressionType codec);
+
+
+ class Factory {
+ private static HadoopShims SHIMS = null;
+
+ public static synchronized HadoopShims get() {
+ if (SHIMS == null) {
+ String[] versionParts = VersionInfo.getVersion().split("[.]");
+ int major = Integer.parseInt(versionParts[0]);
+ int minor = Integer.parseInt(versionParts[1]);
+ if (major < 2 || (major == 2 && minor < 3)) {
+ SHIMS = new HadoopShims_2_2();
+ } else {
+ SHIMS = new HadoopShimsCurrent();
+ }
+ }
+ return SHIMS;
+ }
+ }
+}
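
The factory parses the running Hadoop version once and caches the shim, so a null decompressor is the signal that the cluster (anything before 2.3) has no direct-buffer support. A hedged sketch of the lookup (the buffer variables are illustrative assumptions):

    HadoopShims shims = HadoopShims.Factory.get();
    HadoopShims.DirectDecompressor d =
        shims.getDirectDecompressor(HadoopShims.DirectCompressionType.SNAPPY);
    if (d != null) {
      d.decompress(compressedBuf, uncompressedBuf);  // native path
    } else {
      // HadoopShims_2_2 returns null: use the generic codec path instead
    }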
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
new file mode 100644
index 0000000..3b9371d
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
+import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Shims for recent versions of Hadoop (2.3 and later)
+ */
+public class HadoopShimsCurrent implements HadoopShims {
+
+ private static class DirectDecompressWrapper implements DirectDecompressor {
+ private final org.apache.hadoop.io.compress.DirectDecompressor root;
+
+ DirectDecompressWrapper(org.apache.hadoop.io.compress.DirectDecompressor root) {
+ this.root = root;
+ }
+
+ public void decompress(ByteBuffer input,
+ ByteBuffer output) throws IOException {
+ root.decompress(input, output);
+ }
+ }
+
+ public DirectDecompressor getDirectDecompressor(
+ DirectCompressionType codec) {
+ switch (codec) {
+ case ZLIB:
+ return new DirectDecompressWrapper
+ (new ZlibDecompressor.ZlibDirectDecompressor());
+ case ZLIB_NOHEADER:
+ return new DirectDecompressWrapper
+ (new ZlibDecompressor.ZlibDirectDecompressor
+ (ZlibDecompressor.CompressionHeader.NO_HEADER, 0));
+ case SNAPPY:
+ return new DirectDecompressWrapper
+ (new SnappyDecompressor.SnappyDirectDecompressor());
+ default:
+ return null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
new file mode 100644
index 0000000..ac46836
--- /dev/null
+++ b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
+import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Shims for versions of Hadoop up to and including 2.2.x
+ */
+public class HadoopShims_2_2 implements HadoopShims {
+
+ public DirectDecompressor getDirectDecompressor(
+ DirectCompressionType codec) {
+ return null;
+ }
+}