You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2015/12/12 00:28:11 UTC
[14/16] hive git commit: HIVE-11890. Create ORC submodue. (omalley
reviewed by prasanthj)
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BinaryColumnStatistics.java b/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
new file mode 100644
index 0000000..19db98a
--- /dev/null
+++ b/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for binary columns, in addition to the generic counters
+ * inherited from {@link ColumnStatistics}.
+ *
+ * <p>{@link #getSum()} exposes a single {@code long} aggregate for the column.
+ * NOTE(review): presumably the total number of bytes across all non-null
+ * values -- confirm against the writer implementation.
+ */
+public interface BinaryColumnStatistics extends ColumnStatistics {
+ long getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BooleanColumnStatistics.java b/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
new file mode 100644
index 0000000..af08f06
--- /dev/null
+++ b/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for boolean columns, in addition to the generic counters
+ * inherited from {@link ColumnStatistics}.
+ *
+ * <p>As the names indicate, {@link #getFalseCount()} and
+ * {@link #getTrueCount()} report how many values were {@code false} and
+ * {@code true} respectively.
+ * NOTE(review): presumably NULLs are counted in neither -- confirm.
+ */
+public interface BooleanColumnStatistics extends ColumnStatistics {
+ long getFalseCount();
+
+ long getTrueCount();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/ColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/ColumnStatistics.java b/orc/src/java/org/apache/orc/ColumnStatistics.java
new file mode 100644
index 0000000..72d8fbf
--- /dev/null
+++ b/orc/src/java/org/apache/orc/ColumnStatistics.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+/**
+ * Statistics that are available for all types of columns. The type-specific
+ * statistics interfaces in this package (for example
+ * {@link IntegerColumnStatistics}) extend this one.
+ */
+public interface ColumnStatistics {
+ /**
+ * Get the number of values in this column. It will differ from the number
+ * of rows because of NULL values and repeated values.
+ * @return the number of values
+ */
+ long getNumberOfValues();
+
+ /**
+ * Returns true if there are nulls in the scope of column statistics.
+ * @return {@code true} if a null is present, otherwise {@code false}
+ */
+ boolean hasNull();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/CompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/CompressionCodec.java b/orc/src/java/org/apache/orc/CompressionCodec.java
new file mode 100644
index 0000000..3421969
--- /dev/null
+++ b/orc/src/java/org/apache/orc/CompressionCodec.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+import javax.annotation.Nullable;
+
+public interface CompressionCodec {
+
+ enum Modifier {
+ /* speed/compression tradeoffs */
+ FASTEST,
+ FAST,
+ DEFAULT,
+ /* data sensitivity modifiers */
+ TEXT,
+ BINARY
+ };
+
+ /**
+ * Compress the in buffer to the out buffer.
+ * @param in the bytes to compress
+ * @param out receives the compressed bytes
+ * @param overflow put any additional bytes here
+ * @return true if the output is smaller than input
+ * @throws IOException
+ */
+ boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow
+ ) throws IOException;
+
+ /**
+ * Decompress the in buffer to the out buffer.
+ * @param in the bytes to decompress
+ * @param out receives the decompressed bytes
+ * @throws IOException
+ */
+ void decompress(ByteBuffer in, ByteBuffer out) throws IOException;
+
+ /**
+ * Produce a modified compression codec if the underlying algorithm allows
+ * modification.
+ *
+ * This does not modify the current object, but returns a new object if
+ * modifications are possible. Returns the same object if no modifications
+ * are possible.
+ * @param modifiers compression modifiers; annotated {@code @Nullable}, so
+ * callers may pass {@code null}
+ * @return codec for use after optional modification
+ */
+ CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers);
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/CompressionKind.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/CompressionKind.java b/orc/src/java/org/apache/orc/CompressionKind.java
new file mode 100644
index 0000000..f684bef
--- /dev/null
+++ b/orc/src/java/org/apache/orc/CompressionKind.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+/**
+ * An enumeration that lists the generic compression algorithms that
+ * can be applied to ORC files: {@link #NONE}, {@link #ZLIB}, {@link #SNAPPY}
+ * and {@link #LZO}. The default configured through {@code OrcConf.COMPRESS}
+ * is the string {@code "ZLIB"}.
+ */
+public enum CompressionKind {
+ NONE, ZLIB, SNAPPY, LZO
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/DataReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DataReader.java b/orc/src/java/org/apache/orc/DataReader.java
new file mode 100644
index 0000000..3a5f854
--- /dev/null
+++ b/orc/src/java/org/apache/orc/DataReader.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+
+/** An abstract data reader that IO formats can use to read bytes from underlying storage. */
+public interface DataReader {
+
+ /** Opens the DataReader, making it ready to use. */
+ void open() throws IOException;
+
+ /** Closes the DataReader. */
+ void close() throws IOException;
+
+ /** Reads the data.
+ *
+ * Note that for the cases such as zero-copy read, caller must release the disk ranges
+ * produced after being done with them. Call {@link #isTrackingDiskRanges()} to find out
+ * if this is needed, and {@link #releaseBuffer(ByteBuffer)} to release a buffer.
+ * @param range List of disk ranges to read. Ranges with data will be ignored.
+ * @param baseOffset Base offset from the start of the file of the ranges in disk range list.
+ * @param doForceDirect Whether the data should be read into direct buffers.
+ * @return New or modified list of DiskRange-s, where all the ranges are filled with data.
+ */
+ DiskRangeList readFileData(
+ DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException;
+
+
+ /**
+ * Whether the user should release buffers created by readFileData. See readFileData javadoc.
+ * @return true if buffers produced by readFileData must be handed back via releaseBuffer
+ */
+ boolean isTrackingDiskRanges();
+
+ /**
+ * Releases buffers created by readFileData. See readFileData javadoc.
+ * @param toRelease The buffer to release.
+ */
+ void releaseBuffer(ByteBuffer toRelease);
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/DateColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DateColumnStatistics.java b/orc/src/java/org/apache/orc/DateColumnStatistics.java
new file mode 100644
index 0000000..cdd01af
--- /dev/null
+++ b/orc/src/java/org/apache/orc/DateColumnStatistics.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+import java.util.Date;
+
+/**
+ * Statistics for DATE columns. Both bounds are exposed as
+ * {@link java.util.Date} objects.
+ */
+public interface DateColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum value for the column.
+ * NOTE(review): the result for a column without values is not specified
+ * here; presumably {@code null} -- confirm.
+ * @return minimum value
+ */
+ Date getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * NOTE(review): the result for a column without values is not specified
+ * here; presumably {@code null} -- confirm.
+ * @return maximum value
+ */
+ Date getMaximum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DecimalColumnStatistics.java b/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
new file mode 100644
index 0000000..51b6d7d
--- /dev/null
+++ b/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for decimal columns. All three aggregates are exposed as
+ * {@code HiveDecimal} objects.
+ * NOTE(review): the results for a column without values are not specified
+ * here; presumably {@code null} -- confirm.
+ */
+public interface DecimalColumnStatistics extends ColumnStatistics {
+
+ /**
+ * Get the minimum value for the column.
+ * @return the minimum value
+ */
+ HiveDecimal getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * @return the maximum value
+ */
+ HiveDecimal getMaximum();
+
+ /**
+ * Get the sum of the values of the column.
+ * @return the sum
+ */
+ HiveDecimal getSum();
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DoubleColumnStatistics.java b/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
new file mode 100644
index 0000000..00c728f
--- /dev/null
+++ b/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for float and double columns. All aggregates are reported as
+ * {@code double}.
+ */
+public interface DoubleColumnStatistics extends ColumnStatistics {
+
+ /**
+ * Get the smallest value in the column. Only defined if
+ * {@link #getNumberOfValues()} is non-zero.
+ * @return the minimum
+ */
+ double getMinimum();
+
+ /**
+ * Get the largest value in the column. Only defined if
+ * {@link #getNumberOfValues()} is non-zero.
+ * @return the maximum
+ */
+ double getMaximum();
+
+ /**
+ * Get the sum of the values in the column.
+ * @return the sum
+ */
+ double getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/FileMetaInfo.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/FileMetaInfo.java b/orc/src/java/org/apache/orc/FileMetaInfo.java
new file mode 100644
index 0000000..d3cac3b
--- /dev/null
+++ b/orc/src/java/org/apache/orc/FileMetaInfo.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+/**
+ * Holder for the file metadata stored in the footer and postscript sections
+ * of the file that a Reader implementation needs. All state is exposed as
+ * public fields; every field except {@link #footerMetaAndPsBuffer} is final.
+ */
+public class FileMetaInfo {
+  /** The full footer buffer; {@code null} when built by the five-argument ctor. */
+  public ByteBuffer footerMetaAndPsBuffer;
+  public final String compressionType;
+  public final int bufferSize;
+  public final int metadataSize;
+  public final ByteBuffer footerBuffer;
+  /** The version list; {@code null} when built by the five-argument ctor. */
+  public final List<Integer> versionList;
+  public final OrcFile.WriterVersion writerVersion;
+
+  /** Ctor used when creating file info during init and when getting a new one. */
+  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
+                      ByteBuffer footerBuffer, List<Integer> versionList,
+                      OrcFile.WriterVersion writerVersion,
+                      ByteBuffer fullFooterBuffer) {
+    // Plain field captures; the references are stored as given, not copied.
+    this.footerMetaAndPsBuffer = fullFooterBuffer;
+    this.writerVersion = writerVersion;
+    this.versionList = versionList;
+    this.footerBuffer = footerBuffer;
+    this.metadataSize = metadataSize;
+    this.bufferSize = bufferSize;
+    this.compressionType = compressionType;
+  }
+
+  /** Ctor used when reading splits - no version list or full footer buffer. */
+  public FileMetaInfo(String compressionType, int bufferSize, int metadataSize,
+                      ByteBuffer footerBuffer, OrcFile.WriterVersion writerVersion) {
+    this(compressionType, bufferSize, metadataSize, footerBuffer,
+        (List<Integer>) null, writerVersion, (ByteBuffer) null);
+  }
+
+  /**
+   * Accessor for the {@link #writerVersion} field.
+   * @return the writer version passed to the constructor
+   */
+  public OrcFile.WriterVersion getWriterVersion() {
+    return this.writerVersion;
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/FileMetadata.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/FileMetadata.java b/orc/src/java/org/apache/orc/FileMetadata.java
new file mode 100644
index 0000000..d63bdcc
--- /dev/null
+++ b/orc/src/java/org/apache/orc/FileMetadata.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.util.List;
+
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+
+/**
+ * Cached file metadata. Right now, it caches everything; we don't have to store all the
+ * protobuf structs actually, we could just store what we need, but that would require that
+ * ORC stop depending on them too. Luckily, they shouldn't be very big.
+ *
+ * <p>The accessors are plain getters. {@link #getTypes()}, {@link #getStripeStats()} and
+ * {@link #getFileStats()} hand out the {@code OrcProto} protobuf messages directly, while
+ * {@link #getStripes()} and {@link #getCompressionKind()} return the ORC API types
+ * {@code StripeInformation} and {@code CompressionKind}. The remaining accessors return
+ * primitive counts, sizes and flags.
+ * NOTE(review): units (bytes vs. rows) and the meaning of {@link #getFileId()} are not
+ * stated in this file -- confirm against the implementation before relying on them.
+ */
+public interface FileMetadata {
+ boolean isOriginalFormat();
+
+ List<StripeInformation> getStripes();
+
+ CompressionKind getCompressionKind();
+
+ int getCompressionBufferSize();
+
+ int getRowIndexStride();
+
+ int getColumnCount();
+
+ int getFlattenedColumnCount();
+
+ long getFileId();
+
+ List<Integer> getVersionList();
+
+ int getMetadataSize();
+
+ int getWriterVersionNum();
+
+ List<OrcProto.Type> getTypes();
+
+ List<OrcProto.StripeStatistics> getStripeStats();
+
+ long getContentLength();
+
+ long getNumberOfRows();
+
+ List<OrcProto.ColumnStatistics> getFileStats();
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/IntegerColumnStatistics.java b/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
new file mode 100644
index 0000000..1a162ff
--- /dev/null
+++ b/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for all of the integer columns, such as byte, short, int, and
+ * long. All aggregates are reported as {@code long}.
+ */
+public interface IntegerColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the smallest value in the column. Only defined if
+ * {@link #getNumberOfValues()} is non-zero.
+ * @return the minimum
+ */
+ long getMinimum();
+
+ /**
+ * Get the largest value in the column. Only defined if
+ * {@link #getNumberOfValues()} is non-zero.
+ * @return the maximum
+ */
+ long getMaximum();
+
+ /**
+ * Is the sum defined? If the sum overflowed the counter this will be false.
+ * Check this before calling {@link #getSum()}.
+ * @return is the sum available
+ */
+ boolean isSumDefined();
+
+ /**
+ * Get the sum of the column. Only valid if {@link #isSumDefined()} returns
+ * true.
+ * @return the sum of the column
+ */
+ long getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/OrcConf.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcConf.java b/orc/src/java/org/apache/orc/OrcConf.java
new file mode 100644
index 0000000..6fcbb72
--- /dev/null
+++ b/orc/src/java/org/apache/orc/OrcConf.java
@@ -0,0 +1,191 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.Properties;
+
+/**
+ * Define the configuration properties that Orc understands.
+ *
+ * <p>Every property carries the ORC attribute name, the corresponding Hive
+ * configuration name, a default value and a description. The typed getters
+ * consult the table properties first, then the configuration under the ORC
+ * attribute name, then the configuration under the Hive name, and finally
+ * fall back to the default value.
+ */
+public enum OrcConf {
+  STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size",
+      64L * 1024 * 1024,
+      "Define the default ORC stripe size, in bytes."),
+  BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size",
+      256L * 1024 * 1024,
+      "Define the default file system block size for ORC files."),
+  ENABLE_INDEXES("orc.create.index", "orc.create.index", true,
+      "Should the ORC writer create indexes as part of the file."),
+  ROW_INDEX_STRIDE("orc.row.index.stride",
+      "hive.exec.orc.default.row.index.stride", 10000,
+      "Define the default ORC index stride in number of rows. (Stride is the\n"+
+          " number of rows an index entry represents.)"),
+  BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size",
+      256 * 1024, "Define the default ORC buffer size, in bytes."),
+  BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
+      true,
+      "Define whether stripes should be padded to the HDFS block boundaries."),
+  COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
+      "Define the default compression codec for ORC file"),
+  WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
+      "Define the version of the file to write. Possible values are 0.11 and\n"+
+          " 0.12. If this parameter is not defined, ORC will use the run\n" +
+          " length encoding (RLE) introduced in Hive 0.12."),
+  ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy",
+      "SPEED",
+      "Define the encoding strategy to use while writing data. Changing this\n"+
+          "will only affect the light weight encoding for integers. This\n" +
+          "flag will not change the compression level of higher level\n" +
+          "compression codec (like ZLIB)."),
+  COMPRESSION_STRATEGY("orc.compression.strategy",
+      "hive.exec.orc.compression.strategy", "SPEED",
+      "Define the compression strategy to use while writing data.\n" +
+          "This changes the compression level of higher level compression\n" +
+          "codec (like ZLIB)."),
+  BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance",
+      "hive.exec.orc.block.padding.tolerance", 0.05,
+      "Define the tolerance for block padding as a decimal fraction of\n" +
+          "stripe size (for example, the default value 0.05 is 5% of the\n" +
+          "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" +
+          "blocks, the default block padding tolerance of 5% will\n" +
+          "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" +
+          "In that case, if the available size within the block is more than\n"+
+          "3.2Mb, a new smaller stripe will be inserted to fit within that\n" +
+          "space. This will make sure that no stripe written will cross block\n" +
+          " boundaries and cause remote reads within a node local task."),
+  BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05,
+      "Define the default false positive probability for bloom filters."),
+  USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false,
+      "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
+  SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data",
+      false,
+      "If ORC reader encounters corrupt data, this value will be used to\n" +
+          "determine whether to skip the corrupt data or throw exception.\n" +
+          "The default behavior is to throw exception."),
+  MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5,
+      "Maximum fraction of heap that can be used by ORC file writers"),
+  DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold",
+      "hive.exec.orc.dictionary.key.size.threshold",
+      0.8,
+      "If the number of distinct keys in a dictionary is greater than this\n" +
+          "fraction of the total number of non-null rows, turn off \n" +
+          "dictionary encoding.  Use 1 to always use dictionary encoding."),
+  ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check",
+      "hive.orc.row.index.stride.dictionary.check",
+      true,
+      "If enabled dictionary check will happen after first row index stride\n" +
+          "(default 10000 rows) else dictionary check will happen before\n" +
+          "writing first stripe. In both cases, the decision to use\n" +
+          "dictionary or not will be retained thereafter."),
+  BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
+      "", "List of columns to create bloom filters for when writing.")
+  ;
+
+  private final String attribute;
+  private final String hiveConfName;
+  private final Object defaultValue;
+  private final String description;
+
+  /**
+   * @param attribute the ORC name of the property
+   * @param hiveConfName the Hive configuration name of the same property
+   * @param defaultValue the value used when neither name is set; its runtime
+   *                     type decides which typed getter may be used
+   * @param description human readable description of the property
+   */
+  OrcConf(String attribute,
+          String hiveConfName,
+          Object defaultValue,
+          String description) {
+    this.attribute = attribute;
+    this.hiveConfName = hiveConfName;
+    this.defaultValue = defaultValue;
+    this.description = description;
+  }
+
+  /** @return the ORC name of the property */
+  public String getAttribute() {
+    return attribute;
+  }
+
+  /** @return the Hive configuration name of the property */
+  public String getHiveConfName() {
+    return hiveConfName;
+  }
+
+  /** @return the untyped default value */
+  public Object getDefaultValue() {
+    return defaultValue;
+  }
+
+  /** @return the human readable description */
+  public String getDescription() {
+    return description;
+  }
+
+  /**
+   * Find the configured string for this property.
+   * @param tbl table properties, consulted first; may be null
+   * @param conf configuration, consulted under the ORC name and then under
+   *             the Hive name; may be null
+   * @return the first value found, or null if the property is not set
+   */
+  private String lookupValue(Properties tbl, Configuration conf) {
+    String result = null;
+    if (tbl != null) {
+      result = tbl.getProperty(attribute);
+    }
+    if (result == null && conf != null) {
+      result = conf.get(attribute);
+      if (result == null) {
+        result = conf.get(hiveConfName);
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Get the property as a long.
+   * @param tbl table properties; may be null
+   * @param conf configuration; may be null
+   * @return the configured value, or the default when it is not set
+   * @throws NumberFormatException if the configured string is not a long
+   * @throws ClassCastException if the default of this property is not a Number
+   */
+  public long getLong(Properties tbl, Configuration conf) {
+    String value = lookupValue(tbl, conf);
+    if (value != null) {
+      return Long.parseLong(value);
+    }
+    return ((Number) defaultValue).longValue();
+  }
+
+  /** Same as {@link #getLong(Properties, Configuration)} without table properties. */
+  public long getLong(Configuration conf) {
+    return getLong(null, conf);
+  }
+
+  /**
+   * Get the property as a string.
+   * @param tbl table properties; may be null
+   * @param conf configuration; may be null
+   * @return the configured value, or the default when it is not set
+   * @throws ClassCastException if the default of this property is not a String
+   */
+  public String getString(Properties tbl, Configuration conf) {
+    String value = lookupValue(tbl, conf);
+    return value == null ? (String) defaultValue : value;
+  }
+
+  /** Same as {@link #getString(Properties, Configuration)} without table properties. */
+  public String getString(Configuration conf) {
+    return getString(null, conf);
+  }
+
+  /**
+   * Get the property as a boolean. A configured string is parsed with
+   * {@link Boolean#parseBoolean(String)}, so anything other than "true"
+   * (ignoring case) yields false.
+   * @param tbl table properties; may be null
+   * @param conf configuration; may be null
+   * @return the configured value, or the default when it is not set
+   * @throws ClassCastException if the default of this property is not a Boolean
+   */
+  public boolean getBoolean(Properties tbl, Configuration conf) {
+    String value = lookupValue(tbl, conf);
+    if (value != null) {
+      return Boolean.parseBoolean(value);
+    }
+    return (Boolean) defaultValue;
+  }
+
+  /** Same as {@link #getBoolean(Properties, Configuration)} without table properties. */
+  public boolean getBoolean(Configuration conf) {
+    return getBoolean(null, conf);
+  }
+
+  /**
+   * Get the property as a double.
+   * @param tbl table properties; may be null
+   * @param conf configuration; may be null
+   * @return the configured value, or the default when it is not set
+   * @throws NumberFormatException if the configured string is not a double
+   * @throws ClassCastException if the default of this property is not a Number
+   */
+  public double getDouble(Properties tbl, Configuration conf) {
+    String value = lookupValue(tbl, conf);
+    if (value != null) {
+      return Double.parseDouble(value);
+    }
+    return ((Number) defaultValue).doubleValue();
+  }
+
+  /** Same as {@link #getDouble(Properties, Configuration)} without table properties. */
+  public double getDouble(Configuration conf) {
+    return getDouble(null, conf);
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java
new file mode 100644
index 0000000..9ea0b52
--- /dev/null
+++ b/orc/src/java/org/apache/orc/OrcFile.java
@@ -0,0 +1,514 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.orc.impl.MemoryManager;
+
+/**
+ * Contains factory methods to read or write ORC files.
+ */
+public class OrcFile {
+ public static final String MAGIC = "ORC";
+
+ /**
+  * The version number of the ORC file format, which exists so that
+  * non-forward compatible changes can be added in the future. To make the
+  * version numbers easier for users to understand, each one is the Hive
+  * release number that first wrote that version of ORC files.
+  *
+  * Thus, if you add new encodings or other non-forward compatible changes
+  * to ORC files, which prevent the old reader from reading the new format,
+  * you should update these values to reflect the next Hive release number.
+  * Non-forward compatible changes should never be added in patch releases.
+  *
+  * Do not make any changes that break backwards compatibility, which would
+  * prevent the new reader from reading ORC files generated by any released
+  * version of Hive.
+  */
+ public enum Version {
+   V_0_11("0.11", 0, 11),
+   V_0_12("0.12", 0, 12);
+
+   public static final Version CURRENT = V_0_12;
+
+   private final String name;
+   private final int major;
+   private final int minor;
+
+   Version(String name, int major, int minor) {
+     this.name = name;
+     this.major = major;
+     this.minor = minor;
+   }
+
+   /**
+    * Looks up a version by its human readable name, such as "0.12".
+    *
+    * @param name the name to search for
+    * @return the version whose name equals the argument
+    * @throws IllegalArgumentException if no version has that name
+    */
+   public static Version byName(String name) {
+     final Version[] known = values();
+     for (int i = 0; i < known.length; ++i) {
+       if (known[i].name.equals(name)) {
+         return known[i];
+       }
+     }
+     throw new IllegalArgumentException("Unknown ORC version " + name);
+   }
+
+   /**
+    * Get the human readable name for the version.
+    */
+   public String getName() {
+     return this.name;
+   }
+
+   /**
+    * Get the major version number.
+    */
+   public int getMajor() {
+     return this.major;
+   }
+
+   /**
+    * Get the minor version number.
+    */
+   public int getMinor() {
+     return this.minor;
+   }
+ }
+
+ /**
+  * Records the version of the writer in terms of which bugs have been fixed.
+  * For bugs in the writer where the old readers already read the new data
+  * correctly, bump this version instead of the Version.
+  */
+ public enum WriterVersion {
+   ORIGINAL(0),
+   HIVE_8732(1), // corrupted stripe/file maximum column statistics
+   HIVE_4243(2), // use real column names from Hive tables
+   // Don't use any magic numbers here except for the below:
+   FUTURE(Integer.MAX_VALUE); // a version from a future writer
+
+   private final int id;
+
+   /**
+    * Get the numeric id of this writer version.
+    */
+   public int getId() {
+     return id;
+   }
+
+   WriterVersion(int id) {
+     this.id = id;
+   }
+
+   // Lookup table from id to constant, covering every id except FUTURE's.
+   private static final WriterVersion[] values;
+   static {
+     // Assumes few non-negative values close to zero.
+     int max = Integer.MIN_VALUE;
+     for (WriterVersion v : WriterVersion.values()) {
+       if (v.id < 0) throw new AssertionError();
+       if (v.id > max && FUTURE.id != v.id) {
+         max = v.id;
+       }
+     }
+     values = new WriterVersion[max + 1];
+     for (WriterVersion v : WriterVersion.values()) {
+       if (v.id < values.length) {
+         values[v.id] = v;
+       }
+     }
+   }
+
+   /**
+    * Maps a numeric id to its writer version.
+    *
+    * @param val the writer version id
+    * @return the matching constant, or FUTURE when the id is larger than
+    *         every id in the lookup table (this includes FUTURE's own id)
+    * @throws ArrayIndexOutOfBoundsException if val is negative
+    */
+   public static WriterVersion from(int val) {
+     // An id past the end of the table was not produced by any version
+     // listed here; report it as FUTURE instead of indexing out of bounds.
+     if (val >= values.length) {
+       return FUTURE;
+     }
+     return values[val];
+   }
+ }
+
+ /**
+ * The choices accepted by WriterOptions.encodingStrategy. The WriterOptions
+ * constructor parses the initial choice by constant name from
+ * OrcConf.ENCODING_STRATEGY.
+ */
+ public enum EncodingStrategy {
+ SPEED, COMPRESSION
+ }
+
+ /**
+ * The choices returned by WriterOptions.getCompressionStrategy. The
+ * WriterOptions constructor parses the choice by constant name from
+ * OrcConf.COMPRESSION_STRATEGY.
+ */
+ public enum CompressionStrategy {
+ SPEED, COMPRESSION
+ }
+
+ // unused
+ protected OrcFile() {}
+
+ /**
+  * Options for opening an ORC file for reading. Every setter returns this
+  * object so that calls can be chained.
+  */
+ public static class ReaderOptions {
+   private final Configuration conf;
+   private FileSystem filesystem;
+   private FileMetaInfo fileMetaInfo; // TODO: this comes from some place.
+   private FileMetadata fullFileMetadata; // Propagate from LLAP cache.
+   private long maxLength = Long.MAX_VALUE;
+
+   /**
+    * Creates options backed by the given configuration. The maximum length
+    * starts at Long.MAX_VALUE and the other fields start as null.
+    */
+   public ReaderOptions(Configuration conf) {
+     this.conf = conf;
+   }
+
+   /** Sets the file meta info and returns this. */
+   public ReaderOptions fileMetaInfo(FileMetaInfo info) {
+     this.fileMetaInfo = info;
+     return this;
+   }
+
+   /** Sets the file system and returns this. */
+   public ReaderOptions filesystem(FileSystem fs) {
+     this.filesystem = fs;
+     return this;
+   }
+
+   /** Sets the maximum length and returns this. */
+   public ReaderOptions maxLength(long val) {
+     this.maxLength = val;
+     return this;
+   }
+
+   /** Sets the full file metadata and returns this. */
+   public ReaderOptions fileMetadata(FileMetadata metadata) {
+     this.fullFileMetadata = metadata;
+     return this;
+   }
+
+   /** Returns the configuration given to the constructor. */
+   public Configuration getConfiguration() {
+     return this.conf;
+   }
+
+   /** Returns the file system, or null if none was set. */
+   public FileSystem getFilesystem() {
+     return this.filesystem;
+   }
+
+   /** Returns the file meta info, or null if none was set. */
+   public FileMetaInfo getFileMetaInfo() {
+     return this.fileMetaInfo;
+   }
+
+   /** Returns the maximum length. */
+   public long getMaxLength() {
+     return this.maxLength;
+   }
+
+   /** Returns the full file metadata, or null if none was set. */
+   public FileMetadata getFileMetadata() {
+     return this.fullFileMetadata;
+   }
+ }
+
+ /**
+  * Create a set of reader options based on a configuration.
+  * @param conf the configuration to use
+  * @return a ReaderOptions object that can be modified
+  */
+ public static ReaderOptions readerOptions(Configuration conf) {
+   final ReaderOptions options = new ReaderOptions(conf);
+   return options;
+ }
+
+ /**
+ * The context object handed to each WriterCallback method; it exposes a
+ * Writer.
+ */
+ public interface WriterContext {
+ /**
+ * @return the writer exposed by this context
+ */
+ Writer getWriter();
+ }
+
+ /**
+ * A listener that can be registered through WriterOptions.callback.
+ * NOTE(review): judging by the method names, the hooks run just before a
+ * stripe and just before the file footer are written; confirm against the
+ * writer implementation.
+ */
+ public interface WriterCallback {
+ /**
+ * @param context gives access to the writer
+ * @throws IOException if the callback fails
+ */
+ void preStripeWrite(WriterContext context) throws IOException;
+ /**
+ * @param context gives access to the writer
+ * @throws IOException if the callback fails
+ */
+ void preFooterWrite(WriterContext context) throws IOException;
+ }
+
+ /**
+ * Options for creating ORC file writers.
+ */
+ public static class WriterOptions {
+ private final Configuration configuration;
+ private FileSystem fileSystemValue = null;
+ private TypeDescription schema = null;
+ private long stripeSizeValue;
+ private long blockSizeValue;
+ private int rowIndexStrideValue;
+ private int bufferSizeValue;
+ private boolean blockPaddingValue;
+ private CompressionKind compressValue;
+ private MemoryManager memoryManagerValue;
+ private Version versionValue;
+ private WriterCallback callback;
+ private EncodingStrategy encodingStrategy;
+ private CompressionStrategy compressionStrategy;
+ private double paddingTolerance;
+ private String bloomFilterColumns;
+ private double bloomFilterFpp;
+
+ /**
+  * Initializes every option from the table properties and the
+  * configuration through the matching OrcConf entry. The fluent setters of
+  * this class can override individual values afterwards.
+  *
+  * @param tableProperties the table properties, consulted first; may be null
+  * @param conf the configuration, consulted second
+  * @throws IllegalArgumentException if the configured file format version
+  *         or a configured strategy name does not match a known constant
+  */
+ protected WriterOptions(Properties tableProperties, Configuration conf) {
+   configuration = conf;
+   memoryManagerValue = getStaticMemoryManager(conf);
+   stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
+   blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
+
+   // These two settings are read as longs and narrowed to int.
+   final long stride = OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
+   rowIndexStrideValue = (int) stride;
+   final long buffer = OrcConf.BUFFER_SIZE.getLong(tableProperties, conf);
+   bufferSizeValue = (int) buffer;
+
+   blockPaddingValue = OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
+
+   // The remaining enum-valued settings are stored as constant names.
+   final String compressName = OrcConf.COMPRESS.getString(tableProperties, conf);
+   compressValue = CompressionKind.valueOf(compressName);
+   versionValue =
+       Version.byName(OrcConf.WRITE_FORMAT.getString(tableProperties, conf));
+   encodingStrategy = EncodingStrategy.valueOf(
+       OrcConf.ENCODING_STRATEGY.getString(tableProperties, conf));
+   compressionStrategy = CompressionStrategy.valueOf(
+       OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf));
+
+   paddingTolerance =
+       OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);
+   bloomFilterColumns =
+       OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties, conf);
+   bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, conf);
+ }
+
+ /**
+ * Provide the filesystem for the path, if the client has it available.
+ * If it is not provided, it will be found from the path.
+ */
+ public WriterOptions fileSystem(FileSystem value) {
+ fileSystemValue = value;
+ return this;
+ }
+
+ /**
+ * Set the stripe size for the file. The writer stores the contents of the
+ * stripe in memory until this memory limit is reached and the stripe
+ * is flushed to the HDFS file and the next stripe started.
+ */
+ public WriterOptions stripeSize(long value) {
+ stripeSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Set the file system block size for the file. For optimal performance,
+ * set the block size to be multiple factors of stripe size.
+ */
+ public WriterOptions blockSize(long value) {
+ blockSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Set the distance between entries in the row index. The minimum value is
+ * 1000 to prevent the index from overwhelming the data. If the stride is
+ * set to 0, no indexes will be included in the file.
+ */
+ public WriterOptions rowIndexStride(int value) {
+ rowIndexStrideValue = value;
+ return this;
+ }
+
+ /**
+ * The size of the memory buffers used for compressing and storing the
+ * stripe in memory.
+ */
+ public WriterOptions bufferSize(int value) {
+ bufferSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Sets whether the HDFS blocks are padded to prevent stripes from
+ * straddling blocks. Padding improves locality and thus the speed of
+ * reading, but costs space.
+ */
+ public WriterOptions blockPadding(boolean value) {
+ blockPaddingValue = value;
+ return this;
+ }
+
+ /**
+ * Sets the encoding strategy that is used to encode the data.
+ */
+ public WriterOptions encodingStrategy(EncodingStrategy strategy) {
+ encodingStrategy = strategy;
+ return this;
+ }
+
+ /**
+ * Sets the tolerance for block padding as a percentage of stripe size.
+ */
+ public WriterOptions paddingTolerance(double value) {
+ paddingTolerance = value;
+ return this;
+ }
+
+ /**
+ * Comma separated values of column names for which bloom filter is to be created.
+ */
+ public WriterOptions bloomFilterColumns(String columns) {
+ bloomFilterColumns = columns;
+ return this;
+ }
+
+ /**
+ * Specify the false positive probability for bloom filter.
+ * @param fpp - false positive probability
+ * @return this
+ */
+ public WriterOptions bloomFilterFpp(double fpp) {
+ bloomFilterFpp = fpp;
+ return this;
+ }
+
+ /**
+ * Sets the generic compression that is used to compress the data.
+ */
+ public WriterOptions compress(CompressionKind value) {
+ compressValue = value;
+ return this;
+ }
+
+ /**
+ * Set the schema for the file. This is a required parameter.
+ * @param schema the schema for the file.
+ * @return this
+ */
+ public WriterOptions setSchema(TypeDescription schema) {
+ this.schema = schema;
+ return this;
+ }
+
+ /**
+ * Sets the version of the file that will be written.
+ */
+ public WriterOptions version(Version value) {
+ versionValue = value;
+ return this;
+ }
+
+ /**
+ * Add a listener for when the stripe and file are about to be closed.
+ * @param callback the object to be called when the stripe is closed
+ * @return this
+ */
+ public WriterOptions callback(WriterCallback callback) {
+ this.callback = callback;
+ return this;
+ }
+
+ /**
+ * Sets the memory manager, replacing the one the constructor obtained.
+ * The method is protected, so it is visible to this package and to
+ * subclasses rather than being a public option.
+ * @param value the memory manager to use
+ * @return this
+ */
+ protected WriterOptions memory(MemoryManager value) {
+ memoryManagerValue = value;
+ return this;
+ }
+
+ public boolean getBlockPadding() {
+ return blockPaddingValue;
+ }
+
+ public long getBlockSize() {
+ return blockSizeValue;
+ }
+
+ public String getBloomFilterColumns() {
+ return bloomFilterColumns;
+ }
+
+ public FileSystem getFileSystem() {
+ return fileSystemValue;
+ }
+
+ public Configuration getConfiguration() {
+ return configuration;
+ }
+
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ public long getStripeSize() {
+ return stripeSizeValue;
+ }
+
+ public CompressionKind getCompress() {
+ return compressValue;
+ }
+
+ public WriterCallback getCallback() {
+ return callback;
+ }
+
+ public Version getVersion() {
+ return versionValue;
+ }
+
+ public MemoryManager getMemoryManager() {
+ return memoryManagerValue;
+ }
+
+ public int getBufferSize() {
+ return bufferSizeValue;
+ }
+
+ public int getRowIndexStride() {
+ return rowIndexStrideValue;
+ }
+
+ public CompressionStrategy getCompressionStrategy() {
+ return compressionStrategy;
+ }
+
+ public EncodingStrategy getEncodingStrategy() {
+ return encodingStrategy;
+ }
+
+ public double getPaddingTolerance() {
+ return paddingTolerance;
+ }
+
+ public double getBloomFilterFpp() {
+ return bloomFilterFpp;
+ }
+ }
+
+ /**
+  * Create a set of writer options from a configuration alone, with no
+  * table properties.
+  * @param conf the configuration to use for values
+  * @return a WriterOptions object that can be modified
+  */
+ public static WriterOptions writerOptions(Configuration conf) {
+   final Properties noTableProperties = null;
+   return new WriterOptions(noTableProperties, conf);
+ }
+
+ /**
+  * Create a set of writer options from table properties and a
+  * configuration. The WriterOptions constructor passes both to OrcConf,
+  * which consults the table properties first.
+  * @param tableProperties the properties of the table
+  * @param conf the configuration of the query
+  * @return a WriterOptions object that can be modified
+  */
+ public static WriterOptions writerOptions(Properties tableProperties,
+                                           Configuration conf) {
+   final WriterOptions options = new WriterOptions(tableProperties, conf);
+   return options;
+ }
+
+ // Holder for one MemoryManager per thread; created lazily by
+ // getStaticMemoryManager.
+ private static ThreadLocal<MemoryManager> memoryManager = null;
+
+ /**
+ * Returns the calling thread's MemoryManager, creating the shared
+ * ThreadLocal holder on the first call.
+ *
+ * NOTE(review): the holder's initialValue() captures the conf passed to the
+ * first call of this method. MemoryManagers created later for other threads
+ * are therefore built from that first conf, not from the conf those callers
+ * pass in.
+ *
+ * @param conf the configuration captured when the holder is first created
+ * @return the MemoryManager held for the current thread
+ */
+ private static synchronized MemoryManager getStaticMemoryManager(
+ final Configuration conf) {
+ if (memoryManager == null) {
+ memoryManager = new ThreadLocal<MemoryManager>() {
+ @Override
+ protected MemoryManager initialValue() {
+ return new MemoryManager(conf);
+ }
+ };
+ }
+ return memoryManager.get();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcUtils.java b/orc/src/java/org/apache/orc/OrcUtils.java
new file mode 100644
index 0000000..2e93254
--- /dev/null
+++ b/orc/src/java/org/apache/orc/OrcUtils.java
@@ -0,0 +1,452 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+
+public class OrcUtils {
+
+ /**
+  * Builds the include mask for a comma separated list of top-level column
+  * names. The mask has one slot per flattened column id, from 0 up to and
+  * including schema.getMaximumId().
+  *
+  * For example, with a struct schema of columns a,b,c,d where c is
+  * list&lt;string&gt; and the others are primitives, the ids are: 0 for the
+  * root struct, 1 and 2 for a and b, 3 and 4 for c (the list and its
+  * element), and 5 for d. Selecting "a,b,c" yields
+  * [false, true, true, true, true, false].
+  *
+  * The special value "*" selects every slot, including the root. Names are
+  * matched ignoring case; names that match no field are ignored. If
+  * selectedColumns is null or the schema is not a struct, no slot is set.
+  *
+  * @param selectedColumns - comma separated list of selected column names
+  * @param schema - object schema
+  * @return - boolean array with true value set for the specified column names
+  */
+ public static boolean[] includeColumns(String selectedColumns,
+                                        TypeDescription schema) {
+   final boolean[] included = new boolean[schema.getMaximumId() + 1];
+   if ("*".equals(selectedColumns)) {
+     Arrays.fill(included, true);
+     return included;
+   }
+   if (selectedColumns == null ||
+       schema.getCategory() != TypeDescription.Category.STRUCT) {
+     return included;
+   }
+   final List<String> fieldNames = schema.getFieldNames();
+   final List<TypeDescription> fields = schema.getChildren();
+   final String[] requested = selectedColumns.split(",");
+   for (int r = 0; r < requested.length; ++r) {
+     final TypeDescription match = findColumn(requested[r], fieldNames, fields);
+     if (match == null) {
+       continue;
+     }
+     // Mark the column itself and every nested column beneath it.
+     int id = match.getId();
+     while (id <= match.getMaximumId()) {
+       included[id] = true;
+       ++id;
+     }
+   }
+   return included;
+ }
+
+ /**
+  * Finds the field type whose name matches the given column name, ignoring
+  * case. The two lists are parallel: fields.get(i) is the type of
+  * fieldNames.get(i).
+  *
+  * @param columnName the name to look for
+  * @param fieldNames the field names of the struct
+  * @param fields the field types of the struct
+  * @return the type of the first matching field, or null if none matches
+  */
+ private static TypeDescription findColumn(String columnName,
+                                           List<String> fieldNames,
+                                           List<TypeDescription> fields) {
+   for (int position = 0; position < fieldNames.size(); ++position) {
+     if (fieldNames.get(position).equalsIgnoreCase(columnName)) {
+       return fields.get(position);
+     }
+   }
+   return null;
+ }
+
+ /**
+  * Flattens a type description into a new list of protobuf types, with the
+  * given type first and its descendants after it.
+  *
+  * @param typeDescr the root type to convert
+  * @return a new mutable list holding the converted types
+  */
+ public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
+   final List<OrcProto.Type> flattened = new ArrayList<OrcProto.Type>();
+   appendOrcTypes(flattened, typeDescr);
+   return flattened;
+ }
+
+ /**
+ * Converts typeDescr to a protobuf type, appends it to result, and then
+ * recurses into its children in order, so each parent precedes its
+ * descendants in the list. Subtype references are the ids reported by
+ * TypeDescription.getId() on the children.
+ *
+ * @param result the list to append to
+ * @param typeDescr the type to convert
+ * @throws IllegalArgumentException if the category is not handled here
+ */
+ private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
+ OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+ List<TypeDescription> children = typeDescr.getChildren();
+ switch (typeDescr.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ // CHAR and VARCHAR also record their maximum length.
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ // DECIMAL also records its precision and scale.
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(typeDescr.getPrecision());
+ type.setScale(typeDescr.getScale());
+ break;
+ // LIST references only its first child as a subtype.
+ case LIST:
+ type.setKind(OrcProto.Type.Kind.LIST);
+ type.addSubtypes(children.get(0).getId());
+ break;
+ case MAP:
+ type.setKind(OrcProto.Type.Kind.MAP);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ // STRUCT records the field names in addition to the child ids.
+ case STRUCT:
+ type.setKind(OrcProto.Type.Kind.STRUCT);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ for(String field: typeDescr.getFieldNames()) {
+ type.addFieldNames(field);
+ }
+ break;
+ case UNION:
+ type.setKind(OrcProto.Type.Kind.UNION);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " +
+ typeDescr.getCategory());
+ }
+ result.add(type.build());
+ // Children are appended after the parent. The null check suggests
+ // getChildren() returns null for types without children - TODO confirm.
+ if (children != null) {
+ for(TypeDescription child: children) {
+ appendOrcTypes(result, child);
+ }
+ }
+ }
+
+ /**
+ * NOTE: This method ignores the subtype numbers in the TypeDescription and rebuilds the
+ * subtype numbers based on the length of the result list being appended.
+ *
+ * @param result
+ * @param typeDescr
+ */
+ public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+ TypeDescription typeDescr) {
+
+ int subtype = result.size();
+ OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+ boolean needsAdd = true;
+ List<TypeDescription> children = typeDescr.getChildren();
+ switch (typeDescr.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(typeDescr.getPrecision());
+ type.setScale(typeDescr.getScale());
+ break;
+ case LIST:
+ type.setKind(OrcProto.Type.Kind.LIST);
+ type.addSubtypes(++subtype);
+ result.add(type.build());
+ needsAdd = false;
+ appendOrcTypesRebuildSubtypes(result, children.get(0));
+ break;
+ case MAP:
+ {
+ // Make room for MAP type.
+ result.add(null);
+
+ // Add MAP type pair in order to determine their subtype values.
+ appendOrcTypesRebuildSubtypes(result, children.get(0));
+ int subtype2 = result.size();
+ appendOrcTypesRebuildSubtypes(result, children.get(1));
+ type.setKind(OrcProto.Type.Kind.MAP);
+ type.addSubtypes(subtype + 1);
+ type.addSubtypes(subtype2);
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ case STRUCT:
+ {
+ List<String> fieldNames = typeDescr.getFieldNames();
+
+ // Make room for STRUCT type.
+ result.add(null);
+
+ List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+ for(TypeDescription child: children) {
+ int fieldSubtype = result.size();
+ fieldSubtypes.add(fieldSubtype);
+ appendOrcTypesRebuildSubtypes(result, child);
+ }
+
+ type.setKind(OrcProto.Type.Kind.STRUCT);
+
+ for (int i = 0 ; i < fieldNames.size(); i++) {
+ type.addSubtypes(fieldSubtypes.get(i));
+ type.addFieldNames(fieldNames.get(i));
+ }
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ case UNION:
+ {
+ // Make room for UNION type.
+ result.add(null);
+
+ List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
+ for(TypeDescription child: children) {
+ int unionSubtype = result.size();
+ unionSubtypes.add(unionSubtype);
+ appendOrcTypesRebuildSubtypes(result, child);
+ }
+
+ type.setKind(OrcProto.Type.Kind.UNION);
+ for (int i = 0 ; i < children.size(); i++) {
+ type.addSubtypes(unionSubtypes.get(i));
+ }
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
+ }
+ if (needsAdd) {
+ result.add(type.build());
+ }
+ }
+
+ /**
+ * NOTE: This method ignores the subtype numbers in the OrcProto.Type and rebuilds the
+ * subtype numbers based on the length of the result list being appended.
+ *
+ * @param result
+ * @param types
+ * @param columnId
+ */
+ public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+ List<OrcProto.Type> types, int columnId) {
+
+ OrcProto.Type oldType = types.get(columnId++);
+
+ int subtype = result.size();
+ OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
+ boolean needsAdd = true;
+ switch (oldType.getKind()) {
+ case BOOLEAN:
+ builder.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ builder.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ builder.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ builder.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ builder.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ builder.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ builder.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ builder.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ builder.setKind(OrcProto.Type.Kind.CHAR);
+ builder.setMaximumLength(oldType.getMaximumLength());
+ break;
+ case VARCHAR:
+ builder.setKind(OrcProto.Type.Kind.VARCHAR);
+ builder.setMaximumLength(oldType.getMaximumLength());
+ break;
+ case BINARY:
+ builder.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ builder.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ builder.setKind(OrcProto.Type.Kind.DECIMAL);
+ builder.setPrecision(oldType.getPrecision());
+ builder.setScale(oldType.getScale());
+ break;
+ case LIST:
+ builder.setKind(OrcProto.Type.Kind.LIST);
+ builder.addSubtypes(++subtype);
+ result.add(builder.build());
+ needsAdd = false;
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ break;
+ case MAP:
+ {
+ // Make room for MAP type.
+ result.add(null);
+
+ // Add MAP type pair in order to determine their subtype values.
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ int subtype2 = result.size();
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ builder.setKind(OrcProto.Type.Kind.MAP);
+ builder.addSubtypes(subtype + 1);
+ builder.addSubtypes(subtype2);
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ case STRUCT:
+ {
+ List<String> fieldNames = oldType.getFieldNamesList();
+
+ // Make room for STRUCT type.
+ result.add(null);
+
+ List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+ for(int i = 0 ; i < fieldNames.size(); i++) {
+ int fieldSubtype = result.size();
+ fieldSubtypes.add(fieldSubtype);
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ }
+
+ builder.setKind(OrcProto.Type.Kind.STRUCT);
+
+ for (int i = 0 ; i < fieldNames.size(); i++) {
+ builder.addSubtypes(fieldSubtypes.get(i));
+ builder.addFieldNames(fieldNames.get(i));
+ }
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ case UNION:
+ {
+ int subtypeCount = oldType.getSubtypesCount();
+
+ // Make room for UNION type.
+ result.add(null);
+
+ List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
+ for(int i = 0 ; i < subtypeCount; i++) {
+ int unionSubtype = result.size();
+ unionSubtypes.add(unionSubtype);
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ }
+
+ builder.setKind(OrcProto.Type.Kind.UNION);
+ for (int i = 0 ; i < subtypeCount; i++) {
+ builder.addSubtypes(unionSubtypes.get(i));
+ }
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
+ }
+ if (needsAdd) {
+ result.add(builder.build());
+ }
+ return columnId;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/Reader.java b/orc/src/java/org/apache/orc/Reader.java
new file mode 100644
index 0000000..be722b5
--- /dev/null
+++ b/orc/src/java/org/apache/orc/Reader.java
@@ -0,0 +1,363 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.orc.impl.MetadataReader;
+
+/**
+ * The interface for reading ORC files.
+ *
+ * One Reader can support multiple concurrent RecordReader.
+ */
+public interface Reader {
+
+ /**
+ * Get the number of rows in the file.
+ * @return the number of rows
+ */
+ long getNumberOfRows();
+
+ /**
+ * Get the deserialized data size of the file
+ * @return raw data size
+ */
+ long getRawDataSize();
+
+ /**
+ * Get the deserialized data size of the specified columns
+ * @param colNames
+ * @return raw data size of columns
+ */
+ long getRawDataSizeOfColumns(List<String> colNames);
+
+ /**
+ * Get the deserialized data size of the specified columns ids
+ * @param colIds - internal column id (check orcfiledump for column ids)
+ * @return raw data size of columns
+ */
+ long getRawDataSizeFromColIndices(List<Integer> colIds);
+
+ /**
+ * Get the user metadata keys.
+ * @return the set of metadata keys
+ */
+ List<String> getMetadataKeys();
+
+ /**
+ * Get a user metadata value.
+ * @param key a key given by the user
+ * @return the bytes associated with the given key
+ */
+ ByteBuffer getMetadataValue(String key);
+
+ /**
+ * Did the user set the given metadata value.
+ * @param key the key to check
+ * @return true if the metadata value was set
+ */
+ boolean hasMetadataValue(String key);
+
+ /**
+ * Get the compression kind.
+ * @return the kind of compression in the file
+ */
+ CompressionKind getCompressionKind();
+
+ /**
+ * Get the buffer size for the compression.
+ * @return number of bytes to buffer for the compression codec.
+ */
+ int getCompressionSize();
+
+ /**
+ * Get the number of rows per entry in the row index.
+ * @return the number of rows per an entry in the row index or 0 if there
+ * is no row index.
+ */
+ int getRowIndexStride();
+
+ /**
+ * Get the list of stripes.
+ * @return the information about the stripes in order
+ */
+ List<StripeInformation> getStripes();
+
+ /**
+ * Get the length of the file.
+ * @return the number of bytes in the file
+ */
+ long getContentLength();
+
+ /**
+ * Get the statistics about the columns in the file.
+ * @return the information about the column
+ */
+ ColumnStatistics[] getStatistics();
+
+ /**
+ * Get the list of types contained in the file. The root type is the first
+ * type in the list.
+ * @return the list of flattened types
+ */
+ List<OrcProto.Type> getTypes();
+
+ /**
+ * Get the file format version.
+ */
+ OrcFile.Version getFileVersion();
+
+ /**
+ * Get the version of the writer of this file.
+ */
+ OrcFile.WriterVersion getWriterVersion();
+
+ /**
+  * Options for creating a RecordReader.
+  * <p>
+  * Every setter returns {@code this} so that calls can be chained. Arrays
+  * handed to the setters are stored by reference (they are not copied), and
+  * the getters return those same references.
+  */
+ public static class Options {
+   private boolean[] include;
+   private long offset = 0;
+   private long length = Long.MAX_VALUE;
+   private SearchArgument sarg = null;
+   private String[] columnNames = null;
+   private Boolean useZeroCopy = null;
+   private Boolean skipCorruptRecords = null;
+   private TypeDescription schema = null;
+
+   /**
+    * Set the list of columns to read.
+    * @param include a list of columns to read
+    * @return this
+    */
+   public Options include(boolean[] include) {
+     this.include = include;
+     return this;
+   }
+
+   /**
+    * Set the range of bytes to read.
+    * @param offset the starting byte offset
+    * @param length the number of bytes to read
+    * @return this
+    */
+   public Options range(long offset, long length) {
+     this.offset = offset;
+     this.length = length;
+     return this;
+   }
+
+   /**
+    * Set the schema on read type description.
+    * @param schema the type description to read with
+    * @return this
+    */
+   public Options schema(TypeDescription schema) {
+     this.schema = schema;
+     return this;
+   }
+
+   /**
+    * Set search argument for predicate push down.
+    * @param sarg the search argument
+    * @param columnNames the column names to use with the search argument
+    * @return this
+    */
+   public Options searchArgument(SearchArgument sarg, String[] columnNames) {
+     this.sarg = sarg;
+     this.columnNames = columnNames;
+     return this;
+   }
+
+   /**
+    * Set whether to use zero copy from HDFS.
+    * @param value the new zero copy flag
+    * @return this
+    */
+   public Options useZeroCopy(boolean value) {
+     this.useZeroCopy = value;
+     return this;
+   }
+
+   /**
+    * Set whether to skip corrupt records.
+    * @param value the new skip corrupt records flag
+    * @return this
+    */
+   public Options skipCorruptRecords(boolean value) {
+     this.skipCorruptRecords = value;
+     return this;
+   }
+
+   /**
+    * @return the include array given to {@link #include(boolean[])}, or
+    *         null if it was never set
+    */
+   public boolean[] getInclude() {
+     return include;
+   }
+
+   /**
+    * @return the starting byte offset (0 unless changed by range)
+    */
+   public long getOffset() {
+     return offset;
+   }
+
+   /**
+    * @return the number of bytes to read (Long.MAX_VALUE unless changed by
+    *         range)
+    */
+   public long getLength() {
+     return length;
+   }
+
+   /**
+    * @return the schema on read type description, or null if it was never set
+    */
+   public TypeDescription getSchema() {
+     return schema;
+   }
+
+   /**
+    * @return the search argument, or null if it was never set
+    */
+   public SearchArgument getSearchArgument() {
+     return sarg;
+   }
+
+   /**
+    * @return the column names given together with the search argument, or
+    *         null if none were given
+    */
+   public String[] getColumnNames() {
+     return columnNames;
+   }
+
+   /**
+    * Get the end of the byte range, i.e. offset + length.
+    * @return offset + length, or Long.MAX_VALUE when that sum is negative
+    */
+   public long getMaxOffset() {
+     long result = offset + length;
+     // With the default length of Long.MAX_VALUE any positive offset wraps
+     // the sum around to a negative number; clamp that to "end of file".
+     if (result < 0) {
+       result = Long.MAX_VALUE;
+     }
+     return result;
+   }
+
+   /**
+    * @return the zero copy flag, or null if it was never set
+    */
+   public Boolean getUseZeroCopy() {
+     return useZeroCopy;
+   }
+
+   /**
+    * @return the skip corrupt records flag, or null if it was never set
+    */
+   public Boolean getSkipCorruptRecords() {
+     return skipCorruptRecords;
+   }
+
+   /**
+    * Create a new Options object carrying the same settings as this one.
+    * The copy is shallow: the include and column name arrays are shared
+    * with the original rather than duplicated.
+    * @return a new Options with the same field values
+    */
+   @Override
+   public Options clone() {
+     Options result = new Options();
+     result.include = include;
+     result.offset = offset;
+     result.length = length;
+     result.sarg = sarg;
+     result.schema = schema;
+     result.columnNames = columnNames;
+     result.useZeroCopy = useZeroCopy;
+     result.skipCorruptRecords = skipCorruptRecords;
+     return result;
+   }
+
+   /**
+    * Render the include list, byte range, search argument (with its column
+    * names) and schema. The search argument and schema sections are only
+    * present when the corresponding value is set.
+    * @return a human readable description of these options
+    */
+   @Override
+   public String toString() {
+     StringBuilder buffer = new StringBuilder();
+     buffer.append("{include: ");
+     if (include == null) {
+       buffer.append("null");
+     } else {
+       buffer.append("[");
+       for (int i = 0; i < include.length; ++i) {
+         if (i != 0) {
+           buffer.append(", ");
+         }
+         buffer.append(include[i]);
+       }
+       buffer.append("]");
+     }
+     buffer.append(", offset: ");
+     buffer.append(offset);
+     buffer.append(", length: ");
+     buffer.append(length);
+     if (sarg != null) {
+       buffer.append(", sarg: ");
+       buffer.append(sarg.toString());
+       buffer.append(", columns: ");
+       // searchArgument() accepts a null column name array, so guard against
+       // it here instead of failing with a NullPointerException.
+       if (columnNames == null) {
+         buffer.append("null");
+       } else {
+         buffer.append("[");
+         for (int i = 0; i < columnNames.length; ++i) {
+           if (i != 0) {
+             buffer.append(", ");
+           }
+           buffer.append("'");
+           buffer.append(columnNames[i]);
+           buffer.append("'");
+         }
+         buffer.append("]");
+       }
+     }
+     if (schema != null) {
+       buffer.append(", schema: ");
+       schema.printToBuffer(buffer);
+     }
+     buffer.append("}");
+     return buffer.toString();
+   }
+ }
+
+ /**
+ * Create a RecordReader that reads everything with the default options.
+ * @return a new RecordReader
+ * @throws IOException if an I/O error occurs
+ */
+ RecordReader rows() throws IOException;
+
+ /**
+ * Create a RecordReader that uses the options given.
+ * This method can't be named rows, because many callers used rows(null)
+ * before the rows() method was introduced.
+ * @param options the options to read with
+ * @return a new RecordReader
+ * @throws IOException if an I/O error occurs
+ */
+ RecordReader rowsOptions(Options options) throws IOException;
+
+ /**
+ * Get the reader for the file metadata.
+ * @return Metadata reader used to read file metadata.
+ * @throws IOException if an I/O error occurs
+ */
+ MetadataReader metadata() throws IOException;
+
+ /**
+ * Get the version of the file as a list of integers.
+ * @return List of integers representing version of the file, in order from major to minor.
+ */
+ List<Integer> getVersionList();
+
+ /**
+ * Get the size of the metadata.
+ * @return the size of the metadata, in bytes.
+ */
+ int getMetadataSize();
+
+ /**
+ * Get the stripe statistics as protobuf objects.
+ * @return Stripe statistics, in original protobuf form.
+ */
+ List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics();
+
+ /**
+ * Get the stripe statistics.
+ * @return Stripe statistics.
+ */
+ List<StripeStatistics> getStripeStatistics();
+
+ /**
+ * Get the file statistics as protobuf objects.
+ * @return File statistics, in original protobuf form.
+ */
+ List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics();
+
+ /**
+ * Create the default data reader.
+ * @param useZeroCopy Whether zero-copy read should be used.
+ * @return The default data reader that ORC is using to read bytes from disk.
+ */
+ DataReader createDefaultDataReader(boolean useZeroCopy);
+
+ /**
+ * Get the serialized file footer.
+ * @return Serialized file metadata read from disk for the purposes of caching, etc.
+ */
+ ByteBuffer getSerializedFileFooter();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/RecordReader.java b/orc/src/java/org/apache/orc/RecordReader.java
new file mode 100644
index 0000000..7229dda
--- /dev/null
+++ b/orc/src/java/org/apache/orc/RecordReader.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * An iterator over the rows of an ORC file that hands the rows back in
+ * batches.
+ */
+public interface RecordReader {
+ /**
+ * Read the next row batch. The size of the batch to read cannot be
+ * controlled by the callers. Callers need to look at
+ * VectorizedRowBatch.size of the returned object to know the batch
+ * size read.
+ * @param previousBatch a row batch object that can be reused by the reader
+ * @return the row batch that was read. The batch will have a non-zero row
+ * count if the pointer isn't at the end of the file
+ * @throws java.io.IOException if an I/O error occurs
+ */
+ VectorizedRowBatch nextBatch(VectorizedRowBatch previousBatch
+ ) throws IOException;
+
+ /**
+ * Get the row number of the row that will be returned by the following
+ * read.
+ * NOTE(review): this text used to refer to a call to next(), which this
+ * interface does not declare; presumably it means the first row of the
+ * following nextBatch() call - confirm.
+ * @return the row number from 0 to the number of rows in the file
+ * @throws java.io.IOException if an I/O error occurs
+ */
+ long getRowNumber() throws IOException;
+
+ /**
+ * Get the progress of the reader through the rows.
+ * @return a fraction between 0.0 and 1.0 of rows read
+ * @throws java.io.IOException if an I/O error occurs
+ */
+ float getProgress() throws IOException;
+
+ /**
+ * Release the resources associated with the given reader.
+ * @throws java.io.IOException if an I/O error occurs
+ */
+ void close() throws IOException;
+
+ /**
+ * Seek to a particular row number.
+ * @param rowCount the row number to seek to
+ * @throws java.io.IOException if an I/O error occurs
+ */
+ void seekToRow(long rowCount) throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/StringColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StringColumnStatistics.java b/orc/src/java/org/apache/orc/StringColumnStatistics.java
new file mode 100644
index 0000000..5a868d0
--- /dev/null
+++ b/orc/src/java/org/apache/orc/StringColumnStatistics.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for string columns: the minimum and maximum string and the
+ * total length of all strings.
+ */
+public interface StringColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum string.
+ * @return the minimum string
+ */
+ String getMinimum();
+
+ /**
+ * Get the maximum string.
+ * @return the maximum string
+ */
+ String getMaximum();
+
+ /**
+ * Get the total length of all strings.
+ * @return the sum (total length)
+ */
+ long getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/StripeInformation.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StripeInformation.java b/orc/src/java/org/apache/orc/StripeInformation.java
new file mode 100644
index 0000000..38f7eba
--- /dev/null
+++ b/orc/src/java/org/apache/orc/StripeInformation.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+/**
+ * Information about the stripes in an ORC file that is provided by the Reader.
+ * All lengths and offsets are expressed in bytes.
+ */
+public interface StripeInformation {
+ /**
+ * Get the byte offset of the start of the stripe.
+ * @return the bytes from the start of the file
+ */
+ long getOffset();
+
+ /**
+ * Get the total length of the stripe in bytes.
+ * @return the number of bytes in the stripe
+ */
+ long getLength();
+
+ /**
+ * Get the length of the stripe's indexes.
+ * @return the number of bytes in the index
+ */
+ long getIndexLength();
+
+ /**
+ * Get the length of the stripe's data.
+ * @return the number of bytes in the stripe's data
+ */
+ long getDataLength();
+
+ /**
+ * Get the length of the stripe's tail section (its footer).
+ * NOTE(review): this was previously described as the section "which
+ * contains its index", but the index length is reported separately by
+ * getIndexLength(); the ORC format specification describes the stripe
+ * footer as the directory of stream locations - confirm.
+ * @return the number of bytes in the tail
+ */
+ long getFooterLength();
+
+ /**
+ * Get the number of rows in the stripe.
+ * @return a count of the number of rows
+ */
+ long getNumberOfRows();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/StripeStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StripeStatistics.java b/orc/src/java/org/apache/orc/StripeStatistics.java
new file mode 100644
index 0000000..8fc91cb
--- /dev/null
+++ b/orc/src/java/org/apache/orc/StripeStatistics.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.orc.impl.ColumnStatisticsImpl;
+
+import java.util.List;
+
+/**
+ * The column statistics of a stripe. The statistics are kept in their
+ * protobuf form and are only deserialized when they are asked for.
+ */
+public class StripeStatistics {
+  /** The protobuf column statistics, exactly as handed to the constructor. */
+  private final List<OrcProto.ColumnStatistics> serializedStats;
+
+  /**
+   * Wrap a list of protobuf column statistics. The list is stored by
+   * reference, not copied.
+   *
+   * @param list the protobuf column statistics
+   */
+  public StripeStatistics(List<OrcProto.ColumnStatistics> list) {
+    serializedStats = list;
+  }
+
+  /**
+   * Deserialize every protobuf entry into a ColumnStatistics object. A new
+   * array is built on each call, with the entries in list order.
+   *
+   * @return column stats
+   */
+  public ColumnStatistics[] getColumnStatistics() {
+    ColumnStatistics[] deserialized =
+        new ColumnStatistics[serializedStats.size()];
+    int next = 0;
+    for (OrcProto.ColumnStatistics proto : serializedStats) {
+      deserialized[next++] = ColumnStatisticsImpl.deserialize(proto);
+    }
+    return deserialized;
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/9c7a78ee/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TimestampColumnStatistics.java b/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
new file mode 100644
index 0000000..27dc49f
--- /dev/null
+++ b/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.sql.Timestamp;
+
+/**
+ * Statistics for Timestamp columns: the minimum and maximum value.
+ */
+public interface TimestampColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum value for the column.
+ * @return the minimum timestamp value
+ */
+ Timestamp getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * @return the maximum timestamp value
+ */
+ Timestamp getMaximum();
+}