You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/05/13 19:50:51 UTC
[22/23] orc git commit: ORC-1 Import of ORC code from Hive. (omalley reviewed by prasanthj)

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
new file mode 100644
index 0000000..5845ba6
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -0,0 +1,530 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+
+public class OrcUtils {
+
+  /**
+   * Returns selected columns as a boolean array with true value set for specified column names.
+   * The result will contain number of elements equal to flattened number of columns.
+   * For example:
+   * selectedColumns - a,b,c
+   * allColumns - a,b,c,d
+   * If column c is a complex type, say list<string> and other types are primitives then result will
+   * be [false, true, true, true, true, true, false]
+   * Index 0 is the root element of the struct which is set to false by default, index 1,2
+   * corresponds to columns a and b. Index 3,4 correspond to column c which is list<string> and
+   * index 5 correspond to column d. After flattening list<string> gets 2 columns.
+   *
+   * @param selectedColumns - comma separated list of selected column names
+   * @param schema       - object schema
+   * @return - boolean array with true value set for the specified column names
+   */
+  public static boolean[] includeColumns(String selectedColumns,
+                                         TypeDescription schema) {
+    int numFlattenedCols = schema.getMaximumId();
+    boolean[] results = new boolean[numFlattenedCols + 1];
+    if ("*".equals(selectedColumns)) {
+      Arrays.fill(results, true);
+      return results;
+    }
+    if (selectedColumns != null &&
+        schema.getCategory() == TypeDescription.Category.STRUCT) {
+      List<String> fieldNames = schema.getFieldNames();
+      List<TypeDescription> fields = schema.getChildren();
+      for (String column: selectedColumns.split((","))) {
+        TypeDescription col = findColumn(column, fieldNames, fields);
+        if (col != null) {
+          for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
+            results[i] = true;
+          }
+        }
+      }
+    }
+    return results;
+  }
+
+  private static TypeDescription findColumn(String columnName,
+                                            List<String> fieldNames,
+                                            List<TypeDescription> fields) {
+    int i = 0;
+    for(String fieldName: fieldNames) {
+      if (fieldName.equalsIgnoreCase(columnName)) {
+        return fields.get(i);
+      } else {
+        i += 1;
+      }
+    }
+    return null;
+  }
+
+  public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
+    List<OrcProto.Type> result = Lists.newArrayList();
+    appendOrcTypes(result, typeDescr);
+    return result;
+  }
+
+  private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
+    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+    List<TypeDescription> children = typeDescr.getChildren();
+    switch (typeDescr.getCategory()) {
+    case BOOLEAN:
+      type.setKind(OrcProto.Type.Kind.BOOLEAN);
+      break;
+    case BYTE:
+      type.setKind(OrcProto.Type.Kind.BYTE);
+      break;
+    case SHORT:
+      type.setKind(OrcProto.Type.Kind.SHORT);
+      break;
+    case INT:
+      type.setKind(OrcProto.Type.Kind.INT);
+      break;
+    case LONG:
+      type.setKind(OrcProto.Type.Kind.LONG);
+      break;
+    case FLOAT:
+      type.setKind(OrcProto.Type.Kind.FLOAT);
+      break;
+    case DOUBLE:
+      type.setKind(OrcProto.Type.Kind.DOUBLE);
+      break;
+    case STRING:
+      type.setKind(OrcProto.Type.Kind.STRING);
+      break;
+    case CHAR:
+      type.setKind(OrcProto.Type.Kind.CHAR);
+      type.setMaximumLength(typeDescr.getMaxLength());
+      break;
+    case VARCHAR:
+      type.setKind(OrcProto.Type.Kind.VARCHAR);
+      type.setMaximumLength(typeDescr.getMaxLength());
+      break;
+    case BINARY:
+      type.setKind(OrcProto.Type.Kind.BINARY);
+      break;
+    case TIMESTAMP:
+      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+      break;
+    case DATE:
+      type.setKind(OrcProto.Type.Kind.DATE);
+      break;
+    case DECIMAL:
+      type.setKind(OrcProto.Type.Kind.DECIMAL);
+      type.setPrecision(typeDescr.getPrecision());
+      type.setScale(typeDescr.getScale());
+      break;
+    case LIST:
+      type.setKind(OrcProto.Type.Kind.LIST);
+      type.addSubtypes(children.get(0).getId());
+      break;
+    case MAP:
+      type.setKind(OrcProto.Type.Kind.MAP);
+      for(TypeDescription t: children) {
+        type.addSubtypes(t.getId());
+      }
+      break;
+    case STRUCT:
+      type.setKind(OrcProto.Type.Kind.STRUCT);
+      for(TypeDescription t: children) {
+        type.addSubtypes(t.getId());
+      }
+      for(String field: typeDescr.getFieldNames()) {
+        type.addFieldNames(field);
+      }
+      break;
+    case UNION:
+      type.setKind(OrcProto.Type.Kind.UNION);
+      for(TypeDescription t: children) {
+        type.addSubtypes(t.getId());
+      }
+      break;
+    default:
+      throw new IllegalArgumentException("Unknown category: " +
+          typeDescr.getCategory());
+    }
+    result.add(type.build());
+    if (children != null) {
+      for(TypeDescription child: children) {
+        appendOrcTypes(result, child);
+      }
+    }
+  }
+
+  /**
+   * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
+   * numbers based on the length of the result list being appended.
+   *
+   * @param result
+   * @param typeDescr
+   */
+  public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+      TypeDescription typeDescr) {
+
+    int subtype = result.size();
+    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+    boolean needsAdd = true;
+    List<TypeDescription> children = typeDescr.getChildren();
+    switch (typeDescr.getCategory()) {
+    case BOOLEAN:
+      type.setKind(OrcProto.Type.Kind.BOOLEAN);
+      break;
+    case BYTE:
+      type.setKind(OrcProto.Type.Kind.BYTE);
+      break;
+    case SHORT:
+      type.setKind(OrcProto.Type.Kind.SHORT);
+      break;
+    case INT:
+      type.setKind(OrcProto.Type.Kind.INT);
+      break;
+    case LONG:
+      type.setKind(OrcProto.Type.Kind.LONG);
+      break;
+    case FLOAT:
+      type.setKind(OrcProto.Type.Kind.FLOAT);
+      break;
+    case DOUBLE:
+      type.setKind(OrcProto.Type.Kind.DOUBLE);
+      break;
+    case STRING:
+      type.setKind(OrcProto.Type.Kind.STRING);
+      break;
+    case CHAR:
+      type.setKind(OrcProto.Type.Kind.CHAR);
+      type.setMaximumLength(typeDescr.getMaxLength());
+      break;
+    case VARCHAR:
+      type.setKind(OrcProto.Type.Kind.VARCHAR);
+      type.setMaximumLength(typeDescr.getMaxLength());
+      break;
+    case BINARY:
+      type.setKind(OrcProto.Type.Kind.BINARY);
+      break;
+    case TIMESTAMP:
+      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+      break;
+    case DATE:
+      type.setKind(OrcProto.Type.Kind.DATE);
+      break;
+    case DECIMAL:
+      type.setKind(OrcProto.Type.Kind.DECIMAL);
+      type.setPrecision(typeDescr.getPrecision());
+      type.setScale(typeDescr.getScale());
+      break;
+    case LIST:
+      type.setKind(OrcProto.Type.Kind.LIST);
+      type.addSubtypes(++subtype);
+      result.add(type.build());
+      needsAdd = false;
+      appendOrcTypesRebuildSubtypes(result, children.get(0));
+      break;
+    case MAP:
+      {
+        // Make room for MAP type.
+        result.add(null);
+  
+        // Add MAP type pair in order to determine their subtype values.
+        appendOrcTypesRebuildSubtypes(result, children.get(0));
+        int subtype2 = result.size();
+        appendOrcTypesRebuildSubtypes(result, children.get(1));
+        type.setKind(OrcProto.Type.Kind.MAP);
+        type.addSubtypes(subtype + 1);
+        type.addSubtypes(subtype2);
+        result.set(subtype, type.build());
+        needsAdd = false;
+      }
+      break;
+    case STRUCT:
+      {
+        List<String> fieldNames = typeDescr.getFieldNames();
+
+        // Make room for STRUCT type.
+        result.add(null);
+
+        List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+        for(TypeDescription child: children) {
+          int fieldSubtype = result.size();
+          fieldSubtypes.add(fieldSubtype);
+          appendOrcTypesRebuildSubtypes(result, child);
+        }
+
+        type.setKind(OrcProto.Type.Kind.STRUCT);
+
+        for (int i = 0 ; i < fieldNames.size(); i++) {
+          type.addSubtypes(fieldSubtypes.get(i));
+          type.addFieldNames(fieldNames.get(i));
+        }
+        result.set(subtype, type.build());
+        needsAdd = false;
+      }
+      break;
+    case UNION:
+      {
+        // Make room for UNION type.
+        result.add(null);
+
+        List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
+        for(TypeDescription child: children) {
+          int unionSubtype = result.size();
+          unionSubtypes.add(unionSubtype);
+          appendOrcTypesRebuildSubtypes(result, child);
+        }
+
+        type.setKind(OrcProto.Type.Kind.UNION);
+        for (int i = 0 ; i < children.size(); i++) {
+          type.addSubtypes(unionSubtypes.get(i));
+        }
+        result.set(subtype, type.build());
+        needsAdd = false;
+      }
+      break;
+    default:
+      throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
+    }
+    if (needsAdd) {
+      result.add(type.build());
+    }
+  }
+
+  /**
+   * NOTE: This method ignores the subtype numbers in the OrcProto.Type rebuilds the subtype
+   * numbers based on the length of the result list being appended.
+   *
+   * @param result
+   * @param types
+   * @param columnId
+   */
+  public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+      List<OrcProto.Type> types, int columnId) {
+
+    OrcProto.Type oldType = types.get(columnId++);
+
+    int subtype = result.size();
+    OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
+    boolean needsAdd = true;
+    switch (oldType.getKind()) {
+    case BOOLEAN:
+      builder.setKind(OrcProto.Type.Kind.BOOLEAN);
+      break;
+    case BYTE:
+      builder.setKind(OrcProto.Type.Kind.BYTE);
+      break;
+    case SHORT:
+      builder.setKind(OrcProto.Type.Kind.SHORT);
+      break;
+    case INT:
+      builder.setKind(OrcProto.Type.Kind.INT);
+      break;
+    case LONG:
+      builder.setKind(OrcProto.Type.Kind.LONG);
+      break;
+    case FLOAT:
+      builder.setKind(OrcProto.Type.Kind.FLOAT);
+      break;
+    case DOUBLE:
+      builder.setKind(OrcProto.Type.Kind.DOUBLE);
+      break;
+    case STRING:
+      builder.setKind(OrcProto.Type.Kind.STRING);
+      break;
+    case CHAR:
+      builder.setKind(OrcProto.Type.Kind.CHAR);
+      builder.setMaximumLength(oldType.getMaximumLength());
+      break;
+    case VARCHAR:
+      builder.setKind(OrcProto.Type.Kind.VARCHAR);
+      builder.setMaximumLength(oldType.getMaximumLength());
+      break;
+    case BINARY:
+      builder.setKind(OrcProto.Type.Kind.BINARY);
+      break;
+    case TIMESTAMP:
+      builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
+      break;
+    case DATE:
+      builder.setKind(OrcProto.Type.Kind.DATE);
+      break;
+    case DECIMAL:
+      builder.setKind(OrcProto.Type.Kind.DECIMAL);
+      builder.setPrecision(oldType.getPrecision());
+      builder.setScale(oldType.getScale());
+      break;
+    case LIST:
+      builder.setKind(OrcProto.Type.Kind.LIST);
+      builder.addSubtypes(++subtype);
+      result.add(builder.build());
+      needsAdd = false;
+      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+      break;
+    case MAP:
+      {
+        // Make room for MAP type.
+        result.add(null);
+  
+        // Add MAP type pair in order to determine their subtype values.
+        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+        int subtype2 = result.size();
+        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+        builder.setKind(OrcProto.Type.Kind.MAP);
+        builder.addSubtypes(subtype + 1);
+        builder.addSubtypes(subtype2);
+        result.set(subtype, builder.build());
+        needsAdd = false;
+      }
+      break;
+    case STRUCT:
+      {
+        List<String> fieldNames = oldType.getFieldNamesList();
+
+        // Make room for STRUCT type.
+        result.add(null);
+
+        List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+        for(int i = 0 ; i < fieldNames.size(); i++) {
+          int fieldSubtype = result.size();
+          fieldSubtypes.add(fieldSubtype);
+          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+        }
+
+        builder.setKind(OrcProto.Type.Kind.STRUCT);
+
+        for (int i = 0 ; i < fieldNames.size(); i++) {
+          builder.addSubtypes(fieldSubtypes.get(i));
+          builder.addFieldNames(fieldNames.get(i));
+        }
+        result.set(subtype, builder.build());
+        needsAdd = false;
+      }
+      break;
+    case UNION:
+      {
+        int subtypeCount = oldType.getSubtypesCount();
+
+        // Make room for UNION type.
+        result.add(null);
+
+        List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
+        for(int i = 0 ; i < subtypeCount; i++) {
+          int unionSubtype = result.size();
+          unionSubtypes.add(unionSubtype);
+          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+        }
+
+        builder.setKind(OrcProto.Type.Kind.UNION);
+        for (int i = 0 ; i < subtypeCount; i++) {
+          builder.addSubtypes(unionSubtypes.get(i));
+        }
+        result.set(subtype, builder.build());
+        needsAdd = false;
+      }
+      break;
+    default:
+      throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
+    }
+    if (needsAdd) {
+      result.add(builder.build());
+    }
+    return columnId;
+  }
+
+  /**
+   * Translate the given rootColumn from the list of types to a TypeDescription.
+   * @param types all of the types
+   * @param rootColumn translate this type
+   * @return a new TypeDescription that matches the given rootColumn
+   */
+  public static
+        TypeDescription convertTypeFromProtobuf(List<OrcProto.Type> types,
+                                                int rootColumn) {
+    OrcProto.Type type = types.get(rootColumn);
+    switch (type.getKind()) {
+      case BOOLEAN:
+        return TypeDescription.createBoolean();
+      case BYTE:
+        return TypeDescription.createByte();
+      case SHORT:
+        return TypeDescription.createShort();
+      case INT:
+        return TypeDescription.createInt();
+      case LONG:
+        return TypeDescription.createLong();
+      case FLOAT:
+        return TypeDescription.createFloat();
+      case DOUBLE:
+        return TypeDescription.createDouble();
+      case STRING:
+        return TypeDescription.createString();
+      case CHAR:
+      case VARCHAR: {
+        TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
+            TypeDescription.createChar() : TypeDescription.createVarchar();
+        if (type.hasMaximumLength()) {
+          result.withMaxLength(type.getMaximumLength());
+        }
+        return result;
+      }
+      case BINARY:
+        return TypeDescription.createBinary();
+      case TIMESTAMP:
+        return TypeDescription.createTimestamp();
+      case DATE:
+        return TypeDescription.createDate();
+      case DECIMAL: {
+        TypeDescription result = TypeDescription.createDecimal();
+        if (type.hasScale()) {
+          result.withScale(type.getScale());
+        }
+        if (type.hasPrecision()) {
+          result.withPrecision(type.getPrecision());
+        }
+        return result;
+      }
+      case LIST:
+        return TypeDescription.createList(
+            convertTypeFromProtobuf(types, type.getSubtypes(0)));
+      case MAP:
+        return TypeDescription.createMap(
+            convertTypeFromProtobuf(types, type.getSubtypes(0)),
+            convertTypeFromProtobuf(types, type.getSubtypes(1)));
+      case STRUCT: {
+        TypeDescription result = TypeDescription.createStruct();
+        for(int f=0; f < type.getSubtypesCount(); ++f) {
+          result.addField(type.getFieldNames(f),
+              convertTypeFromProtobuf(types, type.getSubtypes(f)));
+        }
+        return result;
+      }
+      case UNION: {
+        TypeDescription result = TypeDescription.createUnion();
+        for(int f=0; f < type.getSubtypesCount(); ++f) {
+          result.addUnionChild(
+              convertTypeFromProtobuf(types, type.getSubtypes(f)));
+        }
+        return result;
+      }
+    }
+    throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java
new file mode 100644
index 0000000..87f3293
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/Reader.java
@@ -0,0 +1,368 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+
+/**
+ * The interface for reading ORC files.
+ *
+ * One Reader can support multiple concurrent RecordReader.
+ */
+public interface Reader {
+
+  /**
+   * Get the number of rows in the file.
+   * @return the number of rows
+   */
+  long getNumberOfRows();
+
+  /**
+   * Get the deserialized data size of the file
+   * @return raw data size
+   */
+  long getRawDataSize();
+
+  /**
+   * Get the deserialized data size of the specified columns
+   * @param colNames
+   * @return raw data size of columns
+   */
+  long getRawDataSizeOfColumns(List<String> colNames);
+
+  /**
+   * Get the deserialized data size of the specified columns ids
+   * @param colIds - internal column id (check orcfiledump for column ids)
+   * @return raw data size of columns
+   */
+  long getRawDataSizeFromColIndices(List<Integer> colIds);
+
+  /**
+   * Get the user metadata keys.
+   * @return the set of metadata keys
+   */
+  List<String> getMetadataKeys();
+
+  /**
+   * Get a user metadata value.
+   * @param key a key given by the user
+   * @return the bytes associated with the given key
+   */
+  ByteBuffer getMetadataValue(String key);
+
+  /**
+   * Did the user set the given metadata value.
+   * @param key the key to check
+   * @return true if the metadata value was set
+   */
+  boolean hasMetadataValue(String key);
+
+  /**
+   * Get the compression kind.
+   * @return the kind of compression in the file
+   */
+  CompressionKind getCompressionKind();
+
+  /**
+   * Get the buffer size for the compression.
+   * @return number of bytes to buffer for the compression codec.
+   */
+  int getCompressionSize();
+
+  /**
+   * Get the number of rows per a entry in the row index.
+   * @return the number of rows per an entry in the row index or 0 if there
+   * is no row index.
+   */
+  int getRowIndexStride();
+
+  /**
+   * Get the list of stripes.
+   * @return the information about the stripes in order
+   */
+  List<StripeInformation> getStripes();
+
+  /**
+   * Get the length of the file.
+   * @return the number of bytes in the file
+   */
+  long getContentLength();
+
+  /**
+   * Get the statistics about the columns in the file.
+   * @return the information about the column
+   */
+  ColumnStatistics[] getStatistics();
+
+  /**
+   * Get the type of rows in this ORC file.
+   */
+  TypeDescription getSchema();
+
+  /**
+   * Get the list of types contained in the file. The root type is the first
+   * type in the list.
+   * @return the list of flattened types
+   * @deprecated use getSchema instead
+   */
+  List<OrcProto.Type> getTypes();
+
+  /**
+   * Get the file format version.
+   */
+  OrcFile.Version getFileVersion();
+
+  /**
+   * Get the version of the writer of this file.
+   */
+  OrcFile.WriterVersion getWriterVersion();
+
+  /**
+   * Options for creating a RecordReader.
+   */
+  public static class Options {
+    private boolean[] include;
+    private long offset = 0;
+    private long length = Long.MAX_VALUE;
+    private SearchArgument sarg = null;
+    private String[] columnNames = null;
+    private Boolean useZeroCopy = null;
+    private Boolean skipCorruptRecords = null;
+    private TypeDescription schema = null;
+    private DataReader dataReader = null;
+
+    /**
+     * Set the list of columns to read.
+     * @param include a list of columns to read
+     * @return this
+     */
+    public Options include(boolean[] include) {
+      this.include = include;
+      return this;
+    }
+
+    /**
+     * Set the range of bytes to read
+     * @param offset the starting byte offset
+     * @param length the number of bytes to read
+     * @return this
+     */
+    public Options range(long offset, long length) {
+      this.offset = offset;
+      this.length = length;
+      return this;
+    }
+
+    /**
+     * Set the schema on read type description.
+     */
+    public Options schema(TypeDescription schema) {
+      this.schema = schema;
+      return this;
+    }
+
+    /**
+     * Set search argument for predicate push down.
+     * @param sarg the search argument
+     * @param columnNames the column names for
+     * @return this
+     */
+    public Options searchArgument(SearchArgument sarg, String[] columnNames) {
+      this.sarg = sarg;
+      this.columnNames = columnNames;
+      return this;
+    }
+
+    /**
+     * Set whether to use zero copy from HDFS.
+     * @param value the new zero copy flag
+     * @return this
+     */
+    public Options useZeroCopy(boolean value) {
+      this.useZeroCopy = value;
+      return this;
+    }
+
+    public Options dataReader(DataReader value) {
+      this.dataReader = value;
+      return this;
+    }
+
+    /**
+     * Set whether to skip corrupt records.
+     * @param value the new skip corrupt records flag
+     * @return this
+     */
+    public Options skipCorruptRecords(boolean value) {
+      this.skipCorruptRecords = value;
+      return this;
+    }
+
+    public boolean[] getInclude() {
+      return include;
+    }
+
+    public long getOffset() {
+      return offset;
+    }
+
+    public long getLength() {
+      return length;
+    }
+
+    public TypeDescription getSchema() {
+      return schema;
+    }
+
+    public SearchArgument getSearchArgument() {
+      return sarg;
+    }
+
+    public String[] getColumnNames() {
+      return columnNames;
+    }
+
+    public long getMaxOffset() {
+      long result = offset + length;
+      if (result < 0) {
+        result = Long.MAX_VALUE;
+      }
+      return result;
+    }
+
+    public Boolean getUseZeroCopy() {
+      return useZeroCopy;
+    }
+
+    public Boolean getSkipCorruptRecords() {
+      return skipCorruptRecords;
+    }
+
+    public DataReader getDataReader() {
+      return dataReader;
+    }
+
+    public Options clone() {
+      Options result = new Options();
+      result.include = include;
+      result.offset = offset;
+      result.length = length;
+      result.sarg = sarg;
+      result.schema = schema;
+      result.columnNames = columnNames;
+      result.useZeroCopy = useZeroCopy;
+      result.skipCorruptRecords = skipCorruptRecords;
+      result.dataReader = dataReader == null ? null : dataReader.clone();
+      return result;
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder buffer = new StringBuilder();
+      buffer.append("{include: ");
+      if (include == null) {
+        buffer.append("null");
+      } else {
+        buffer.append("[");
+        for(int i=0; i < include.length; ++i) {
+          if (i != 0) {
+            buffer.append(", ");
+          }
+          buffer.append(include[i]);
+        }
+        buffer.append("]");
+      }
+      buffer.append(", offset: ");
+      buffer.append(offset);
+      buffer.append(", length: ");
+      buffer.append(length);
+      if (sarg != null) {
+        buffer.append(", sarg: ");
+        buffer.append(sarg.toString());
+        buffer.append(", columns: [");
+        for(int i=0; i < columnNames.length; ++i) {
+          if (i != 0) {
+            buffer.append(", ");
+          }
+          buffer.append("'");
+          buffer.append(columnNames[i]);
+          buffer.append("'");
+        }
+        buffer.append("]");
+      }
+      if (schema != null) {
+        buffer.append(", schema: ");
+        schema.printToBuffer(buffer);
+      }
+      buffer.append("}");
+      return buffer.toString();
+    }
+  }
+
+  /**
+   * Create a RecordReader that reads everything with the default options.
+   * @return a new RecordReader
+   * @throws IOException
+   */
+  RecordReader rows() throws IOException;
+
+  /**
+   * Create a RecordReader that uses the options given.
+   * This method can't be named rows, because many callers used rows(null)
+   * before the rows() method was introduced.
+   * @param options the options to read with
+   * @return a new RecordReader
+   * @throws IOException
+   */
+  RecordReader rows(Options options) throws IOException;
+
+  /**
+   * @return List of integers representing version of the file, in order from major to minor.
+   */
+  List<Integer> getVersionList();
+
+  /**
+   * @return Gets the size of metadata, in bytes.
+   */
+  int getMetadataSize();
+
+  /**
+   * @return Stripe statistics, in original protobuf form.
+   */
+  List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics();
+
+  /**
+   * @return Stripe statistics.
+   */
+  List<StripeStatistics> getStripeStatistics();
+
+  /**
+   * @return File statistics, in original protobuf form.
+   */
+  List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics();
+
+  /**
+   * @return Serialized file metadata read from disk for the purposes of caching, etc.
+   */
+  ByteBuffer getSerializedFileFooter();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/RecordReader.java b/java/core/src/java/org/apache/orc/RecordReader.java
new file mode 100644
index 0000000..09ba0f0
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/RecordReader.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * A row-by-row iterator for ORC files.
+ */
+public interface RecordReader {
+  /**
+   * Read the next row batch. The size of the batch to read cannot be
+   * controlled by the callers. Caller need to look at
+   * VectorizedRowBatch.size of the retunred object to know the batch
+   * size read.
+   * @param batch a row batch object to read into
+   * @return were more rows available to read?
+   * @throws java.io.IOException
+   */
+  boolean nextBatch(VectorizedRowBatch batch) throws IOException;
+
+  /**
+   * Get the row number of the row that will be returned by the following
+   * call to next().
+   * @return the row number from 0 to the number of rows in the file
+   * @throws java.io.IOException
+   */
+  long getRowNumber() throws IOException;
+
+  /**
+   * Get the progress of the reader through the rows.
+   * @return a fraction between 0.0 and 1.0 of rows read
+   * @throws java.io.IOException
+   */
+  float getProgress() throws IOException;
+
+  /**
+   * Release the resources associated with the given reader.
+   * @throws java.io.IOException
+   */
+  void close() throws IOException;
+
+  /**
+   * Seek to a particular row number.
+   */
+  void seekToRow(long rowCount) throws IOException;
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StringColumnStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/StringColumnStatistics.java b/java/core/src/java/org/apache/orc/StringColumnStatistics.java
new file mode 100644
index 0000000..5a868d0
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/StringColumnStatistics.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import org.apache.orc.ColumnStatistics;
+
+/**
+ * Statistics for string columns.
+ */
+public interface StringColumnStatistics extends ColumnStatistics {
+  /**
+   * Get the minimum string.
+   * @return the minimum
+   */
+  String getMinimum();
+
+  /**
+   * Get the maximum string.
+   * @return the maximum
+   */
+  String getMaximum();
+
+  /**
+   * Get the total length of all strings
+   * @return the sum (total length)
+   */
+  long getSum();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StripeInformation.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/StripeInformation.java b/java/core/src/java/org/apache/orc/StripeInformation.java
new file mode 100644
index 0000000..38f7eba
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/StripeInformation.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+/**
+ * Information about the stripes in an ORC file that is provided by the Reader.
+ */
+public interface StripeInformation {
+  /**
+   * Get the byte offset of the start of the stripe.
+   * @return the bytes from the start of the file
+   */
+  long getOffset();
+
+  /**
+   * Get the total length of the stripe in bytes.
+   * @return the number of bytes in the stripe
+   */
+  long getLength();
+
+  /**
+   * Get the length of the stripe's indexes.
+   * @return the number of bytes in the index
+   */
+  long getIndexLength();
+
+  /**
+   * Get the length of the stripe's data.
+   * @return the number of bytes in the stripe
+   */
+  long getDataLength();
+
+  /**
+   * Get the length of the stripe's tail section, which contains its index.
+   * @return the number of bytes in the tail
+   */
+  long getFooterLength();
+
+  /**
+   * Get the number of rows in the stripe.
+   * @return a count of the number of rows
+   */
+  long getNumberOfRows();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/StripeStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/StripeStatistics.java b/java/core/src/java/org/apache/orc/StripeStatistics.java
new file mode 100644
index 0000000..8fc91cb
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/StripeStatistics.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.orc.impl.ColumnStatisticsImpl;
+
+import java.util.List;
+
+public class StripeStatistics {
+  private final List<OrcProto.ColumnStatistics> cs;
+
+  public StripeStatistics(List<OrcProto.ColumnStatistics> list) {
+    this.cs = list;
+  }
+
+  /**
+   * Return list of column statistics
+   *
+   * @return column stats
+   */
+  public ColumnStatistics[] getColumnStatistics() {
+    ColumnStatistics[] result = new ColumnStatistics[cs.size()];
+    for (int i = 0; i < result.length; ++i) {
+      result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
new file mode 100644
index 0000000..27dc49f
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import java.sql.Timestamp;
+
+/**
+ * Statistics for Timestamp columns.
+ */
+public interface TimestampColumnStatistics extends ColumnStatistics {
+  /**
+   * Get the minimum value for the column.
+   * @return minimum value
+   */
+  Timestamp getMinimum();
+
+  /**
+   * Get the maximum value for the column.
+   * @return maximum value
+   */
+  Timestamp getMaximum();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
new file mode 100644
index 0000000..d4c66d1
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -0,0 +1,791 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * This is the description of the types in an ORC file.
+ */
+public class TypeDescription {
+  private static final int MAX_PRECISION = 38;
+  private static final int MAX_SCALE = 38;
+  private static final int DEFAULT_PRECISION = 38;
+  private static final int DEFAULT_SCALE = 10;
+  private static final int DEFAULT_LENGTH = 256;
+  public enum Category {
+    BOOLEAN("boolean", true),
+    BYTE("tinyint", true),
+    SHORT("smallint", true),
+    INT("int", true),
+    LONG("bigint", true),
+    FLOAT("float", true),
+    DOUBLE("double", true),
+    STRING("string", true),
+    DATE("date", true),
+    TIMESTAMP("timestamp", true),
+    BINARY("binary", true),
+    DECIMAL("decimal", true),
+    VARCHAR("varchar", true),
+    CHAR("char", true),
+    LIST("array", false),
+    MAP("map", false),
+    STRUCT("struct", false),
+    UNION("uniontype", false);
+
+    Category(String name, boolean isPrimitive) {
+      this.name = name;
+      this.isPrimitive = isPrimitive;
+    }
+
+    final boolean isPrimitive;
+    final String name;
+
+    public boolean isPrimitive() {
+      return isPrimitive;
+    }
+
+    public String getName() {
+      return name;
+    }
+  }
+
+  public static TypeDescription createBoolean() {
+    return new TypeDescription(Category.BOOLEAN);
+  }
+
+  public static TypeDescription createByte() {
+    return new TypeDescription(Category.BYTE);
+  }
+
+  public static TypeDescription createShort() {
+    return new TypeDescription(Category.SHORT);
+  }
+
+  public static TypeDescription createInt() {
+    return new TypeDescription(Category.INT);
+  }
+
+  public static TypeDescription createLong() {
+    return new TypeDescription(Category.LONG);
+  }
+
+  public static TypeDescription createFloat() {
+    return new TypeDescription(Category.FLOAT);
+  }
+
+  public static TypeDescription createDouble() {
+    return new TypeDescription(Category.DOUBLE);
+  }
+
+  public static TypeDescription createString() {
+    return new TypeDescription(Category.STRING);
+  }
+
+  public static TypeDescription createDate() {
+    return new TypeDescription(Category.DATE);
+  }
+
+  public static TypeDescription createTimestamp() {
+    return new TypeDescription(Category.TIMESTAMP);
+  }
+
+  public static TypeDescription createBinary() {
+    return new TypeDescription(Category.BINARY);
+  }
+
+  public static TypeDescription createDecimal() {
+    return new TypeDescription(Category.DECIMAL);
+  }
+
+  static class StringPosition {
+    final String value;
+    int position;
+    final int length;
+
+    StringPosition(String value) {
+      this.value = value;
+      position = 0;
+      length = value.length();
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder buffer = new StringBuilder();
+      buffer.append('\'');
+      buffer.append(value.substring(0, position));
+      buffer.append('^');
+      buffer.append(value.substring(position));
+      buffer.append('\'');
+      return buffer.toString();
+    }
+  }
+
+  static Category parseCategory(StringPosition source) {
+    int start = source.position;
+    while (source.position < source.length) {
+      char ch = source.value.charAt(source.position);
+      if (!Character.isLetter(ch)) {
+        break;
+      }
+      source.position += 1;
+    }
+    if (source.position != start) {
+      String word = source.value.substring(start, source.position).toLowerCase();
+      for (Category cat : Category.values()) {
+        if (cat.getName().equals(word)) {
+          return cat;
+        }
+      }
+    }
+    throw new IllegalArgumentException("Can't parse category at " + source);
+  }
+
+  static int parseInt(StringPosition source) {
+    int start = source.position;
+    int result = 0;
+    while (source.position < source.length) {
+      char ch = source.value.charAt(source.position);
+      if (!Character.isDigit(ch)) {
+        break;
+      }
+      result = result * 10 + (ch - '0');
+      source.position += 1;
+    }
+    if (source.position == start) {
+      throw new IllegalArgumentException("Missing integer at " + source);
+    }
+    return result;
+  }
+
+  static String parseName(StringPosition source) {
+    int start = source.position;
+    while (source.position < source.length) {
+      char ch = source.value.charAt(source.position);
+      if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') {
+        break;
+      }
+      source.position += 1;
+    }
+    if (source.position == start) {
+      throw new IllegalArgumentException("Missing name at " + source);
+    }
+    return source.value.substring(start, source.position);
+  }
+
+  static void requireChar(StringPosition source, char required) {
+    if (source.position >= source.length ||
+        source.value.charAt(source.position) != required) {
+      throw new IllegalArgumentException("Missing required char '" +
+          required + "' at " + source);
+    }
+    source.position += 1;
+  }
+
+  static boolean consumeChar(StringPosition source, char ch) {
+    boolean result = source.position < source.length &&
+        source.value.charAt(source.position) == ch;
+    if (result) {
+      source.position += 1;
+    }
+    return result;
+  }
+
+  static void parseUnion(TypeDescription type, StringPosition source) {
+    requireChar(source, '<');
+    do {
+      type.addUnionChild(parseType(source));
+    } while (consumeChar(source, ','));
+    requireChar(source, '>');
+  }
+
+  static void parseStruct(TypeDescription type, StringPosition source) {
+    requireChar(source, '<');
+    do {
+      String fieldName = parseName(source);
+      requireChar(source, ':');
+      type.addField(fieldName, parseType(source));
+    } while (consumeChar(source, ','));
+    requireChar(source, '>');
+  }
+
+  static TypeDescription parseType(StringPosition source) {
+    TypeDescription result = new TypeDescription(parseCategory(source));
+    switch (result.getCategory()) {
+      case BINARY:
+      case BOOLEAN:
+      case BYTE:
+      case DATE:
+      case DOUBLE:
+      case FLOAT:
+      case INT:
+      case LONG:
+      case SHORT:
+      case STRING:
+      case TIMESTAMP:
+        break;
+      case CHAR:
+      case VARCHAR:
+        requireChar(source, '(');
+        result.withMaxLength(parseInt(source));
+        requireChar(source, ')');
+        break;
+      case DECIMAL: {
+        requireChar(source, '(');
+        int precision = parseInt(source);
+        requireChar(source, ',');
+        result.withScale(parseInt(source));
+        result.withPrecision(precision);
+        requireChar(source, ')');
+        break;
+      }
+      case LIST:
+        requireChar(source, '<');
+        result.children.add(parseType(source));
+        requireChar(source, '>');
+        break;
+      case MAP:
+        requireChar(source, '<');
+        result.children.add(parseType(source));
+        requireChar(source, ',');
+        result.children.add(parseType(source));
+        requireChar(source, '>');
+        break;
+      case UNION:
+        parseUnion(result, source);
+        break;
+      case STRUCT:
+        parseStruct(result, source);
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown type " +
+            result.getCategory() + " at " + source);
+    }
+    return result;
+  }
+
+  /**
+   * Parse TypeDescription from the Hive type names. This is the inverse
+   * of TypeDescription.toString()
+   * @param typeName the name of the type
+   * @return a new TypeDescription or null if typeName was null
+   * @throws IllegalArgumentException if the string is badly formed
+   */
+  public static TypeDescription fromString(String typeName) {
+    if (typeName == null) {
+      return null;
+    }
+    StringPosition source = new StringPosition(typeName);
+    TypeDescription result = parseType(source);
+    if (source.position != source.length) {
+      throw new IllegalArgumentException("Extra characters at " + source);
+    }
+    return result;
+  }
+
+  /**
+   * For decimal types, set the precision.
+   * @param precision the new precision
+   * @return this
+   */
+  public TypeDescription withPrecision(int precision) {
+    if (category != Category.DECIMAL) {
+      throw new IllegalArgumentException("precision is only allowed on decimal"+
+         " and not " + category.name);
+    } else if (precision < 1 || precision > MAX_PRECISION || scale > precision){
+      throw new IllegalArgumentException("precision " + precision +
+          " is out of range 1 .. " + scale);
+    }
+    this.precision = precision;
+    return this;
+  }
+
+  /**
+   * For decimal types, set the scale.
+   * @param scale the new scale
+   * @return this
+   */
+  public TypeDescription withScale(int scale) {
+    if (category != Category.DECIMAL) {
+      throw new IllegalArgumentException("scale is only allowed on decimal"+
+          " and not " + category.name);
+    } else if (scale < 0 || scale > MAX_SCALE || scale > precision) {
+      throw new IllegalArgumentException("scale is out of range at " + scale);
+    }
+    this.scale = scale;
+    return this;
+  }
+
+  public static TypeDescription createVarchar() {
+    return new TypeDescription(Category.VARCHAR);
+  }
+
+  public static TypeDescription createChar() {
+    return new TypeDescription(Category.CHAR);
+  }
+
+  /**
+   * Set the maximum length for char and varchar types.
+   * @param maxLength the maximum value
+   * @return this
+   */
+  public TypeDescription withMaxLength(int maxLength) {
+    if (category != Category.VARCHAR && category != Category.CHAR) {
+      throw new IllegalArgumentException("maxLength is only allowed on char" +
+                   " and varchar and not " + category.name);
+    }
+    this.maxLength = maxLength;
+    return this;
+  }
+
+  public static TypeDescription createList(TypeDescription childType) {
+    TypeDescription result = new TypeDescription(Category.LIST);
+    result.children.add(childType);
+    childType.parent = result;
+    return result;
+  }
+
+  public static TypeDescription createMap(TypeDescription keyType,
+                                          TypeDescription valueType) {
+    TypeDescription result = new TypeDescription(Category.MAP);
+    result.children.add(keyType);
+    result.children.add(valueType);
+    keyType.parent = result;
+    valueType.parent = result;
+    return result;
+  }
+
+  public static TypeDescription createUnion() {
+    return new TypeDescription(Category.UNION);
+  }
+
+  public static TypeDescription createStruct() {
+    return new TypeDescription(Category.STRUCT);
+  }
+
+  /**
+   * Add a child to a union type.
+   * @param child a new child type to add
+   * @return the union type.
+   */
+  public TypeDescription addUnionChild(TypeDescription child) {
+    if (category != Category.UNION) {
+      throw new IllegalArgumentException("Can only add types to union type" +
+          " and not " + category);
+    }
+    children.add(child);
+    child.parent = this;
+    return this;
+  }
+
+  /**
+   * Add a field to a struct type as it is built.
+   * @param field the field name
+   * @param fieldType the type of the field
+   * @return the struct type
+   */
+  public TypeDescription addField(String field, TypeDescription fieldType) {
+    if (category != Category.STRUCT) {
+      throw new IllegalArgumentException("Can only add fields to struct type" +
+          " and not " + category);
+    }
+    fieldNames.add(field);
+    children.add(fieldType);
+    fieldType.parent = this;
+    return this;
+  }
+
+  /**
+   * Get the id for this type.
+   * The first call will cause all of the the ids in tree to be assigned, so
+   * it should not be called before the type is completely built.
+   * @return the sequential id
+   */
+  public int getId() {
+    // if the id hasn't been assigned, assign all of the ids from the root
+    if (id == -1) {
+      TypeDescription root = this;
+      while (root.parent != null) {
+        root = root.parent;
+      }
+      root.assignIds(0);
+    }
+    return id;
+  }
+
+  public TypeDescription clone() {
+    TypeDescription result = new TypeDescription(category);
+    result.maxLength = maxLength;
+    result.precision = precision;
+    result.scale = scale;
+    if (fieldNames != null) {
+      result.fieldNames.addAll(fieldNames);
+    }
+    if (children != null) {
+      for(TypeDescription child: children) {
+        TypeDescription clone = child.clone();
+        clone.parent = result;
+        result.children.add(clone);
+      }
+    }
+    return result;
+  }
+
+  @Override
+  public int hashCode() {
+    return getId();
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other == null || other.getClass() != TypeDescription.class) {
+      return false;
+    }
+    if (other == this) {
+      return true;
+    }
+    TypeDescription castOther = (TypeDescription) other;
+    if (category != castOther.category ||
+        getId() != castOther.getId() ||
+        getMaximumId() != castOther.getMaximumId() ||
+        maxLength != castOther.maxLength ||
+        scale != castOther.scale ||
+        precision != castOther.precision) {
+      return false;
+    }
+    if (children != null) {
+      if (children.size() != castOther.children.size()) {
+        return false;
+      }
+      for (int i = 0; i < children.size(); ++i) {
+        if (!children.get(i).equals(castOther.children.get(i))) {
+          return false;
+        }
+      }
+    }
+    if (category == Category.STRUCT) {
+      for(int i=0; i < fieldNames.size(); ++i) {
+        if (!fieldNames.get(i).equals(castOther.fieldNames.get(i))) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Get the maximum id assigned to this type or its children.
+   * The first call will cause all of the the ids in tree to be assigned, so
+   * it should not be called before the type is completely built.
+   * @return the maximum id assigned under this type
+   */
+  public int getMaximumId() {
+    // if the id hasn't been assigned, assign all of the ids from the root
+    if (maxId == -1) {
+      TypeDescription root = this;
+      while (root.parent != null) {
+        root = root.parent;
+      }
+      root.assignIds(0);
+    }
+    return maxId;
+  }
+
+  private ColumnVector createColumn(int maxSize) {
+    switch (category) {
+      case BOOLEAN:
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case DATE:
+        return new LongColumnVector(maxSize);
+      case TIMESTAMP:
+        return new TimestampColumnVector(maxSize);
+      case FLOAT:
+      case DOUBLE:
+        return new DoubleColumnVector(maxSize);
+      case DECIMAL:
+        return new DecimalColumnVector(maxSize, precision, scale);
+      case STRING:
+      case BINARY:
+      case CHAR:
+      case VARCHAR:
+        return new BytesColumnVector(maxSize);
+      case STRUCT: {
+        ColumnVector[] fieldVector = new ColumnVector[children.size()];
+        for(int i=0; i < fieldVector.length; ++i) {
+          fieldVector[i] = children.get(i).createColumn(maxSize);
+        }
+        return new StructColumnVector(maxSize,
+                fieldVector);
+      }
+      case UNION: {
+        ColumnVector[] fieldVector = new ColumnVector[children.size()];
+        for(int i=0; i < fieldVector.length; ++i) {
+          fieldVector[i] = children.get(i).createColumn(maxSize);
+        }
+        return new UnionColumnVector(maxSize,
+            fieldVector);
+      }
+      case LIST:
+        return new ListColumnVector(maxSize,
+            children.get(0).createColumn(maxSize));
+      case MAP:
+        return new MapColumnVector(maxSize,
+            children.get(0).createColumn(maxSize),
+            children.get(1).createColumn(maxSize));
+      default:
+        throw new IllegalArgumentException("Unknown type " + category);
+    }
+  }
+
+  public VectorizedRowBatch createRowBatch(int maxSize) {
+    VectorizedRowBatch result;
+    if (category == Category.STRUCT) {
+      result = new VectorizedRowBatch(children.size(), maxSize);
+      for(int i=0; i < result.cols.length; ++i) {
+        result.cols[i] = children.get(i).createColumn(maxSize);
+      }
+    } else {
+      result = new VectorizedRowBatch(1, maxSize);
+      result.cols[0] = createColumn(maxSize);
+    }
+    result.reset();
+    return result;
+  }
+
+  public VectorizedRowBatch createRowBatch() {
+    return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Get the kind of this type.
+   * @return get the category for this type.
+   */
+  public Category getCategory() {
+    return category;
+  }
+
+  /**
+   * Get the maximum length of the type. Only used for char and varchar types.
+   * @return the maximum length of the string type
+   */
+  public int getMaxLength() {
+    return maxLength;
+  }
+
+  /**
+   * Get the precision of the decimal type.
+   * @return the number of digits for the precision.
+   */
+  public int getPrecision() {
+    return precision;
+  }
+
+  /**
+   * Get the scale of the decimal type.
+   * @return the number of digits for the scale.
+   */
+  public int getScale() {
+    return scale;
+  }
+
+  /**
+   * For struct types, get the list of field names.
+   * @return the list of field names.
+   */
+  public List<String> getFieldNames() {
+    return Collections.unmodifiableList(fieldNames);
+  }
+
+  /**
+   * Get the subtypes of this type.
+   * @return the list of children types
+   */
+  public List<TypeDescription> getChildren() {
+    return children == null ? null : Collections.unmodifiableList(children);
+  }
+
+  /**
+   * Assign ids to all of the nodes under this one.
+   * @param startId the lowest id to assign
+   * @return the next available id
+   */
+  private int assignIds(int startId) {
+    id = startId++;
+    if (children != null) {
+      for (TypeDescription child : children) {
+        startId = child.assignIds(startId);
+      }
+    }
+    maxId = startId - 1;
+    return startId;
+  }
+
+  private TypeDescription(Category category) {
+    this.category = category;
+    if (category.isPrimitive) {
+      children = null;
+    } else {
+      children = new ArrayList<>();
+    }
+    if (category == Category.STRUCT) {
+      fieldNames = new ArrayList<>();
+    } else {
+      fieldNames = null;
+    }
+  }
+
+  private int id = -1;
+  private int maxId = -1;
+  private TypeDescription parent;
+  private final Category category;
+  private final List<TypeDescription> children;
+  private final List<String> fieldNames;
+  private int maxLength = DEFAULT_LENGTH;
+  private int precision = DEFAULT_PRECISION;
+  private int scale = DEFAULT_SCALE;
+
+  public void printToBuffer(StringBuilder buffer) {
+    buffer.append(category.name);
+    switch (category) {
+      case DECIMAL:
+        buffer.append('(');
+        buffer.append(precision);
+        buffer.append(',');
+        buffer.append(scale);
+        buffer.append(')');
+        break;
+      case CHAR:
+      case VARCHAR:
+        buffer.append('(');
+        buffer.append(maxLength);
+        buffer.append(')');
+        break;
+      case LIST:
+      case MAP:
+      case UNION:
+        buffer.append('<');
+        for(int i=0; i < children.size(); ++i) {
+          if (i != 0) {
+            buffer.append(',');
+          }
+          children.get(i).printToBuffer(buffer);
+        }
+        buffer.append('>');
+        break;
+      case STRUCT:
+        buffer.append('<');
+        for(int i=0; i < children.size(); ++i) {
+          if (i != 0) {
+            buffer.append(',');
+          }
+          buffer.append(fieldNames.get(i));
+          buffer.append(':');
+          children.get(i).printToBuffer(buffer);
+        }
+        buffer.append('>');
+        break;
+      default:
+        break;
+    }
+  }
+
+  public String toString() {
+    StringBuilder buffer = new StringBuilder();
+    printToBuffer(buffer);
+    return buffer.toString();
+  }
+
+  private void printJsonToBuffer(String prefix, StringBuilder buffer,
+                                 int indent) {
+    for(int i=0; i < indent; ++i) {
+      buffer.append(' ');
+    }
+    buffer.append(prefix);
+    buffer.append("{\"category\": \"");
+    buffer.append(category.name);
+    buffer.append("\", \"id\": ");
+    buffer.append(getId());
+    buffer.append(", \"max\": ");
+    buffer.append(maxId);
+    switch (category) {
+      case DECIMAL:
+        buffer.append(", \"precision\": ");
+        buffer.append(precision);
+        buffer.append(", \"scale\": ");
+        buffer.append(scale);
+        break;
+      case CHAR:
+      case VARCHAR:
+        buffer.append(", \"length\": ");
+        buffer.append(maxLength);
+        break;
+      case LIST:
+      case MAP:
+      case UNION:
+        buffer.append(", \"children\": [");
+        for(int i=0; i < children.size(); ++i) {
+          buffer.append('\n');
+          children.get(i).printJsonToBuffer("", buffer, indent + 2);
+          if (i != children.size() - 1) {
+            buffer.append(',');
+          }
+        }
+        buffer.append("]");
+        break;
+      case STRUCT:
+        buffer.append(", \"fields\": [");
+        for(int i=0; i < children.size(); ++i) {
+          buffer.append('\n');
+          children.get(i).printJsonToBuffer("\"" + fieldNames.get(i) + "\": ",
+              buffer, indent + 2);
+          if (i != children.size() - 1) {
+            buffer.append(',');
+          }
+        }
+        buffer.append(']');
+        break;
+      default:
+        break;
+    }
+    buffer.append('}');
+  }
+
+  public String toJson() {
+    StringBuilder buffer = new StringBuilder();
+    printJsonToBuffer("", buffer, 0);
+    return buffer.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/Writer.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/Writer.java b/java/core/src/java/org/apache/orc/Writer.java
new file mode 100644
index 0000000..4492062
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/Writer.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.orc.OrcProto;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+
+/**
+ * The interface for writing ORC files.
+ */
+public interface Writer {
+
+  /**
+   * Get the schema for this writer
+   * @return the file schema
+   */
+  TypeDescription getSchema();
+
+  /**
+   * Add arbitrary meta-data to the ORC file. This may be called at any point
+   * until the Writer is closed. If the same key is passed a second time, the
+   * second value will replace the first.
+   * @param key a key to label the data with.
+   * @param value the contents of the metadata.
+   */
+  void addUserMetadata(String key, ByteBuffer value);
+
+  /**
+   * Add a row batch to the ORC file.
+   * @param batch the rows to add
+   */
+  void addRowBatch(VectorizedRowBatch batch) throws IOException;
+
+  /**
+   * Flush all of the buffers and close the file. No methods on this writer
+   * should be called afterwards.
+   * @throws IOException
+   */
+  void close() throws IOException;
+
+  /**
+   * Return the deserialized data size. Raw data size will be compute when
+   * writing the file footer. Hence raw data size value will be available only
+   * after closing the writer.
+   *
+   * @return raw data size
+   */
+  long getRawDataSize();
+
+  /**
+   * Return the number of rows in file. Row count gets updated when flushing
+   * the stripes. To get accurate row count this method should be called after
+   * closing the writer.
+   *
+   * @return row count
+   */
+  long getNumberOfRows();
+
+  /**
+   * Write an intermediate footer on the file such that if the file is
+   * truncated to the returned offset, it would be a valid ORC file.
+   * @return the offset that would be a valid end location for an ORC file
+   */
+  long writeIntermediateFooter() throws IOException;
+
+  /**
+   * Fast stripe append to ORC file. This interface is used for fast ORC file
+   * merge with other ORC files. When merging, the file to be merged should pass
+   * stripe in binary form along with stripe information and stripe statistics.
+   * After appending last stripe of a file, use appendUserMetadata() to append
+   * any user metadata.
+   * @param stripe - stripe as byte array
+   * @param offset - offset within byte array
+   * @param length - length of stripe within byte array
+   * @param stripeInfo - stripe information
+   * @param stripeStatistics - stripe statistics (Protobuf objects can be
+   *                         merged directly)
+   * @throws IOException
+   */
+  public void appendStripe(byte[] stripe, int offset, int length,
+      StripeInformation stripeInfo,
+      OrcProto.StripeStatistics stripeStatistics) throws IOException;
+
+  /**
+   * When fast stripe append is used for merging ORC stripes, after appending
+   * the last stripe from a file, this interface must be used to merge any
+   * user metadata.
+   * @param userMetadata - user metadata
+   */
+  public void appendUserMetadata(List<OrcProto.UserMetadataItem> userMetadata);
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/AcidStats.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/AcidStats.java b/java/core/src/java/org/apache/orc/impl/AcidStats.java
new file mode 100644
index 0000000..6657fe9
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/AcidStats.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+/**
+ * Statistics about the ACID operations in an ORC file
+ */
+public class AcidStats {
+  public long inserts;
+  public long updates;
+  public long deletes;
+
+  public AcidStats() {
+    inserts = 0;
+    updates = 0;
+    deletes = 0;
+  }
+
+  public AcidStats(String serialized) {
+    String[] parts = serialized.split(",");
+    inserts = Long.parseLong(parts[0]);
+    updates = Long.parseLong(parts[1]);
+    deletes = Long.parseLong(parts[2]);
+  }
+
+  public String serialize() {
+    StringBuilder builder = new StringBuilder();
+    builder.append(inserts);
+    builder.append(",");
+    builder.append(updates);
+    builder.append(",");
+    builder.append(deletes);
+    return builder.toString();
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder builder = new StringBuilder();
+    builder.append(" inserts: ").append(inserts);
+    builder.append(" updates: ").append(updates);
+    builder.append(" deletes: ").append(deletes);
+    return builder.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/BitFieldReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/BitFieldReader.java b/java/core/src/java/org/apache/orc/impl/BitFieldReader.java
new file mode 100644
index 0000000..dda7355
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/BitFieldReader.java
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.orc.impl.InStream;
+import org.apache.orc.impl.PositionProvider;
+import org.apache.orc.impl.RunLengthByteReader;
+
+public class BitFieldReader {
+  private final RunLengthByteReader input;
+  /** The number of bits in one item. Non-test code always uses 1. */
+  private final int bitSize;
+  private int current;
+  private int bitsLeft;
+  private final int mask;
+
+  public BitFieldReader(InStream input,
+      int bitSize) throws IOException {
+    this.input = new RunLengthByteReader(input);
+    this.bitSize = bitSize;
+    mask = (1 << bitSize) - 1;
+  }
+
+  public void setInStream(InStream inStream) {
+    this.input.setInStream(inStream);
+  }
+
+  private void readByte() throws IOException {
+    if (input.hasNext()) {
+      current = 0xff & input.next();
+      bitsLeft = 8;
+    } else {
+      throw new EOFException("Read past end of bit field from " + this);
+    }
+  }
+
+  public int next() throws IOException {
+    int result = 0;
+    int bitsLeftToRead = bitSize;
+    while (bitsLeftToRead > bitsLeft) {
+      result <<= bitsLeft;
+      result |= current & ((1 << bitsLeft) - 1);
+      bitsLeftToRead -= bitsLeft;
+      readByte();
+    }
+    if (bitsLeftToRead > 0) {
+      result <<= bitsLeftToRead;
+      bitsLeft -= bitsLeftToRead;
+      result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+    }
+    return result & mask;
+  }
+
+  /**
+   * Unlike integer readers, where runs are encoded explicitly, in this one we have to read ahead
+   * to figure out whether we have a run. Given that runs in booleans are likely it's worth it.
+   * However it means we'd need to keep track of how many bytes we read, and next/nextVector won't
+   * work anymore once this is called. These is trivial to fix, but these are never interspersed.
+   */
+  private boolean lastRunValue;
+  private int lastRunLength = -1;
+  private void readNextRun(int maxRunLength) throws IOException {
+    assert bitSize == 1;
+    if (lastRunLength > 0) return; // last run is not exhausted yet
+    if (bitsLeft == 0) {
+      readByte();
+    }
+    // First take care of the partial bits.
+    boolean hasVal = false;
+    int runLength = 0;
+    if (bitsLeft != 8) {
+      int partialBitsMask = (1 << bitsLeft) - 1;
+      int partialBits = current & partialBitsMask;
+      if (partialBits == partialBitsMask || partialBits == 0) {
+        lastRunValue = (partialBits == partialBitsMask);
+        if (maxRunLength <= bitsLeft) {
+          lastRunLength = maxRunLength;
+          return;
+        }
+        maxRunLength -= bitsLeft;
+        hasVal = true;
+        runLength = bitsLeft;
+        bitsLeft = 0;
+      } else {
+        // There's no run in partial bits. Return whatever we have.
+        int prefixBitsCount = 32 - bitsLeft;
+        runLength = Integer.numberOfLeadingZeros(partialBits) - prefixBitsCount;
+        lastRunValue = (runLength > 0);
+        lastRunLength = Math.min(maxRunLength, lastRunValue ? runLength :
+          (Integer.numberOfLeadingZeros(~(partialBits | ~partialBitsMask)) - prefixBitsCount));
+        return;
+      }
+      assert bitsLeft == 0;
+      readByte();
+    }
+    if (!hasVal) {
+      lastRunValue = ((current >> 7) == 1);
+      hasVal = true;
+    }
+    // Read full bytes until the run ends.
+    assert bitsLeft == 8;
+    while (maxRunLength >= 8
+        && ((lastRunValue && (current == 0xff)) || (!lastRunValue && (current == 0)))) {
+      runLength += 8;
+      maxRunLength -= 8;
+      readByte();
+    }
+    if (maxRunLength > 0) {
+      int extraBits = Integer.numberOfLeadingZeros(
+          lastRunValue ? (~(current | ~255)) : current) - 24;
+      bitsLeft -= extraBits;
+      runLength += extraBits;
+    }
+    lastRunLength = runLength;
+  }
+
+  public void nextVector(LongColumnVector previous,
+                         long previousLen) throws IOException {
+    previous.isRepeating = true;
+    for (int i = 0; i < previousLen; i++) {
+      if (previous.noNulls || !previous.isNull[i]) {
+        previous.vector[i] = next();
+      } else {
+        // The default value of null for int types in vectorized
+        // processing is 1, so set that if the value is null
+        previous.vector[i] = 1;
+      }
+
+      // The default value for nulls in Vectorization for int types is 1
+      // and given that non null value can also be 1, we need to check for isNull also
+      // when determining the isRepeating flag.
+      if (previous.isRepeating
+          && i > 0
+          && ((previous.vector[0] != previous.vector[i]) ||
+          (previous.isNull[0] != previous.isNull[i]))) {
+        previous.isRepeating = false;
+      }
+    }
+  }
+
+  public void seek(PositionProvider index) throws IOException {
+    input.seek(index);
+    int consumed = (int) index.getNext();
+    if (consumed > 8) {
+      throw new IllegalArgumentException("Seek past end of byte at " +
+          consumed + " in " + input);
+    } else if (consumed != 0) {
+      readByte();
+      bitsLeft = 8 - consumed;
+    } else {
+      bitsLeft = 0;
+    }
+  }
+
+  public void skip(long items) throws IOException {
+    long totalBits = bitSize * items;
+    if (bitsLeft >= totalBits) {
+      bitsLeft -= totalBits;
+    } else {
+      totalBits -= bitsLeft;
+      input.skip(totalBits / 8);
+      current = input.next();
+      bitsLeft = (int) (8 - (totalBits % 8));
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "bit reader current: " + current + " bits left: " + bitsLeft +
+        " bit size: " + bitSize + " from " + input;
+  }
+
+  boolean hasFullByte() {
+    return bitsLeft == 8 || bitsLeft == 0;
+  }
+
+  int peekOneBit() throws IOException {
+    assert bitSize == 1;
+    if (bitsLeft == 0) {
+      readByte();
+    }
+    return (current >>> (bitsLeft - 1)) & 1;
+  }
+
+  int peekFullByte() throws IOException {
+    assert bitSize == 1;
+    assert bitsLeft == 8 || bitsLeft == 0;
+    if (bitsLeft == 0) {
+      readByte();
+    }
+    return current;
+  }
+
+  void skipInCurrentByte(int bits) throws IOException {
+    assert bitsLeft >= bits;
+    bitsLeft -= bits;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java b/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java
new file mode 100644
index 0000000..aa5f886
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/BitFieldWriter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import org.apache.orc.impl.PositionRecorder;
+import org.apache.orc.impl.PositionedOutputStream;
+import org.apache.orc.impl.RunLengthByteWriter;
+
+import java.io.IOException;
+
+public class BitFieldWriter {
+  private RunLengthByteWriter output;
+  private final int bitSize;
+  private byte current = 0;
+  private int bitsLeft = 8;
+
+  public BitFieldWriter(PositionedOutputStream output,
+                 int bitSize) throws IOException {
+    this.output = new RunLengthByteWriter(output);
+    this.bitSize = bitSize;
+  }
+
+  private void writeByte() throws IOException {
+    output.write(current);
+    current = 0;
+    bitsLeft = 8;
+  }
+
+  public void flush() throws IOException {
+    if (bitsLeft != 8) {
+      writeByte();
+    }
+    output.flush();
+  }
+
+  public void write(int value) throws IOException {
+    int bitsToWrite = bitSize;
+    while (bitsToWrite > bitsLeft) {
+      // add the bits to the bottom of the current word
+      current |= value >>> (bitsToWrite - bitsLeft);
+      // subtract out the bits we just added
+      bitsToWrite -= bitsLeft;
+      // zero out the bits above bitsToWrite
+      value &= (1 << bitsToWrite) - 1;
+      writeByte();
+    }
+    bitsLeft -= bitsToWrite;
+    current |= value << bitsLeft;
+    if (bitsLeft == 0) {
+      writeByte();
+    }
+  }
+
+  public void getPosition(PositionRecorder recorder) throws IOException {
+    output.getPosition(recorder);
+    recorder.addPosition(8 - bitsLeft);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/core/src/java/org/apache/orc/impl/BufferChunk.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/BufferChunk.java b/java/core/src/java/org/apache/orc/impl/BufferChunk.java
new file mode 100644
index 0000000..da43b96
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/BufferChunk.java
@@ -0,0 +1,85 @@
+package org.apache.orc.impl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.ByteBuffer;
+
+/**
+ * The sections of stripe that we have read.
+ * This might not match diskRange - 1 disk range can be multiple buffer chunks,
+ * depending on DFS block boundaries.
+ */
+public class BufferChunk extends DiskRangeList {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(BufferChunk.class);
+  final ByteBuffer chunk;
+
+  public BufferChunk(ByteBuffer chunk, long offset) {
+    super(offset, offset + chunk.remaining());
+    this.chunk = chunk;
+  }
+
+  public ByteBuffer getChunk() {
+    return chunk;
+  }
+
+  @Override
+  public boolean hasData() {
+    return chunk != null;
+  }
+
+  @Override
+  public final String toString() {
+    boolean makesSense = chunk.remaining() == (end - offset);
+    return "data range [" + offset + ", " + end + "), size: " + chunk.remaining()
+        + (makesSense ? "" : "(!)") + " type: " +
+        (chunk.isDirect() ? "direct" : "array-backed");
+  }
+
+  @Override
+  public DiskRange sliceAndShift(long offset, long end, long shiftBy) {
+    assert offset <= end && offset >= this.offset && end <= this.end;
+    assert offset + shiftBy >= 0;
+    ByteBuffer sliceBuf = chunk.slice();
+    int newPos = (int) (offset - this.offset);
+    int newLimit = newPos + (int) (end - offset);
+    try {
+      sliceBuf.position(newPos);
+      sliceBuf.limit(newLimit);
+    } catch (Throwable t) {
+      LOG.error("Failed to slice buffer chunk with range" + " [" + this.offset + ", " + this.end
+          + "), position: " + chunk.position() + " limit: " + chunk.limit() + ", "
+          + (chunk.isDirect() ? "direct" : "array") + "; to [" + offset + ", " + end + ") "
+          + t.getClass());
+      throw new RuntimeException(t);
+    }
+    return new BufferChunk(sliceBuf, offset + shiftBy);
+  }
+
+  @Override
+  public ByteBuffer getData() {
+    return chunk;
+  }
+}