Posted to commits@hive.apache.org by om...@apache.org on 2013/06/28 22:31:17 UTC

svn commit: r1497912 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/io/orc/ test/org/apache/hadoop/hive/ql/io/orc/ test/resources/

Author: omalley
Date: Fri Jun 28 20:31:16 2013
New Revision: 1497912

URL: http://svn.apache.org/r1497912
Log:
HIVE-4478. In ORC, remove the isPresent stream from columns that contain no null
values in a stripe. (Prasanth Jayachandran via omalley)
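
In short: as rows are written, the writer tracks whether any value in a nullable
column was null within the current stripe; if none were, the column's PRESENT
(isPresent) bitstream is suppressed at stripe-flush time and its positions are
dropped from the row index, so the stream never reaches the file. A minimal
sketch of that decision, using hypothetical names rather than the actual ORC
writer classes:

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical sketch (not the real ORC classes): per-stripe null tracking
    // that decides whether the PRESENT bitstream can be dropped.
    class NullSuppressionSketch {
      private boolean foundNulls = false;       // reset at every stripe boundary
      private final List<Boolean> presentBits = new ArrayList<Boolean>();

      void write(Object value) {
        presentBits.add(value != null);         // the PRESENT bitstream
        if (value == null) {
          foundNulls = true;
        }
      }

      // Returns the PRESENT bits to persist for this stripe, or null when the
      // whole stream can be suppressed because every value was non-null.
      List<Boolean> flushStripe() {
        List<Boolean> result = foundNulls ? new ArrayList<Boolean>(presentBits) : null;
        presentBits.clear();
        foundNulls = false;                     // start the next stripe clean
        return result;
      }
    }

Because the patch touches only the writer (and tests), the reader evidently
already treats a missing PRESENT stream as "every value present" for that
stripe, which is what lets this stay a writer-only change.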

Added:
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
    hive/trunk/ql/src/test/resources/orc-file-dump.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java Fri Jun 28 20:31:16 2013
@@ -34,6 +34,8 @@ class OutStream extends PositionedOutput
   static final int HEADER_SIZE = 3;
   private final String name;
   private final OutputReceiver receiver;
+  // if enabled, the stream will be suppressed when writing the stripe
+  private boolean suppress;
 
   /**
    * Stores the uncompressed bytes that have been serialized, but not
@@ -70,6 +72,7 @@ class OutStream extends PositionedOutput
     this.bufferSize = bufferSize;
     this.codec = codec;
     this.receiver = receiver;
+    this.suppress = false;
   }
 
   public void clear() throws IOException {
@@ -78,6 +81,7 @@ class OutStream extends PositionedOutput
     compressed = null;
     overflow = null;
     current = null;
+    suppress = false;
   }
 
   /**
@@ -264,5 +268,20 @@ class OutStream extends PositionedOutput
     }
     return result;
   }
+
+  /**
+   * Set suppress flag
+   */
+  public void suppress() {
+    suppress = true;
+  }
+
+  /**
+   * Returns the state of suppress flag
+   * @return value of suppress flag
+   */
+  public boolean isSuppressed() {
+    return suppress;
+  }
 }
 

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java Fri Jun 28 20:31:16 2013
@@ -33,6 +33,7 @@ import org.apache.hadoop.fs.FSDataOutput
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -231,6 +232,14 @@ class WriterImpl implements Writer, Memo
     }
 
     /**
+     * Check the state of suppress flag in output stream
+     * @return value of suppress flag
+     */
+    public boolean isSuppressed() {
+      return outStream.isSuppressed();
+    }
+
+    /**
      * Write the saved compressed buffers to the OutputStream.
      * @param out the stream to write to
      * @throws IOException
@@ -291,9 +300,9 @@ class WriterImpl implements Writer, Memo
      * @return The output outStream that the section needs to be written to.
      * @throws IOException
      */
-    public PositionedOutputStream createStream(int column,
-                                               OrcProto.Stream.Kind kind
-                                              ) throws IOException {
+    public OutStream createStream(int column,
+                                  OrcProto.Stream.Kind kind
+                                  ) throws IOException {
       StreamName name = new StreamName(column, kind);
       BufferedStream result = streams.get(name);
       if (result == null) {
@@ -325,6 +334,14 @@ class WriterImpl implements Writer, Memo
     public boolean buildIndex() {
       return buildIndex;
     }
+
+    /**
+     * Is the ORC file compressed?
+     * @return are the streams compressed
+     */
+    public boolean isCompressed() {
+      return codec != null;
+    }
   }
 
   /**
@@ -337,6 +354,7 @@ class WriterImpl implements Writer, Memo
     protected final int id;
     protected final ObjectInspector inspector;
     private final BitFieldWriter isPresent;
+    private final boolean isCompressed;
     protected final ColumnStatisticsImpl indexStatistics;
     private final ColumnStatisticsImpl fileStatistics;
     protected TreeWriter[] childrenWriters;
@@ -344,6 +362,8 @@ class WriterImpl implements Writer, Memo
     private final OrcProto.RowIndex.Builder rowIndex;
     private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
     private final PositionedOutputStream rowIndexStream;
+    private boolean foundNulls;
+    private OutStream isPresentOutStream;
 
     /**
      * Create a tree writer.
@@ -356,14 +376,17 @@ class WriterImpl implements Writer, Memo
     TreeWriter(int columnId, ObjectInspector inspector,
                StreamFactory streamFactory,
                boolean nullable) throws IOException {
+      this.isCompressed = streamFactory.isCompressed();
       this.id = columnId;
       this.inspector = inspector;
       if (nullable) {
-        isPresent = new BitFieldWriter(streamFactory.createStream(id,
-            OrcProto.Stream.Kind.PRESENT), 1);
+        isPresentOutStream = streamFactory.createStream(id,
+            OrcProto.Stream.Kind.PRESENT);
+        isPresent = new BitFieldWriter(isPresentOutStream, 1);
       } else {
         isPresent = null;
       }
+      this.foundNulls = false;
       indexStatistics = ColumnStatisticsImpl.create(inspector);
       fileStatistics = ColumnStatisticsImpl.create(inspector);
       childrenWriters = new TreeWriter[0];
@@ -401,6 +424,20 @@ class WriterImpl implements Writer, Memo
       }
       if (isPresent != null) {
         isPresent.write(obj == null ? 0 : 1);
+        if(obj == null) {
+          foundNulls = true;
+        }
+      }
+    }
+
+    private void removeIsPresentPositions() {
+      for(int i=0; i < rowIndex.getEntryCount(); ++i) {
+        RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
+        List<Long> positions = entry.getPositionsList();
+        // bit streams use 3 positions if uncompressed, 4 if compressed
+        positions = positions.subList(isCompressed ? 4 : 3, positions.size());
+        entry.clearPositions();
+        entry.addAllPositions(positions);
       }
     }
 
@@ -418,7 +455,21 @@ class WriterImpl implements Writer, Memo
                      int requiredIndexEntries) throws IOException {
       if (isPresent != null) {
         isPresent.flush();
+
+        // if no nulls were found in this stripe, suppress the isPresent stream
+        if(!foundNulls) {
+          isPresentOutStream.suppress();
+          // since isPresent bitstream is suppressed, update the index to
+          // remove the positions of the isPresent stream
+          if (rowIndexStream != null) {
+            removeIsPresentPositions();
+          }
+        }
       }
+
+      // reset the flag for next stripe
+      foundNulls = false;
+
       builder.addColumns(getEncoding());
       if (rowIndexStream != null) {
         if (rowIndex.getEntryCount() != requiredIndexEntries) {
@@ -810,6 +861,7 @@ class WriterImpl implements Writer, Memo
      * and augments them with the final information as the stripe is written.
      * @throws IOException
      */
+    @Override
     void createRowIndexEntry() throws IOException {
       getFileStatistics().merge(indexStatistics);
       OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
@@ -1349,19 +1401,21 @@ class WriterImpl implements Writer, Memo
       long indexEnd = start;
       for(Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
         BufferedStream stream = pair.getValue();
-        stream.flush();
-        stream.spillTo(rawWriter);
-        stream.clear();
-        long end = rawWriter.getPos();
-        StreamName name = pair.getKey();
-        builder.addStreams(OrcProto.Stream.newBuilder()
-            .setColumn(name.getColumn())
-            .setKind(name.getKind())
-            .setLength(end-section));
-        section = end;
-        if (StreamName.Area.INDEX == name.getArea()) {
-          indexEnd = end;
+        if (!stream.isSuppressed()) {
+          stream.flush();
+          stream.spillTo(rawWriter);
+          long end = rawWriter.getPos();
+          StreamName name = pair.getKey();
+          builder.addStreams(OrcProto.Stream.newBuilder()
+              .setColumn(name.getColumn())
+              .setKind(name.getKind())
+              .setLength(end-section));
+          section = end;
+          if (StreamName.Area.INDEX == name.getArea()) {
+            indexEnd = end;
+          }
         }
+        stream.clear();
       }
       builder.build().writeTo(protobufWriter);
       protobufWriter.flush();
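
One detail behind removeIsPresentPositions() above: each row-index entry stores
stream seek positions in the order the streams were set up, and a bit stream
such as PRESENT contributes three positions when uncompressed and four when
compressed (per the comment in the patch). PRESENT is created before the
column's other streams, which is why the subList call trims from the front of
every entry. A rough standalone rendering of the trim, with plain lists
standing in for the OrcProto.RowIndexEntry builders (the position values in
main are made up for illustration):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // Rough stand-in for removeIsPresentPositions(): strip the leading
    // PRESENT-stream positions from every row-index entry once that stream
    // has been suppressed.
    public class TrimPresentPositions {
      static List<List<Long>> trim(List<List<Long>> entries, boolean compressed) {
        // bit streams record 3 positions when uncompressed, 4 when compressed
        int skip = compressed ? 4 : 3;
        List<List<Long>> trimmed = new ArrayList<List<Long>>();
        for (List<Long> positions : entries) {
          trimmed.add(new ArrayList<Long>(positions.subList(skip, positions.size())));
        }
        return trimmed;
      }

      public static void main(String[] args) {
        // hypothetical entry: three PRESENT positions (uncompressed stream)
        // followed by two DATA positions
        List<List<Long>> index = new ArrayList<List<Long>>();
        index.add(Arrays.asList(0L, 0L, 0L, 128L, 5L));
        System.out.println(trim(index, false));   // prints [[128, 5]]
      }
    }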

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java Fri Jun 28 20:31:16 2013
@@ -43,7 +43,7 @@ public class TestFileDump {
   Path workDir = new Path(System.getProperty("test.tmp.dir",
       "target" + File.separator + "test" + File.separator + "tmp"));
   Path resourceDir = new Path(System.getProperty("test.build.resources",
-      "src" + File.separator + "test" + File.separator + "resources"));
+      "ql" + File.separator + "src" + File.separator + "test" + File.separator + "resources"));
 
   Configuration conf;
   FileSystem fs;

Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java?rev=1497912&view=auto
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java (added)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java Fri Jun 28 20:31:16 2013
@@ -0,0 +1,338 @@
+package org.apache.hadoop.hive.ql.io.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.collect.Lists;
+
+public class TestOrcNullOptimization {
+
+  public static class MyStruct {
+    Integer a;
+    String b;
+    Boolean c;
+    List<InnerStruct> list = new ArrayList<InnerStruct>();
+
+    public MyStruct(Integer a, String b, Boolean c, List<InnerStruct> l) {
+      this.a = a;
+      this.b = b;
+      this.c = c;
+      this.list = l;
+    }
+  }
+
+  public static class InnerStruct {
+    Integer z;
+
+    public InnerStruct(int z) {
+      this.z = z;
+    }
+  }
+
+  Path workDir = new Path(System.getProperty("test.tmp.dir",
+      "target" + File.separator + "test" + File.separator + "tmp"));
+
+  Configuration conf;
+  FileSystem fs;
+  Path testFilePath;
+
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  @Before
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestOrcNullOptimization." +
+        testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
+  @Test
+  public void testMultiStripeWithNull() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcNullOptimization.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 10000);
+    Random rand = new Random(100);
+    writer.addRow(new MyStruct(null, null, true, Lists.newArrayList(new InnerStruct(100))));
+    for (int i = 2; i < 20000; i++) {
+      writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
+          .newArrayList(new InnerStruct(100))));
+    }
+    writer.addRow(new MyStruct(null, null, true, Lists.newArrayList(new InnerStruct(100))));
+    writer.close();
+
+    Reader reader = OrcFile.createReader(fs, testFilePath);
+    // check the stats
+    ColumnStatistics[] stats = reader.getStatistics();
+    assertEquals(20000, reader.getNumberOfRows());
+    assertEquals(20000, stats[0].getNumberOfValues());
+
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+    assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+    assertEquals("count: 19998 min: 0 max: 0 sum: 0",
+        stats[1].toString());
+
+    assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
+    assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+    assertEquals(19998, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+    assertEquals("count: 19998 min: a max: a",
+        stats[2].toString());
+
+    // check the inspectors
+    StructObjectInspector readerInspector =
+        (StructObjectInspector) reader.getObjectInspector();
+    assertEquals(ObjectInspector.Category.STRUCT,
+        readerInspector.getCategory());
+    assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+        readerInspector.getTypeName());
+
+    RecordReader rows = reader.rows(null);
+
+    List<Boolean> expected = Lists.newArrayList();
+    for (StripeInformation sinfo : reader.getStripes()) {
+      expected.add(false);
+    }
+    // only the first and last stripe will have PRESENT stream
+    expected.set(0, true);
+    expected.set(expected.size() - 1, true);
+
+    List<Boolean> got = Lists.newArrayList();
+    // check if the stripe footer contains a PRESENT stream
+    for (StripeInformation sinfo : reader.getStripes()) {
+      OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+      got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+    }
+    assertEquals(expected, got);
+
+    // row 1
+    OrcStruct row = (OrcStruct) rows.next(null);
+    assertNotNull(row);
+    assertNull(row.getFieldValue(0));
+    assertNull(row.getFieldValue(1));
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    rows.seekToRow(19998);
+    // last-1 row
+    row = (OrcStruct) rows.next(null);
+    assertNotNull(row);
+    assertNotNull(row.getFieldValue(1));
+    assertEquals(new IntWritable(0), row.getFieldValue(0));
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    // last row
+    row = (OrcStruct) rows.next(row);
+    assertNotNull(row);
+    assertNull(row.getFieldValue(0));
+    assertNull(row.getFieldValue(1));
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    rows.close();
+  }
+
+  @Test
+  public void testMultiStripeWithoutNull() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcNullOptimization.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 10000);
+    Random rand = new Random(100);
+    for (int i = 1; i < 20000; i++) {
+      writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
+          .newArrayList(new InnerStruct(100))));
+    }
+    writer.addRow(new MyStruct(0, "b", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.close();
+
+    Reader reader = OrcFile.createReader(fs, testFilePath);
+    // check the stats
+    ColumnStatistics[] stats = reader.getStatistics();
+    assertEquals(20000, reader.getNumberOfRows());
+    assertEquals(20000, stats[0].getNumberOfValues());
+
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+    assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+    assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+    assertEquals("count: 20000 min: 0 max: 0 sum: 0",
+        stats[1].toString());
+
+    assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
+    assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+    assertEquals(20000, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+    assertEquals("count: 20000 min: a max: b",
+        stats[2].toString());
+
+    // check the inspectors
+    StructObjectInspector readerInspector =
+        (StructObjectInspector) reader.getObjectInspector();
+    assertEquals(ObjectInspector.Category.STRUCT,
+        readerInspector.getCategory());
+    assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+        readerInspector.getTypeName());
+
+    RecordReader rows = reader.rows(null);
+
+    // none of the stripes will have PRESENT stream
+    List<Boolean> expected = Lists.newArrayList();
+    for (StripeInformation sinfo : reader.getStripes()) {
+      expected.add(false);
+    }
+
+    List<Boolean> got = Lists.newArrayList();
+    // check if the stripe footer contains a PRESENT stream
+    for (StripeInformation sinfo : reader.getStripes()) {
+      OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+      got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+    }
+    assertEquals(expected, got);
+
+    rows.seekToRow(19998);
+    // last-1 row
+    OrcStruct row = (OrcStruct) rows.next(null);
+    assertNotNull(row);
+    assertNotNull(row.getFieldValue(1));
+    assertEquals(new IntWritable(0), row.getFieldValue(0));
+    assertEquals("a", row.getFieldValue(1).toString());
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    // last row
+    row = (OrcStruct) rows.next(row);
+    assertNotNull(row);
+    assertNotNull(row.getFieldValue(0));
+    assertNotNull(row.getFieldValue(1));
+    assertEquals("b", row.getFieldValue(1).toString());
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    rows.close();
+  }
+
+  @Test
+  public void testColumnsWithNullAndCompression() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcNullOptimization.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.ZLIB, 10000, 10000);
+    writer.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
+    writer.close();
+
+    Reader reader = OrcFile.createReader(fs, testFilePath);
+    // check the stats
+    ColumnStatistics[] stats = reader.getStatistics();
+    assertEquals(8, reader.getNumberOfRows());
+    assertEquals(8, stats[0].getNumberOfValues());
+
+    assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
+    assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
+    assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+    assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
+    assertEquals("count: 7 min: 2 max: 3 sum: 17",
+        stats[1].toString());
+
+    assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
+    assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+    assertEquals(7, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+    assertEquals("count: 7 min: a max: h",
+        stats[2].toString());
+
+    // check the inspectors
+    StructObjectInspector readerInspector =
+        (StructObjectInspector) reader.getObjectInspector();
+    assertEquals(ObjectInspector.Category.STRUCT,
+        readerInspector.getCategory());
+    assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+        readerInspector.getTypeName());
+
+    RecordReader rows = reader.rows(null);
+    // only the last stripe will have a PRESENT stream
+    List<Boolean> expected = Lists.newArrayList();
+    for (StripeInformation sinfo : reader.getStripes()) {
+      expected.add(false);
+    }
+    expected.set(expected.size() - 1, true);
+
+    List<Boolean> got = Lists.newArrayList();
+    // check if the stripe footer contains a PRESENT stream
+    for (StripeInformation sinfo : reader.getStripes()) {
+      OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+      got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+    }
+    assertEquals(expected, got);
+
+    // row 1
+    OrcStruct row = (OrcStruct) rows.next(null);
+    assertNotNull(row);
+    assertEquals(new IntWritable(3), row.getFieldValue(0));
+    assertEquals("a", row.getFieldValue(1).toString());
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    // row 2
+    row = (OrcStruct) rows.next(row);
+    assertNotNull(row);
+    assertNull(row.getFieldValue(0));
+    assertEquals("b", row.getFieldValue(1).toString());
+    assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+    // row 3
+    row = (OrcStruct) rows.next(row);
+    assertNotNull(row);
+    assertNull(row.getFieldValue(1));
+    assertEquals(new IntWritable(3), row.getFieldValue(0));
+    assertEquals(new BooleanWritable(false), row.getFieldValue(2));
+    assertEquals(new IntWritable(100),
+        ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+    rows.close();
+  }
+}

Modified: hive/trunk/ql/src/test/resources/orc-file-dump.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump.out?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump.out Fri Jun 28 20:31:16 2013
@@ -11,87 +11,72 @@ Statistics:
   Column 3: count: 21000 min: Darkness, max: worst
 
 Stripes:
-  Stripe: offset: 3 data: 69638 rows: 5000 tail: 85 index: 126
+  Stripe: offset: 3 data: 69605 rows: 5000 tail: 72 index: 119
     Stream: column 0 section ROW_INDEX start: 3 length 10
-    Stream: column 1 section ROW_INDEX start: 13 length 38
-    Stream: column 2 section ROW_INDEX start: 51 length 42
-    Stream: column 3 section ROW_INDEX start: 93 length 36
-    Stream: column 1 section PRESENT start: 129 length 11
-    Stream: column 1 section DATA start: 140 length 22605
-    Stream: column 2 section PRESENT start: 22745 length 11
-    Stream: column 2 section DATA start: 22756 length 43426
-    Stream: column 3 section PRESENT start: 66182 length 11
-    Stream: column 3 section DATA start: 66193 length 3403
-    Stream: column 3 section LENGTH start: 69596 length 38
-    Stream: column 3 section DICTIONARY_DATA start: 69634 length 133
+    Stream: column 1 section ROW_INDEX start: 13 length 35
+    Stream: column 2 section ROW_INDEX start: 48 length 39
+    Stream: column 3 section ROW_INDEX start: 87 length 35
+    Stream: column 1 section DATA start: 122 length 22605
+    Stream: column 2 section DATA start: 22727 length 43426
+    Stream: column 3 section DATA start: 66153 length 3403
+    Stream: column 3 section LENGTH start: 69556 length 38
+    Stream: column 3 section DICTIONARY_DATA start: 69594 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT
     Encoding column 2: DIRECT
     Encoding column 3: DICTIONARY[35]
-  Stripe: offset: 69852 data: 69617 rows: 5000 tail: 83 index: 124
-    Stream: column 0 section ROW_INDEX start: 69852 length 10
-    Stream: column 1 section ROW_INDEX start: 69862 length 36
-    Stream: column 2 section ROW_INDEX start: 69898 length 42
-    Stream: column 3 section ROW_INDEX start: 69940 length 36
-    Stream: column 1 section PRESENT start: 69976 length 11
-    Stream: column 1 section DATA start: 69987 length 22597
-    Stream: column 2 section PRESENT start: 92584 length 11
-    Stream: column 2 section DATA start: 92595 length 43439
-    Stream: column 3 section PRESENT start: 136034 length 11
-    Stream: column 3 section DATA start: 136045 length 3377
-    Stream: column 3 section LENGTH start: 139422 length 38
-    Stream: column 3 section DICTIONARY_DATA start: 139460 length 133
+  Stripe: offset: 69799 data: 69584 rows: 5000 tail: 73 index: 118
+    Stream: column 0 section ROW_INDEX start: 69799 length 10
+    Stream: column 1 section ROW_INDEX start: 69809 length 34
+    Stream: column 2 section ROW_INDEX start: 69843 length 39
+    Stream: column 3 section ROW_INDEX start: 69882 length 35
+    Stream: column 1 section DATA start: 69917 length 22597
+    Stream: column 2 section DATA start: 92514 length 43439
+    Stream: column 3 section DATA start: 135953 length 3377
+    Stream: column 3 section LENGTH start: 139330 length 38
+    Stream: column 3 section DICTIONARY_DATA start: 139368 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT
     Encoding column 2: DIRECT
     Encoding column 3: DICTIONARY[35]
-  Stripe: offset: 139676 data: 69603 rows: 5000 tail: 85 index: 127
-    Stream: column 0 section ROW_INDEX start: 139676 length 10
-    Stream: column 1 section ROW_INDEX start: 139686 length 39
-    Stream: column 2 section ROW_INDEX start: 139725 length 42
-    Stream: column 3 section ROW_INDEX start: 139767 length 36
-    Stream: column 1 section PRESENT start: 139803 length 11
-    Stream: column 1 section DATA start: 139814 length 22594
-    Stream: column 2 section PRESENT start: 162408 length 11
-    Stream: column 2 section DATA start: 162419 length 43415
-    Stream: column 3 section PRESENT start: 205834 length 11
-    Stream: column 3 section DATA start: 205845 length 3390
-    Stream: column 3 section LENGTH start: 209235 length 38
-    Stream: column 3 section DICTIONARY_DATA start: 209273 length 133
+  Stripe: offset: 139574 data: 69570 rows: 5000 tail: 73 index: 120
+    Stream: column 0 section ROW_INDEX start: 139574 length 10
+    Stream: column 1 section ROW_INDEX start: 139584 length 36
+    Stream: column 2 section ROW_INDEX start: 139620 length 39
+    Stream: column 3 section ROW_INDEX start: 139659 length 35
+    Stream: column 1 section DATA start: 139694 length 22594
+    Stream: column 2 section DATA start: 162288 length 43415
+    Stream: column 3 section DATA start: 205703 length 3390
+    Stream: column 3 section LENGTH start: 209093 length 38
+    Stream: column 3 section DICTIONARY_DATA start: 209131 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT
     Encoding column 2: DIRECT
     Encoding column 3: DICTIONARY[35]
-  Stripe: offset: 209491 data: 69584 rows: 5000 tail: 84 index: 126
-    Stream: column 0 section ROW_INDEX start: 209491 length 10
-    Stream: column 1 section ROW_INDEX start: 209501 length 38
-    Stream: column 2 section ROW_INDEX start: 209539 length 42
-    Stream: column 3 section ROW_INDEX start: 209581 length 36
-    Stream: column 1 section PRESENT start: 209617 length 11
-    Stream: column 1 section DATA start: 209628 length 22575
-    Stream: column 2 section PRESENT start: 232203 length 11
-    Stream: column 2 section DATA start: 232214 length 43426
-    Stream: column 3 section PRESENT start: 275640 length 11
-    Stream: column 3 section DATA start: 275651 length 3379
-    Stream: column 3 section LENGTH start: 279030 length 38
-    Stream: column 3 section DICTIONARY_DATA start: 279068 length 133
+  Stripe: offset: 209337 data: 69551 rows: 5000 tail: 72 index: 119
+    Stream: column 0 section ROW_INDEX start: 209337 length 10
+    Stream: column 1 section ROW_INDEX start: 209347 length 35
+    Stream: column 2 section ROW_INDEX start: 209382 length 39
+    Stream: column 3 section ROW_INDEX start: 209421 length 35
+    Stream: column 1 section DATA start: 209456 length 22575
+    Stream: column 2 section DATA start: 232031 length 43426
+    Stream: column 3 section DATA start: 275457 length 3379
+    Stream: column 3 section LENGTH start: 278836 length 38
+    Stream: column 3 section DICTIONARY_DATA start: 278874 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT
     Encoding column 2: DIRECT
     Encoding column 3: DICTIONARY[35]
-  Stripe: offset: 279285 data: 14111 rows: 1000 tail: 80 index: 127
-    Stream: column 0 section ROW_INDEX start: 279285 length 10
-    Stream: column 1 section ROW_INDEX start: 279295 length 39
-    Stream: column 2 section ROW_INDEX start: 279334 length 42
-    Stream: column 3 section ROW_INDEX start: 279376 length 36
-    Stream: column 1 section PRESENT start: 279412 length 5
-    Stream: column 1 section DATA start: 279417 length 4529
-    Stream: column 2 section PRESENT start: 283946 length 5
-    Stream: column 2 section DATA start: 283951 length 8690
-    Stream: column 3 section PRESENT start: 292641 length 5
-    Stream: column 3 section DATA start: 292646 length 706
-    Stream: column 3 section LENGTH start: 293352 length 38
-    Stream: column 3 section DICTIONARY_DATA start: 293390 length 133
+  Stripe: offset: 279079 data: 14096 rows: 1000 tail: 68 index: 120
+    Stream: column 0 section ROW_INDEX start: 279079 length 10
+    Stream: column 1 section ROW_INDEX start: 279089 length 36
+    Stream: column 2 section ROW_INDEX start: 279125 length 39
+    Stream: column 3 section ROW_INDEX start: 279164 length 35
+    Stream: column 1 section DATA start: 279199 length 4529
+    Stream: column 2 section DATA start: 283728 length 8690
+    Stream: column 3 section DATA start: 292418 length 706
+    Stream: column 3 section LENGTH start: 293124 length 38
+    Stream: column 3 section DICTIONARY_DATA start: 293162 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT
     Encoding column 2: DIRECT