You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2013/06/28 22:31:17 UTC
svn commit: r1497912 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/io/orc/
test/org/apache/hadoop/hive/ql/io/orc/ test/resources/
Author: omalley
Date: Fri Jun 28 20:31:16 2013
New Revision: 1497912
URL: http://svn.apache.org/r1497912
Log:
HIVE-4478. In ORC remove ispresent stream from columns that contain no null
values in a stripe. (Prasanth Jayachandran via omalley)
Added:
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
hive/trunk/ql/src/test/resources/orc-file-dump.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OutStream.java Fri Jun 28 20:31:16 2013
@@ -34,6 +34,8 @@ class OutStream extends PositionedOutput
static final int HEADER_SIZE = 3;
private final String name;
private final OutputReceiver receiver;
+ // if enabled the stream will be suppressed when writing stripe
+ private boolean suppress;
/**
* Stores the uncompressed bytes that have been serialized, but not
@@ -70,6 +72,7 @@ class OutStream extends PositionedOutput
this.bufferSize = bufferSize;
this.codec = codec;
this.receiver = receiver;
+ this.suppress = false;
}
public void clear() throws IOException {
@@ -78,6 +81,7 @@ class OutStream extends PositionedOutput
compressed = null;
overflow = null;
current = null;
+ suppress = false;
}
/**
@@ -264,5 +268,20 @@ class OutStream extends PositionedOutput
}
return result;
}
+
+ /**
+ * Set suppress flag
+ */
+ public void suppress() {
+ suppress = true;
+ }
+
+ /**
+ * Returns the state of suppress flag
+ * @return value of suppress flag
+ */
+ public boolean isSuppressed() {
+ return suppress;
+ }
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java Fri Jun 28 20:31:16 2013
@@ -33,6 +33,7 @@ import org.apache.hadoop.fs.FSDataOutput
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -231,6 +232,14 @@ class WriterImpl implements Writer, Memo
}
/**
+ * Check the state of suppress flag in output stream
+ * @return value of suppress flag
+ */
+ public boolean isSuppressed() {
+ return outStream.isSuppressed();
+ }
+
+ /**
* Write the saved compressed buffers to the OutputStream.
* @param out the stream to write to
* @throws IOException
@@ -291,9 +300,9 @@ class WriterImpl implements Writer, Memo
* @return The output outStream that the section needs to be written to.
* @throws IOException
*/
- public PositionedOutputStream createStream(int column,
- OrcProto.Stream.Kind kind
- ) throws IOException {
+ public OutStream createStream(int column,
+ OrcProto.Stream.Kind kind
+ ) throws IOException {
StreamName name = new StreamName(column, kind);
BufferedStream result = streams.get(name);
if (result == null) {
@@ -325,6 +334,14 @@ class WriterImpl implements Writer, Memo
public boolean buildIndex() {
return buildIndex;
}
+
+ /**
+ * Is the ORC file compressed?
+ * @return are the streams compressed
+ */
+ public boolean isCompressed() {
+ return codec != null;
+ }
}
/**
@@ -337,6 +354,7 @@ class WriterImpl implements Writer, Memo
protected final int id;
protected final ObjectInspector inspector;
private final BitFieldWriter isPresent;
+ private final boolean isCompressed;
protected final ColumnStatisticsImpl indexStatistics;
private final ColumnStatisticsImpl fileStatistics;
protected TreeWriter[] childrenWriters;
@@ -344,6 +362,8 @@ class WriterImpl implements Writer, Memo
private final OrcProto.RowIndex.Builder rowIndex;
private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
private final PositionedOutputStream rowIndexStream;
+ private boolean foundNulls;
+ private OutStream isPresentOutStream;
/**
* Create a tree writer.
@@ -356,14 +376,17 @@ class WriterImpl implements Writer, Memo
TreeWriter(int columnId, ObjectInspector inspector,
StreamFactory streamFactory,
boolean nullable) throws IOException {
+ this.isCompressed = streamFactory.isCompressed();
this.id = columnId;
this.inspector = inspector;
if (nullable) {
- isPresent = new BitFieldWriter(streamFactory.createStream(id,
- OrcProto.Stream.Kind.PRESENT), 1);
+ isPresentOutStream = streamFactory.createStream(id,
+ OrcProto.Stream.Kind.PRESENT);
+ isPresent = new BitFieldWriter(isPresentOutStream, 1);
} else {
isPresent = null;
}
+ this.foundNulls = false;
indexStatistics = ColumnStatisticsImpl.create(inspector);
fileStatistics = ColumnStatisticsImpl.create(inspector);
childrenWriters = new TreeWriter[0];
@@ -401,6 +424,20 @@ class WriterImpl implements Writer, Memo
}
if (isPresent != null) {
isPresent.write(obj == null ? 0 : 1);
+ if(obj == null) {
+ foundNulls = true;
+ }
+ }
+ }
+
+ private void removeIsPresentPositions() {
+ for(int i=0; i < rowIndex.getEntryCount(); ++i) {
+ RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
+ List<Long> positions = entry.getPositionsList();
+ // bit streams use 3 positions if uncompressed, 4 if compressed
+ positions = positions.subList(isCompressed ? 4 : 3, positions.size());
+ entry.clearPositions();
+ entry.addAllPositions(positions);
}
}
@@ -418,7 +455,21 @@ class WriterImpl implements Writer, Memo
int requiredIndexEntries) throws IOException {
if (isPresent != null) {
isPresent.flush();
+
+ // if no nulls are found in a stream, then suppress the stream
+ if(!foundNulls) {
+ isPresentOutStream.suppress();
+ // since isPresent bitstream is suppressed, update the index to
+ // remove the positions of the isPresent stream
+ if (rowIndexStream != null) {
+ removeIsPresentPositions();
+ }
+ }
}
+
+ // reset the flag for next stripe
+ foundNulls = false;
+
builder.addColumns(getEncoding());
if (rowIndexStream != null) {
if (rowIndex.getEntryCount() != requiredIndexEntries) {
@@ -810,6 +861,7 @@ class WriterImpl implements Writer, Memo
* and augments them with the final information as the stripe is written.
* @throws IOException
*/
+ @Override
void createRowIndexEntry() throws IOException {
getFileStatistics().merge(indexStatistics);
OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
@@ -1349,19 +1401,21 @@ class WriterImpl implements Writer, Memo
long indexEnd = start;
for(Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
BufferedStream stream = pair.getValue();
- stream.flush();
- stream.spillTo(rawWriter);
- stream.clear();
- long end = rawWriter.getPos();
- StreamName name = pair.getKey();
- builder.addStreams(OrcProto.Stream.newBuilder()
- .setColumn(name.getColumn())
- .setKind(name.getKind())
- .setLength(end-section));
- section = end;
- if (StreamName.Area.INDEX == name.getArea()) {
- indexEnd = end;
+ if (!stream.isSuppressed()) {
+ stream.flush();
+ stream.spillTo(rawWriter);
+ long end = rawWriter.getPos();
+ StreamName name = pair.getKey();
+ builder.addStreams(OrcProto.Stream.newBuilder()
+ .setColumn(name.getColumn())
+ .setKind(name.getKind())
+ .setLength(end-section));
+ section = end;
+ if (StreamName.Area.INDEX == name.getArea()) {
+ indexEnd = end;
+ }
}
+ stream.clear();
}
builder.build().writeTo(protobufWriter);
protobufWriter.flush();
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java Fri Jun 28 20:31:16 2013
@@ -43,7 +43,7 @@ public class TestFileDump {
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
Path resourceDir = new Path(System.getProperty("test.build.resources",
- "src" + File.separator + "test" + File.separator + "resources"));
+ "ql" + File.separator + "src" + File.separator + "test" + File.separator + "resources"));
Configuration conf;
FileSystem fs;
Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java?rev=1497912&view=auto
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java (added)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java Fri Jun 28 20:31:16 2013
@@ -0,0 +1,338 @@
+package org.apache.hadoop.hive.ql.io.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.collect.Lists;
+
+public class TestOrcNullOptimization {
+
+ public static class MyStruct {
+ Integer a;
+ String b;
+ Boolean c;
+ List<InnerStruct> list = new ArrayList<InnerStruct>();
+
+ public MyStruct(Integer a, String b, Boolean c, List<InnerStruct> l) {
+ this.a = a;
+ this.b = b;
+ this.c = c;
+ this.list = l;
+ }
+ }
+
+ public static class InnerStruct {
+ Integer z;
+
+ public InnerStruct(int z) {
+ this.z = z;
+ }
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcNullOptimization." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testMultiStripeWithNull() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcNullOptimization.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+ 100000, CompressionKind.NONE, 10000, 10000);
+ Random rand = new Random(100);
+ writer.addRow(new MyStruct(null, null, true, Lists.newArrayList(new InnerStruct(100))));
+ for (int i = 2; i < 20000; i++) {
+ writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
+ .newArrayList(new InnerStruct(100))));
+ }
+ writer.addRow(new MyStruct(null, null, true, Lists.newArrayList(new InnerStruct(100))));
+ writer.close();
+
+ Reader reader = OrcFile.createReader(fs, testFilePath);
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 19998 min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(19998, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+ assertEquals("count: 19998 min: a max: a",
+ stats[2].toString());
+
+ // check the inspectors
+ StructObjectInspector readerInspector =
+ (StructObjectInspector) reader.getObjectInspector();
+ assertEquals(ObjectInspector.Category.STRUCT,
+ readerInspector.getCategory());
+ assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+ readerInspector.getTypeName());
+
+ RecordReader rows = reader.rows(null);
+
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+ // only the first and last stripe will have PRESENT stream
+ expected.set(0, true);
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+ }
+ assertEquals(expected, got);
+
+ // row 1
+ OrcStruct row = (OrcStruct) rows.next(null);
+ assertNotNull(row);
+ assertNull(row.getFieldValue(0));
+ assertNull(row.getFieldValue(1));
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ rows.seekToRow(19998);
+ // last-1 row
+ row = (OrcStruct) rows.next(null);
+ assertNotNull(row);
+ assertNotNull(row.getFieldValue(1));
+ assertEquals(new IntWritable(0), row.getFieldValue(0));
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ // last row
+ row = (OrcStruct) rows.next(row);
+ assertNotNull(row);
+ assertNull(row.getFieldValue(0));
+ assertNull(row.getFieldValue(1));
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ rows.close();
+ }
+
+ @Test
+ public void testMultiStripeWithoutNull() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcNullOptimization.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+ 100000, CompressionKind.NONE, 10000, 10000);
+ Random rand = new Random(100);
+ for (int i = 1; i < 20000; i++) {
+ writer.addRow(new MyStruct(rand.nextInt(1), "a", true, Lists
+ .newArrayList(new InnerStruct(100))));
+ }
+ writer.addRow(new MyStruct(0, "b", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.close();
+
+ Reader reader = OrcFile.createReader(fs, testFilePath);
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 20000 min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(20000, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+ assertEquals("count: 20000 min: a max: b",
+ stats[2].toString());
+
+ // check the inspectors
+ StructObjectInspector readerInspector =
+ (StructObjectInspector) reader.getObjectInspector();
+ assertEquals(ObjectInspector.Category.STRUCT,
+ readerInspector.getCategory());
+ assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+ readerInspector.getTypeName());
+
+ RecordReader rows = reader.rows(null);
+
+ // none of the stripes will have PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+ }
+ assertEquals(expected, got);
+
+ rows.seekToRow(19998);
+ // last-1 row
+ OrcStruct row = (OrcStruct) rows.next(null);
+ assertNotNull(row);
+ assertNotNull(row.getFieldValue(1));
+ assertEquals(new IntWritable(0), row.getFieldValue(0));
+ assertEquals("a", row.getFieldValue(1).toString());
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ // last row
+ row = (OrcStruct) rows.next(row);
+ assertNotNull(row);
+ assertNotNull(row.getFieldValue(0));
+ assertNotNull(row.getFieldValue(1));
+ assertEquals("b", row.getFieldValue(1).toString());
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ rows.close();
+ }
+
+ @Test
+ public void testColumnsWithNullAndCompression() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcNullOptimization.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (MyStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+ 100000, CompressionKind.ZLIB, 10000, 10000);
+ writer.addRow(new MyStruct(3, "a", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(null, "b", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(3, null, false, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(3, "d", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(2, "e", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(2, "f", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(2, "g", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.addRow(new MyStruct(2, "h", true, Lists.newArrayList(new InnerStruct(100))));
+ writer.close();
+
+ Reader reader = OrcFile.createReader(fs, testFilePath);
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(8, reader.getNumberOfRows());
+ assertEquals(8, stats[0].getNumberOfValues());
+
+ assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 7 min: 2 max: 3 sum: 17",
+ stats[1].toString());
+
+ assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(7, ((StringColumnStatistics) stats[2]).getNumberOfValues());
+ assertEquals("count: 7 min: a max: h",
+ stats[2].toString());
+
+ // check the inspectors
+ StructObjectInspector readerInspector =
+ (StructObjectInspector) reader.getObjectInspector();
+ assertEquals(ObjectInspector.Category.STRUCT,
+ readerInspector.getCategory());
+ assertEquals("struct<a:int,b:string,c:boolean,list:array<struct<z:int>>>",
+ readerInspector.getTypeName());
+
+ RecordReader rows = reader.rows(null);
+ // only the last stripe will have PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf = ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString()) != -1);
+ }
+ assertEquals(expected, got);
+
+ // row 1
+ OrcStruct row = (OrcStruct) rows.next(null);
+ assertNotNull(row);
+ assertEquals(new IntWritable(3), row.getFieldValue(0));
+ assertEquals("a", row.getFieldValue(1).toString());
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ // row 2
+ row = (OrcStruct) rows.next(row);
+ assertNotNull(row);
+ assertNull(row.getFieldValue(0));
+ assertEquals("b", row.getFieldValue(1).toString());
+ assertEquals(new BooleanWritable(true), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+
+ // row 3
+ row = (OrcStruct) rows.next(row);
+ assertNotNull(row);
+ assertNull(row.getFieldValue(1));
+ assertEquals(new IntWritable(3), row.getFieldValue(0));
+ assertEquals(new BooleanWritable(false), row.getFieldValue(2));
+ assertEquals(new IntWritable(100),
+ ((OrcStruct) ((ArrayList<?>) row.getFieldValue(3)).get(0)).getFieldValue(0));
+ rows.close();
+ }
+}
Modified: hive/trunk/ql/src/test/resources/orc-file-dump.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump.out?rev=1497912&r1=1497911&r2=1497912&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump.out Fri Jun 28 20:31:16 2013
@@ -11,87 +11,72 @@ Statistics:
Column 3: count: 21000 min: Darkness, max: worst
Stripes:
- Stripe: offset: 3 data: 69638 rows: 5000 tail: 85 index: 126
+ Stripe: offset: 3 data: 69605 rows: 5000 tail: 72 index: 119
Stream: column 0 section ROW_INDEX start: 3 length 10
- Stream: column 1 section ROW_INDEX start: 13 length 38
- Stream: column 2 section ROW_INDEX start: 51 length 42
- Stream: column 3 section ROW_INDEX start: 93 length 36
- Stream: column 1 section PRESENT start: 129 length 11
- Stream: column 1 section DATA start: 140 length 22605
- Stream: column 2 section PRESENT start: 22745 length 11
- Stream: column 2 section DATA start: 22756 length 43426
- Stream: column 3 section PRESENT start: 66182 length 11
- Stream: column 3 section DATA start: 66193 length 3403
- Stream: column 3 section LENGTH start: 69596 length 38
- Stream: column 3 section DICTIONARY_DATA start: 69634 length 133
+ Stream: column 1 section ROW_INDEX start: 13 length 35
+ Stream: column 2 section ROW_INDEX start: 48 length 39
+ Stream: column 3 section ROW_INDEX start: 87 length 35
+ Stream: column 1 section DATA start: 122 length 22605
+ Stream: column 2 section DATA start: 22727 length 43426
+ Stream: column 3 section DATA start: 66153 length 3403
+ Stream: column 3 section LENGTH start: 69556 length 38
+ Stream: column 3 section DICTIONARY_DATA start: 69594 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DICTIONARY[35]
- Stripe: offset: 69852 data: 69617 rows: 5000 tail: 83 index: 124
- Stream: column 0 section ROW_INDEX start: 69852 length 10
- Stream: column 1 section ROW_INDEX start: 69862 length 36
- Stream: column 2 section ROW_INDEX start: 69898 length 42
- Stream: column 3 section ROW_INDEX start: 69940 length 36
- Stream: column 1 section PRESENT start: 69976 length 11
- Stream: column 1 section DATA start: 69987 length 22597
- Stream: column 2 section PRESENT start: 92584 length 11
- Stream: column 2 section DATA start: 92595 length 43439
- Stream: column 3 section PRESENT start: 136034 length 11
- Stream: column 3 section DATA start: 136045 length 3377
- Stream: column 3 section LENGTH start: 139422 length 38
- Stream: column 3 section DICTIONARY_DATA start: 139460 length 133
+ Stripe: offset: 69799 data: 69584 rows: 5000 tail: 73 index: 118
+ Stream: column 0 section ROW_INDEX start: 69799 length 10
+ Stream: column 1 section ROW_INDEX start: 69809 length 34
+ Stream: column 2 section ROW_INDEX start: 69843 length 39
+ Stream: column 3 section ROW_INDEX start: 69882 length 35
+ Stream: column 1 section DATA start: 69917 length 22597
+ Stream: column 2 section DATA start: 92514 length 43439
+ Stream: column 3 section DATA start: 135953 length 3377
+ Stream: column 3 section LENGTH start: 139330 length 38
+ Stream: column 3 section DICTIONARY_DATA start: 139368 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DICTIONARY[35]
- Stripe: offset: 139676 data: 69603 rows: 5000 tail: 85 index: 127
- Stream: column 0 section ROW_INDEX start: 139676 length 10
- Stream: column 1 section ROW_INDEX start: 139686 length 39
- Stream: column 2 section ROW_INDEX start: 139725 length 42
- Stream: column 3 section ROW_INDEX start: 139767 length 36
- Stream: column 1 section PRESENT start: 139803 length 11
- Stream: column 1 section DATA start: 139814 length 22594
- Stream: column 2 section PRESENT start: 162408 length 11
- Stream: column 2 section DATA start: 162419 length 43415
- Stream: column 3 section PRESENT start: 205834 length 11
- Stream: column 3 section DATA start: 205845 length 3390
- Stream: column 3 section LENGTH start: 209235 length 38
- Stream: column 3 section DICTIONARY_DATA start: 209273 length 133
+ Stripe: offset: 139574 data: 69570 rows: 5000 tail: 73 index: 120
+ Stream: column 0 section ROW_INDEX start: 139574 length 10
+ Stream: column 1 section ROW_INDEX start: 139584 length 36
+ Stream: column 2 section ROW_INDEX start: 139620 length 39
+ Stream: column 3 section ROW_INDEX start: 139659 length 35
+ Stream: column 1 section DATA start: 139694 length 22594
+ Stream: column 2 section DATA start: 162288 length 43415
+ Stream: column 3 section DATA start: 205703 length 3390
+ Stream: column 3 section LENGTH start: 209093 length 38
+ Stream: column 3 section DICTIONARY_DATA start: 209131 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DICTIONARY[35]
- Stripe: offset: 209491 data: 69584 rows: 5000 tail: 84 index: 126
- Stream: column 0 section ROW_INDEX start: 209491 length 10
- Stream: column 1 section ROW_INDEX start: 209501 length 38
- Stream: column 2 section ROW_INDEX start: 209539 length 42
- Stream: column 3 section ROW_INDEX start: 209581 length 36
- Stream: column 1 section PRESENT start: 209617 length 11
- Stream: column 1 section DATA start: 209628 length 22575
- Stream: column 2 section PRESENT start: 232203 length 11
- Stream: column 2 section DATA start: 232214 length 43426
- Stream: column 3 section PRESENT start: 275640 length 11
- Stream: column 3 section DATA start: 275651 length 3379
- Stream: column 3 section LENGTH start: 279030 length 38
- Stream: column 3 section DICTIONARY_DATA start: 279068 length 133
+ Stripe: offset: 209337 data: 69551 rows: 5000 tail: 72 index: 119
+ Stream: column 0 section ROW_INDEX start: 209337 length 10
+ Stream: column 1 section ROW_INDEX start: 209347 length 35
+ Stream: column 2 section ROW_INDEX start: 209382 length 39
+ Stream: column 3 section ROW_INDEX start: 209421 length 35
+ Stream: column 1 section DATA start: 209456 length 22575
+ Stream: column 2 section DATA start: 232031 length 43426
+ Stream: column 3 section DATA start: 275457 length 3379
+ Stream: column 3 section LENGTH start: 278836 length 38
+ Stream: column 3 section DICTIONARY_DATA start: 278874 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT
Encoding column 3: DICTIONARY[35]
- Stripe: offset: 279285 data: 14111 rows: 1000 tail: 80 index: 127
- Stream: column 0 section ROW_INDEX start: 279285 length 10
- Stream: column 1 section ROW_INDEX start: 279295 length 39
- Stream: column 2 section ROW_INDEX start: 279334 length 42
- Stream: column 3 section ROW_INDEX start: 279376 length 36
- Stream: column 1 section PRESENT start: 279412 length 5
- Stream: column 1 section DATA start: 279417 length 4529
- Stream: column 2 section PRESENT start: 283946 length 5
- Stream: column 2 section DATA start: 283951 length 8690
- Stream: column 3 section PRESENT start: 292641 length 5
- Stream: column 3 section DATA start: 292646 length 706
- Stream: column 3 section LENGTH start: 293352 length 38
- Stream: column 3 section DICTIONARY_DATA start: 293390 length 133
+ Stripe: offset: 279079 data: 14096 rows: 1000 tail: 68 index: 120
+ Stream: column 0 section ROW_INDEX start: 279079 length 10
+ Stream: column 1 section ROW_INDEX start: 279089 length 36
+ Stream: column 2 section ROW_INDEX start: 279125 length 39
+ Stream: column 3 section ROW_INDEX start: 279164 length 35
+ Stream: column 1 section DATA start: 279199 length 4529
+ Stream: column 2 section DATA start: 283728 length 8690
+ Stream: column 3 section DATA start: 292418 length 706
+ Stream: column 3 section LENGTH start: 293124 length 38
+ Stream: column 3 section DICTIONARY_DATA start: 293162 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT
Encoding column 2: DIRECT