You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2014/09/02 22:53:05 UTC
svn commit: r1622119 - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/
ql/src/java/org/apache/hadoop/hive/ql/io/orc/
ql/src/test/org/apache/hadoop/hive/ql/io/orc/
Author: prasanthj
Date: Tue Sep 2 20:53:04 2014
New Revision: 1622119
URL: http://svn.apache.org/r1622119
Log:
HIVE-7832: Do ORC dictionary check at a finer level and preserve encoding across stripes (Prasanth J reviewed by Gopal V)
Added:
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1622119&r1=1622118&r2=1622119&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue Sep 2 20:53:04 2014
@@ -836,6 +836,10 @@ public class HiveConf extends Configurat
"If the number of keys in a dictionary is greater than this fraction of the total number of\n" +
"non-null rows, turn off dictionary encoding. Use 1 to always use dictionary encoding."),
HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE("hive.exec.orc.default.row.index.stride", 10000, "Define the default ORC index stride"),
+ HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK("hive.orc.row.index.stride.dictionary.check", true,
+ "If enabled dictionary check will happen after first row index stride (default 10000 rows)\n" +
+ "else dictionary check will happen before writing first stripe. In both cases, the decision\n" +
+ "to use dictionary or not will be retained thereafter."),
HIVE_ORC_DEFAULT_BUFFER_SIZE("hive.exec.orc.default.buffer.size", 256 * 1024, "Define the default ORC buffer size"),
HIVE_ORC_DEFAULT_BLOCK_PADDING("hive.exec.orc.default.block.padding", true, "Define the default block padding"),
HIVE_ORC_BLOCK_PADDING_TOLERANCE("hive.exec.orc.block.padding.tolerance", 0.05f,
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java?rev=1622119&r1=1622118&r2=1622119&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java Tue Sep 2 20:53:04 2014
@@ -96,9 +96,6 @@ class WriterImpl implements Writer, Memo
private static final int HDFS_BUFFER_SIZE = 256 * 1024;
private static final int MIN_ROW_INDEX_STRIDE = 1000;
- // HDFS requires blocks < 2GB and multiples of 512, so pick 1.5GB
- private static final long MAX_BLOCK_SIZE = 1536 * 1024 * 1024;
-
// threshold above which buffer size will be automatically resized
private static final int COLUMN_COUNT_THRESHOLD = 1000;
@@ -135,8 +132,6 @@ class WriterImpl implements Writer, Memo
new TreeMap<String, ByteString>();
private final StreamFactory streamFactory = new StreamFactory();
private final TreeWriter treeWriter;
- private final OrcProto.RowIndex.Builder rowIndex =
- OrcProto.RowIndex.newBuilder();
private final boolean buildIndex;
private final MemoryManager memoryManager;
private final OrcFile.Version version;
@@ -678,7 +673,7 @@ class WriterImpl implements Writer, Memo
if (rowIndexStream != null) {
if (rowIndex.getEntryCount() != requiredIndexEntries) {
throw new IllegalArgumentException("Column has wrong number of " +
- "index entries found: " + rowIndexEntry + " expected: " +
+ "index entries found: " + rowIndex.getEntryCount() + " expected: " +
requiredIndexEntries);
}
rowIndex.build().writeTo(rowIndexStream);
@@ -1005,6 +1000,8 @@ class WriterImpl implements Writer, Memo
private final float dictionaryKeySizeThreshold;
private boolean useDictionaryEncoding = true;
private boolean isDirectV2 = true;
+ private boolean doneDictionaryCheck;
+ private final boolean strideDictionaryCheck;
StringTreeWriter(int columnId,
ObjectInspector inspector,
@@ -1025,9 +1022,14 @@ class WriterImpl implements Writer, Memo
directLengthOutput = createIntegerWriter(writer.createStream(id,
OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
dictionaryKeySizeThreshold = writer.getConfiguration().getFloat(
- HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
- HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.
- defaultFloatVal);
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
+ HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.
+ defaultFloatVal);
+ strideDictionaryCheck = writer.getConfiguration().getBoolean(
+ HiveConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname,
+ HiveConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.
+ defaultBoolVal);
+ doneDictionaryCheck = false;
}
/**
@@ -1045,21 +1047,71 @@ class WriterImpl implements Writer, Memo
super.write(obj);
if (obj != null) {
Text val = getTextValue(obj);
- rows.add(dictionary.add(val));
+ if (useDictionaryEncoding || !strideDictionaryCheck) {
+ rows.add(dictionary.add(val));
+ } else {
+ // write data and length
+ directStreamOutput.write(val.getBytes(), 0, val.getLength());
+ directLengthOutput.write(val.getLength());
+ }
indexStatistics.updateString(val);
}
}
+ private boolean checkDictionaryEncoding() {
+ if (!doneDictionaryCheck) {
+ // Set the flag indicating whether or not to use dictionary encoding
+ // based on whether or not the fraction of distinct keys over number of
+ // non-null rows is less than the configured threshold
+ float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f;
+ useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold;
+ doneDictionaryCheck = true;
+ }
+ return useDictionaryEncoding;
+ }
+
@Override
void writeStripe(OrcProto.StripeFooter.Builder builder,
int requiredIndexEntries) throws IOException {
- // Set the flag indicating whether or not to use dictionary encoding
- // based on whether or not the fraction of distinct keys over number of
- // non-null rows is less than the configured threshold
- useDictionaryEncoding =
- (!isDirectV2) || (rows.size() > 0 &&
- (float)(dictionary.size()) / rows.size() <=
- dictionaryKeySizeThreshold);
+ // if rows in stripe is less than dictionaryCheckAfterRows, dictionary
+ // checking would not have happened. So do it again here.
+ checkDictionaryEncoding();
+
+ if (useDictionaryEncoding) {
+ flushDictionary();
+ } else {
+      // flush out any leftover entries from the dictionary
+ if (rows.size() > 0) {
+ flushDictionary();
+ }
+
+ // suppress the stream for every stripe if dictionary is disabled
+ stringOutput.suppress();
+ }
+
+ // we need to build the rowindex before calling super, since it
+ // writes it out.
+ super.writeStripe(builder, requiredIndexEntries);
+ stringOutput.flush();
+ lengthOutput.flush();
+ rowOutput.flush();
+ directStreamOutput.flush();
+ directLengthOutput.flush();
+ // reset all of the fields to be ready for the next stripe.
+ dictionary.clear();
+ savedRowIndex.clear();
+ rowIndexValueCount.clear();
+ recordPosition(rowIndexPosition);
+ rowIndexValueCount.add(0L);
+
+ if (!useDictionaryEncoding) {
+      // record the start positions of the first index stride of the next stripe, i.e.
+ // beginning of the direct streams when dictionary is disabled
+ recordDirectStreamPosition();
+ }
+ }
+
+ private void flushDictionary() throws IOException {
final int[] dumpOrder = new int[dictionary.size()];
if (useDictionaryEncoding) {
@@ -1113,21 +1165,7 @@ class WriterImpl implements Writer, Memo
}
}
}
- // we need to build the rowindex before calling super, since it
- // writes it out.
- super.writeStripe(builder, requiredIndexEntries);
- stringOutput.flush();
- lengthOutput.flush();
- rowOutput.flush();
- directStreamOutput.flush();
- directLengthOutput.flush();
- // reset all of the fields to be ready for the next stripe.
- dictionary.clear();
rows.clear();
- savedRowIndex.clear();
- rowIndexValueCount.clear();
- recordPosition(rowIndexPosition);
- rowIndexValueCount.add(0L);
}
@Override
@@ -1165,10 +1203,30 @@ class WriterImpl implements Writer, Memo
OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
rowIndexEntry.setStatistics(indexStatistics.serialize());
indexStatistics.reset();
- savedRowIndex.add(rowIndexEntry.build());
+ OrcProto.RowIndexEntry base = rowIndexEntry.build();
+ savedRowIndex.add(base);
rowIndexEntry.clear();
recordPosition(rowIndexPosition);
rowIndexValueCount.add(Long.valueOf(rows.size()));
+ if (strideDictionaryCheck) {
+ checkDictionaryEncoding();
+ }
+ if (!useDictionaryEncoding) {
+ if (rows.size() > 0) {
+ flushDictionary();
+ // just record the start positions of next index stride
+ recordDirectStreamPosition();
+ } else {
+ // record the start positions of next index stride
+ recordDirectStreamPosition();
+ getRowIndex().addEntry(base);
+ }
+ }
+ }
+
+ private void recordDirectStreamPosition() throws IOException {
+ directStreamOutput.getPosition(rowIndexPosition);
+ directLengthOutput.getPosition(rowIndexPosition);
}
@Override
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java?rev=1622119&r1=1622118&r2=1622119&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java Tue Sep 2 20:53:04 2014
@@ -42,6 +42,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile.Version;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
@@ -1684,7 +1685,7 @@ public class TestOrcFile {
}
@Test
- public void testMemoryManagement() throws Exception {
+ public void testMemoryManagementV11() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
@@ -1699,7 +1700,8 @@ public class TestOrcFile {
.stripeSize(50000)
.bufferSize(100)
.rowIndexStride(0)
- .memory(memory));
+ .memory(memory)
+ .version(Version.V_0_11));
assertEquals(testFilePath, memory.path);
for(int i=0; i < 2500; ++i) {
writer.addRow(new InnerStruct(i*300, Integer.toHexString(10*i)));
@@ -1719,6 +1721,45 @@ public class TestOrcFile {
}
@Test
+ public void testMemoryManagementV12() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector
+ (InnerStruct.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+ MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .inspector(inspector)
+ .compress(CompressionKind.NONE)
+ .stripeSize(50000)
+ .bufferSize(100)
+ .rowIndexStride(0)
+ .memory(memory)
+ .version(Version.V_0_12));
+ assertEquals(testFilePath, memory.path);
+ for(int i=0; i < 2500; ++i) {
+ writer.addRow(new InnerStruct(i*300, Integer.toHexString(10*i)));
+ }
+ writer.close();
+ assertEquals(null, memory.path);
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ int i = 0;
+ for(StripeInformation stripe: reader.getStripes()) {
+ i += 1;
+ assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
+ stripe.getDataLength() < 5000);
+ }
+ // with HIVE-7832, the dictionaries will be disabled after writing the first
+ // stripe as there are too many distinct values. Hence only 3 stripes as
+ // compared to 25 stripes in version 0.11 (above test case)
+ assertEquals(3, i);
+ assertEquals(2500, reader.getNumberOfRows());
+ }
+
+ @Test
public void testPredicatePushdown() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java?rev=1622119&view=auto
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java (added)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestStringDictionary.java Tue Sep 2 20:53:04 2014
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile.Version;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestStringDictionary {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testTooManyDistinct() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ for (int i = 0; i < 20000; i++) {
+ writer.addRow(new Text(String.valueOf(i)));
+ }
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.hasNext()) {
+ Object row = rows.next(null);
+ assertEquals(new Text(String.valueOf(idx++)), row);
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinct() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+
+ for (int i = 0; i < 20000; i++) {
+ writer.addRow(new Text(String.valueOf(input[i])));
+ }
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.hasNext()) {
+ Object row = rows.next(null);
+ assertEquals(new Text(String.valueOf(input[idx++])), row);
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctCheckDisabled() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+
+ conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ for (int i = 0; i < 20000; i++) {
+ writer.addRow(new Text(String.valueOf(i)));
+ }
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.hasNext()) {
+ Object row = rows.next(null);
+ assertEquals(new Text(String.valueOf(idx++)), row);
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinctCheckDisabled() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+
+ conf.setBoolean(ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+
+ for (int i = 0; i < 20000; i++) {
+ writer.addRow(new Text(String.valueOf(input[i])));
+ }
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.hasNext()) {
+ Object row = rows.next(null);
+ assertEquals(new Text(String.valueOf(input[idx++])), row);
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
+ ObjectInspector inspector;
+ synchronized (TestOrcFile.class) {
+ inspector = ObjectInspectorFactory.getReflectionObjectInspector(Text.class,
+ ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+ }
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.NONE)
+ .version(Version.V_0_11).bufferSize(10000));
+ for (int i = 0; i < 20000; i++) {
+ writer.addRow(new Text(String.valueOf(i)));
+ }
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.hasNext()) {
+ Object row = rows.next(null);
+ assertEquals(new Text(String.valueOf(idx++)), row);
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
+ }
+ }
+
+ }
+
+}