You are viewing a plain text version of this content. The canonical (hyperlinked) version is available in the mailing-list archive.
Posted to commits@orc.apache.org by om...@apache.org on 2018/06/04 22:49:05 UTC
orc git commit: ORC-373: When "orc.dictionary.key.threshold" is set
to 0, never try dictionary encoding.
Repository: orc
Updated Branches:
refs/heads/master 665baaff4 -> 0a815b5ce
ORC-373: When "orc.dictionary.key.threshold" is set to 0, never try dictionary
encoding.
Fixes #279
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/0a815b5c
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/0a815b5c
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/0a815b5c
Branch: refs/heads/master
Commit: 0a815b5ce962aab82d5eba782c3218e29017ebce
Parents: 665baaf
Author: Prasanth Jayachandran <pr...@apache.org>
Authored: Fri Jun 1 19:24:22 2018 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Jun 4 15:46:20 2018 -0700
----------------------------------------------------------------------
.../orc/impl/writer/StringBaseTreeWriter.java | 7 +-
.../apache/orc/impl/writer/TreeWriterBase.java | 6 +-
.../org/apache/orc/TestStringDictionary.java | 124 ++++++++++++++++++-
.../test/org/apache/orc/impl/TestInStream.java | 4 +-
4 files changed, 134 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/0a815b5c/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
index be4e6dc..742c1ed 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/StringBaseTreeWriter.java
@@ -80,7 +80,12 @@ public abstract class StringBaseTreeWriter extends TreeWriterBase {
OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
strideDictionaryCheck =
OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
- doneDictionaryCheck = false;
+ if (dictionaryKeySizeThreshold <= 0.0) {
+ useDictionaryEncoding = false;
+ doneDictionaryCheck = true;
+ } else {
+ doneDictionaryCheck = false;
+ }
}
private void checkDictionaryEncoding() {
http://git-wip-us.apache.org/repos/asf/orc/blob/0a815b5c/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
index d6145cd..7934b21 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java
@@ -18,6 +18,9 @@
package org.apache.orc.impl.writer;
+import java.io.IOException;
+import java.util.List;
+
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
@@ -36,9 +39,6 @@ import org.apache.orc.util.BloomFilter;
import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.util.BloomFilterUtf8;
-import java.io.IOException;
-import java.util.List;
-
/**
* The parent class of all of the writers for each column. Each column
* is written by an instance of this class. The compound types (struct,
http://git-wip-us.apache.org/repos/asf/orc/blob/0a815b5c/java/core/src/test/org/apache/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index 46209bb..dbd615a 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -20,6 +20,11 @@ package org.apache.orc;
import static org.junit.Assert.assertEquals;
import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
@@ -28,7 +33,16 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.impl.OutStream;
import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.impl.RunLengthIntegerWriter;
+import org.apache.orc.impl.StreamName;
+import org.apache.orc.impl.TestInStream;
+import org.apache.orc.impl.WriterImpl;
+import org.apache.orc.impl.writer.StringTreeWriter;
+import org.apache.orc.impl.writer.TreeWriter;
+import org.apache.orc.impl.writer.WriterContext;
+import org.apache.orc.impl.writer.WriterImplV2;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -124,7 +138,8 @@ public class TestStringDictionary {
writer.addRowBatch(batch);
writer.close();
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
col = (BytesColumnVector) batch.cols[0];
@@ -147,6 +162,113 @@ public class TestStringDictionary {
}
}
+ static class WriterContextImpl implements WriterContext {
+ private final TypeDescription schema;
+ private final Configuration conf;
+ private final Map<StreamName, TestInStream.OutputCollector> streams =
+ new HashMap<>();
+
+ WriterContextImpl(TypeDescription schema, Configuration conf) {
+ this.schema = schema;
+ this.conf = conf;
+ }
+
+ @Override
+ public OutStream createStream(int column, OrcProto.Stream.Kind kind) throws IOException {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ streams.put(new StreamName(column, kind), collect);
+ return new OutStream("test", 1000, null, collect);
+ }
+
+ @Override
+ public int getRowIndexStride() {
+ return 10000;
+ }
+
+ @Override
+ public boolean buildIndex() {
+ return OrcConf.ENABLE_INDEXES.getBoolean(conf);
+ }
+
+ @Override
+ public boolean isCompressed() {
+ return false;
+ }
+
+ @Override
+ public OrcFile.EncodingStrategy getEncodingStrategy() {
+ return OrcFile.EncodingStrategy.SPEED;
+ }
+
+ @Override
+ public boolean[] getBloomFilterColumns() {
+ return new boolean[schema.getMaximumId() + 1];
+ }
+
+ @Override
+ public double getBloomFilterFPP() {
+ return 0;
+ }
+
+ @Override
+ public Configuration getConfiguration() {
+ return conf;
+ }
+
+ @Override
+ public OrcFile.Version getVersion() {
+ return OrcFile.Version.V_0_12;
+ }
+
+ @Override
+ public PhysicalWriter getPhysicalWriter() {
+ return null;
+ }
+
+ @Override
+ public OrcFile.BloomFilterVersion getBloomFilterVersion() {
+ return OrcFile.BloomFilterVersion.UTF8;
+ }
+
+ @Override
+ public void writeIndex(StreamName name, OrcProto.RowIndex.Builder index) {
+
+ }
+
+ @Override
+ public void writeBloomFilter(StreamName name,
+ OrcProto.BloomFilterIndex.Builder bloom) {
+
+ }
+
+ @Override
+ public boolean getUseUTCTimestamp() {
+ return true;
+ }
+ }
+
+ @Test
+ public void testNonDistinctDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.set(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), "0.0");
+ WriterContextImpl writerContext = new WriterContextImpl(schema, conf);
+ StringTreeWriter writer = (StringTreeWriter)
+ TreeWriter.Factory.create(schema, writerContext, true);
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ batch.size = 1024;
+ col.isRepeating = true;
+ col.setVal(0, "foobar".getBytes(StandardCharsets.UTF_8));
+ writer.writeBatch(col, 0, batch.size);
+ TestInStream.OutputCollector output = writerContext.streams.get(
+ new StreamName(0, OrcProto.Stream.Kind.DATA));
+ // Check to make sure that the strings are being written to the stream,
+ // even before we get to the first rowGroup. (6 * 1024 / 1000 * 1000)
+ assertEquals(6000, output.buffer.size());
+ }
+
@Test
public void testTooManyDistinctCheckDisabled() throws Exception {
TypeDescription schema = TypeDescription.createString();
http://git-wip-us.apache.org/repos/asf/orc/blob/0a815b5c/java/core/src/test/org/apache/orc/impl/TestInStream.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/impl/TestInStream.java b/java/core/src/test/org/apache/orc/impl/TestInStream.java
index d40676c..a27c49a 100644
--- a/java/core/src/test/org/apache/orc/impl/TestInStream.java
+++ b/java/core/src/test/org/apache/orc/impl/TestInStream.java
@@ -36,8 +36,8 @@ import org.junit.Test;
public class TestInStream {
- static class OutputCollector implements PhysicalWriter.OutputReceiver {
- DynamicByteArray buffer = new DynamicByteArray();
+ public static class OutputCollector implements PhysicalWriter.OutputReceiver {
+ public DynamicByteArray buffer = new DynamicByteArray();
@Override
public void output(ByteBuffer buffer) throws IOException {