You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2015/10/23 01:30:41 UTC
[2/2] hive git commit: HIVE-11807: Set ORC buffer size in relation to set stripe size (Owen O'Malley reviewed by Gopal V)
HIVE-11807: Set ORC buffer size in relation to set stripe size (Owen O'Malley reviewed by Gopal V)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/51a0c03f
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/51a0c03f
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/51a0c03f
Branch: refs/heads/branch-1
Commit: 51a0c03f4fe1c90ca6ba9dbb7dad37a4a139c891
Parents: 30ca4fb
Author: Prasanth Jayachandran <j....@gmail.com>
Authored: Thu Oct 22 16:30:22 2015 -0700
Committer: Prasanth Jayachandran <j....@gmail.com>
Committed: Thu Oct 22 16:30:22 2015 -0700
----------------------------------------------------------------------
.../hadoop/hive/ql/io/orc/WriterImpl.java | 99 ++++----
.../hadoop/hive/ql/io/orc/TestOrcWideTable.java | 232 ++-----------------
.../resources/orc-file-dump-bloomfilter.out | 112 ++++-----
.../resources/orc-file-dump-bloomfilter2.out | 144 ++++++------
.../orc-file-dump-dictionary-threshold.out | 180 +++++++-------
ql/src/test/resources/orc-file-dump.json | 184 +++++++--------
ql/src/test/resources/orc-file-dump.out | 158 ++++++-------
ql/src/test/resources/orc-file-has-null.out | 80 +++----
8 files changed, 493 insertions(+), 696 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/51a0c03f/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
index 58c7577..b5e6ad1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
@@ -22,7 +22,6 @@ import static com.google.common.base.Preconditions.checkArgument;
import java.io.IOException;
import java.io.OutputStream;
-import java.lang.management.ManagementFactory;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.ArrayList;
@@ -209,7 +208,8 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
if (allColumns == null) {
allColumns = getColumnNamesFromInspector(inspector);
}
- this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
+ this.bufferSize = getEstimatedBufferSize(defaultStripeSize,
+ countColumns(inspector), bufferSize);
if (version == OrcFile.Version.V_0_11) {
/* do not write bloom filters for ORC v11 */
this.bloomFilterColumns =
@@ -242,49 +242,58 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
return joiner.join(fieldNames);
}
- @VisibleForTesting
- int getEstimatedBufferSize(int bs) {
- return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
- }
-
- int getEstimatedBufferSize(String colNames, int bs) {
- long availableMem = getMemoryAvailableForORC();
- if (colNames != null) {
- final int numCols = colNames.split(",").length;
- if (numCols > COLUMN_COUNT_THRESHOLD) {
- // In BufferedStream, there are 3 outstream buffers (compressed,
- // uncompressed and overflow) and list of previously compressed buffers.
- // Since overflow buffer is rarely used, lets consider only 2 allocation.
- // Also, initially, the list of compression buffers will be empty.
- final int outStreamBuffers = codec == null ? 1 : 2;
-
- // max possible streams per column is 5. For string columns, there is
- // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams.
- final int maxStreams = 5;
-
- // Lets assume 10% memory for holding dictionary in memory and other
- // object allocations
- final long miscAllocation = (long) (0.1f * availableMem);
-
- // compute the available memory
- final long remainingMem = availableMem - miscAllocation;
-
- int estBufferSize = (int) (remainingMem /
- (maxStreams * outStreamBuffers * numCols));
- estBufferSize = getClosestBufferSize(estBufferSize, bs);
- if (estBufferSize > bs) {
- estBufferSize = bs;
+ static int countColumns(ObjectInspector oi) {
+ switch (oi.getCategory()) {
+ case PRIMITIVE:
+ return 1;
+ case STRUCT: {
+ int result = 1;
+ for(StructField field:
+ ((StructObjectInspector) oi).getAllStructFieldRefs()) {
+ result += countColumns(field.getFieldObjectInspector());
}
-
- LOG.info("WIDE TABLE - Number of columns: " + numCols +
- " Chosen compression buffer size: " + estBufferSize);
- return estBufferSize;
+ return result;
+ }
+ case UNION: {
+ int result = 1;
+ for(ObjectInspector child:
+ ((UnionObjectInspector) oi).getObjectInspectors()) {
+ result += countColumns(child);
+ }
+ return result;
+ }
+ case LIST:
+ return 1 + countColumns(
+ ((ListObjectInspector)oi).getListElementObjectInspector());
+ case MAP: {
+ MapObjectInspector moi = (MapObjectInspector) oi;
+ return 1 + countColumns(moi.getMapKeyObjectInspector()) +
+ countColumns(moi.getMapValueObjectInspector());
}
+ default:
+ throw new IllegalArgumentException("Unknown category " +
+ oi.getCategory());
}
- return bs;
}
- private int getClosestBufferSize(int estBufferSize, int bs) {
+ @VisibleForTesting
+ static int getEstimatedBufferSize(long stripeSize, int numColumns, int bs) {
+ // The worst case is that there are 2 big streams per a column and
+ // we want to guarantee that each stream gets ~10 buffers.
+ // This keeps buffers small enough that we don't get really small stripe
+ // sizes.
+ int estBufferSize = (int) (stripeSize / (20 * numColumns));
+ estBufferSize = getClosestBufferSize(estBufferSize);
+ if (estBufferSize > bs) {
+ estBufferSize = bs;
+ } else {
+ LOG.info("WIDE TABLE - Number of columns: " + numColumns +
+ " Chosen compression buffer size: " + estBufferSize);
+ }
+ return estBufferSize;
+ }
+
+ private static int getClosestBufferSize(int estBufferSize) {
final int kb4 = 4 * 1024;
final int kb8 = 8 * 1024;
final int kb16 = 16 * 1024;
@@ -309,16 +318,6 @@ public class WriterImpl implements Writer, MemoryManager.Callback {
}
}
- // the assumption is only one ORC writer open at a time, which holds true for
- // most of the cases. HIVE-6455 forces single writer case.
- private long getMemoryAvailableForORC() {
- HiveConf.ConfVars poolVar = HiveConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL;
- double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal);
- long totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean().
- getHeapMemoryUsage().getMax() * maxLoad);
- return totalMemoryPool;
- }
-
public static CompressionCodec createCodec(CompressionKind kind) {
switch (kind) {
case NONE:
http://git-wip-us.apache.org/repos/asf/hive/blob/51a0c03f/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
index a3d3ec5..6b6cb2c 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcWideTable.java
@@ -38,241 +38,39 @@ import org.junit.rules.TestName;
public class TestOrcWideTable {
- private static final int MEMORY_FOR_ORC = 512 * 1024 * 1024;
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- float memoryPercent;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- // make sure constant memory is available for ORC always
- memoryPercent = (float) MEMORY_FOR_ORC / (float) ManagementFactory.getMemoryMXBean().
- getHeapMemoryUsage().getMax();
- conf.setFloat(HiveConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL.varname, memoryPercent);
- }
-
@Test
public void testBufferSizeFor1Col() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 128 * 1024;
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(bufferSize, newBufferSize);
- }
+ assertEquals(128 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1, 128*1024));
}
@Test
- public void testBufferSizeFor1000Col() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 128 * 1024;
- String columns = getRandomColumnNames(1000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(bufferSize, newBufferSize);
- }
+ public void testBufferSizeFor50Col() throws IOException {
+ assertEquals(256 * 1024, WriterImpl.getEstimatedBufferSize(256 * 1024 * 1024,
+ 50, 256*1024));
}
@Test
- public void testBufferSizeFor2000Col() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 256 * 1024;
- String columns = getRandomColumnNames(2000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.ZLIB).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(32 * 1024, newBufferSize);
- }
+ public void testBufferSizeFor1000Col() throws IOException {
+ assertEquals(32 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1000, 128*1024));
}
@Test
- public void testBufferSizeFor2000ColNoCompression() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 256 * 1024;
- String columns = getRandomColumnNames(2000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(64 * 1024, newBufferSize);
- }
+ public void testBufferSizeFor2000Col() throws IOException {
+ assertEquals(16 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 2000, 256*1024));
}
@Test
public void testBufferSizeFor4000Col() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 256 * 1024;
- String columns = getRandomColumnNames(4000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.ZLIB).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(16 * 1024, newBufferSize);
- }
- }
-
- @Test
- public void testBufferSizeFor4000ColNoCompression() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 256 * 1024;
- String columns = getRandomColumnNames(4000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(32 * 1024, newBufferSize);
- }
+ assertEquals(8 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 4000, 256*1024));
}
@Test
public void testBufferSizeFor25000Col() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 256 * 1024;
- String columns = getRandomColumnNames(25000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- // 4K is the minimum buffer size
- assertEquals(4 * 1024, newBufferSize);
- }
- }
-
- @Test
- public void testBufferSizeManualOverride1() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 1024;
- String columns = getRandomColumnNames(2000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(bufferSize, newBufferSize);
- }
- }
-
- @Test
- public void testBufferSizeManualOverride2() throws IOException {
- ObjectInspector inspector;
- synchronized (TestOrcFile.class) {
- inspector = ObjectInspectorFactory.getReflectionObjectInspector(Long.class,
- ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
- }
- int bufferSize = 2 * 1024;
- String columns = getRandomColumnNames(4000);
- // just for testing. manually write the column names
- conf.set(IOConstants.COLUMNS, columns);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(bufferSize));
- final int newBufferSize;
- if (writer instanceof WriterImpl) {
- WriterImpl orcWriter = (WriterImpl) writer;
- newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
- assertEquals(bufferSize, newBufferSize);
- }
- }
-
- private String getRandomColumnNames(int n) {
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < n - 1; i++) {
- sb.append("col").append(i).append(",");
- }
- sb.append("col").append(n - 1);
- return sb.toString();
+ assertEquals(4 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+ 25000, 256*1024));
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/51a0c03f/ql/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter.out b/ql/src/test/resources/orc-file-dump-bloomfilter.out
index add163c..3420135 100644
--- a/ql/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/ql/src/test/resources/orc-file-dump-bloomfilter.out
@@ -2,7 +2,7 @@ Structure for TestFileDump.testDump.orc
File Version: 0.12 with HIVE_8732
Rows: 21000
Compression: ZLIB
-Compression size: 10000
+Compression size: 4096
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
@@ -39,17 +39,17 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63765 rows: 5000 tail: 86 index: 845
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 164
- Stream: column 2 section ROW_INDEX start: 184 length 173
- Stream: column 3 section ROW_INDEX start: 357 length 87
- Stream: column 3 section BLOOM_FILTER start: 444 length 404
- Stream: column 1 section DATA start: 848 length 20029
- Stream: column 2 section DATA start: 20877 length 40035
- Stream: column 3 section DATA start: 60912 length 3543
- Stream: column 3 section LENGTH start: 64455 length 25
- Stream: column 3 section DICTIONARY_DATA start: 64480 length 133
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 3 section ROW_INDEX start: 355 length 87
+ Stream: column 3 section BLOOM_FILTER start: 442 length 512
+ Stream: column 1 section DATA start: 954 length 20035
+ Stream: column 2 section DATA start: 20989 length 40050
+ Stream: column 3 section DATA start: 61039 length 3543
+ Stream: column 3 section LENGTH start: 64582 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 64699 data: 63754 rows: 5000 tail: 86 index: 837
- Stream: column 0 section ROW_INDEX start: 64699 length 17
- Stream: column 1 section ROW_INDEX start: 64716 length 162
- Stream: column 2 section ROW_INDEX start: 64878 length 171
- Stream: column 3 section ROW_INDEX start: 65049 length 83
- Stream: column 3 section BLOOM_FILTER start: 65132 length 404
- Stream: column 1 section DATA start: 65536 length 20029
- Stream: column 2 section DATA start: 85565 length 40035
- Stream: column 3 section DATA start: 125600 length 3532
- Stream: column 3 section LENGTH start: 129132 length 25
- Stream: column 3 section DICTIONARY_DATA start: 129157 length 133
+ Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
+ Stream: column 0 section ROW_INDEX start: 64826 length 17
+ Stream: column 1 section ROW_INDEX start: 64843 length 164
+ Stream: column 2 section ROW_INDEX start: 65007 length 168
+ Stream: column 3 section ROW_INDEX start: 65175 length 83
+ Stream: column 3 section BLOOM_FILTER start: 65258 length 512
+ Stream: column 1 section DATA start: 65770 length 20035
+ Stream: column 2 section DATA start: 85805 length 40050
+ Stream: column 3 section DATA start: 125855 length 3532
+ Stream: column 3 section LENGTH start: 129387 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 129376 data: 63766 rows: 5000 tail: 86 index: 841
- Stream: column 0 section ROW_INDEX start: 129376 length 17
- Stream: column 1 section ROW_INDEX start: 129393 length 159
- Stream: column 2 section ROW_INDEX start: 129552 length 171
- Stream: column 3 section ROW_INDEX start: 129723 length 90
- Stream: column 3 section BLOOM_FILTER start: 129813 length 404
- Stream: column 1 section DATA start: 130217 length 20029
- Stream: column 2 section DATA start: 150246 length 40035
- Stream: column 3 section DATA start: 190281 length 3544
- Stream: column 3 section LENGTH start: 193825 length 25
- Stream: column 3 section DICTIONARY_DATA start: 193850 length 133
+ Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
+ Stream: column 0 section ROW_INDEX start: 129631 length 17
+ Stream: column 1 section ROW_INDEX start: 129648 length 163
+ Stream: column 2 section ROW_INDEX start: 129811 length 168
+ Stream: column 3 section ROW_INDEX start: 129979 length 90
+ Stream: column 3 section BLOOM_FILTER start: 130069 length 512
+ Stream: column 1 section DATA start: 130581 length 20035
+ Stream: column 2 section DATA start: 150616 length 40050
+ Stream: column 3 section DATA start: 190666 length 3544
+ Stream: column 3 section LENGTH start: 194210 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 194069 data: 63796 rows: 5000 tail: 86 index: 844
- Stream: column 0 section ROW_INDEX start: 194069 length 17
- Stream: column 1 section ROW_INDEX start: 194086 length 162
- Stream: column 2 section ROW_INDEX start: 194248 length 170
- Stream: column 3 section ROW_INDEX start: 194418 length 91
- Stream: column 3 section BLOOM_FILTER start: 194509 length 404
- Stream: column 1 section DATA start: 194913 length 20029
- Stream: column 2 section DATA start: 214942 length 40035
- Stream: column 3 section DATA start: 254977 length 3574
- Stream: column 3 section LENGTH start: 258551 length 25
- Stream: column 3 section DICTIONARY_DATA start: 258576 length 133
+ Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
+ Stream: column 0 section ROW_INDEX start: 194454 length 17
+ Stream: column 1 section ROW_INDEX start: 194471 length 165
+ Stream: column 2 section ROW_INDEX start: 194636 length 167
+ Stream: column 3 section ROW_INDEX start: 194803 length 91
+ Stream: column 3 section BLOOM_FILTER start: 194894 length 512
+ Stream: column 1 section DATA start: 195406 length 20035
+ Stream: column 2 section DATA start: 215441 length 40050
+ Stream: column 3 section DATA start: 255491 length 3574
+ Stream: column 3 section LENGTH start: 259065 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
- Stripe: offset: 258795 data: 12940 rows: 1000 tail: 78 index: 432
- Stream: column 0 section ROW_INDEX start: 258795 length 12
- Stream: column 1 section ROW_INDEX start: 258807 length 38
- Stream: column 2 section ROW_INDEX start: 258845 length 41
- Stream: column 3 section ROW_INDEX start: 258886 length 40
- Stream: column 3 section BLOOM_FILTER start: 258926 length 301
- Stream: column 1 section DATA start: 259227 length 4007
- Stream: column 2 section DATA start: 263234 length 8007
- Stream: column 3 section DATA start: 271241 length 768
- Stream: column 3 section LENGTH start: 272009 length 25
- Stream: column 3 section DICTIONARY_DATA start: 272034 length 133
+ Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
+ Stream: column 0 section ROW_INDEX start: 259309 length 12
+ Stream: column 1 section ROW_INDEX start: 259321 length 38
+ Stream: column 2 section ROW_INDEX start: 259359 length 41
+ Stream: column 3 section ROW_INDEX start: 259400 length 40
+ Stream: column 3 section BLOOM_FILTER start: 259440 length 301
+ Stream: column 1 section DATA start: 259741 length 4007
+ Stream: column 2 section DATA start: 263748 length 8010
+ Stream: column 3 section DATA start: 271758 length 768
+ Stream: column 3 section LENGTH start: 272526 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -172,6 +172,6 @@ Stripes:
Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-File length: 272790 bytes
+File length: 273307 bytes
Padding length: 0 bytes
Padding ratio: 0%
http://git-wip-us.apache.org/repos/asf/hive/blob/51a0c03f/ql/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/ql/src/test/resources/orc-file-dump-bloomfilter2.out b/ql/src/test/resources/orc-file-dump-bloomfilter2.out
index 06b65ce..462d41f 100644
--- a/ql/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/ql/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -2,7 +2,7 @@ Structure for TestFileDump.testDump.orc
File Version: 0.12 with HIVE_8732
Rows: 21000
Compression: ZLIB
-Compression size: 10000
+Compression size: 4096
Type: struct<i:int,l:bigint,s:string>
Stripe Statistics:
@@ -39,27 +39,27 @@ File Statistics:
Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
Stripes:
- Stripe: offset: 3 data: 63765 rows: 5000 tail: 85 index: 6935
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
Stream: column 0 section ROW_INDEX start: 3 length 17
- Stream: column 1 section ROW_INDEX start: 20 length 164
- Stream: column 2 section ROW_INDEX start: 184 length 173
- Stream: column 2 section BLOOM_FILTER start: 357 length 6494
- Stream: column 3 section ROW_INDEX start: 6851 length 87
- Stream: column 1 section DATA start: 6938 length 20029
- Stream: column 2 section DATA start: 26967 length 40035
- Stream: column 3 section DATA start: 67002 length 3543
- Stream: column 3 section LENGTH start: 70545 length 25
- Stream: column 3 section DICTIONARY_DATA start: 70570 length 133
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 2 section BLOOM_FILTER start: 355 length 6535
+ Stream: column 3 section ROW_INDEX start: 6890 length 87
+ Stream: column 1 section DATA start: 6977 length 20035
+ Stream: column 2 section DATA start: 27012 length 40050
+ Stream: column 3 section DATA start: 67062 length 3543
+ Stream: column 3 section LENGTH start: 70605 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DICTIONARY_V2[35]
Row group indices for column 2:
Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 0,4098,488
- Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 10003,2294,464
- Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20006,490,440
- Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 20006,8686,416
+ Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
Bloom filters for column 2:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4931 loadFactor: 0.5136 expectedFpp: 0.009432924
Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4956 loadFactor: 0.5163 expectedFpp: 0.009772834
@@ -67,27 +67,27 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
- Stripe: offset: 70788 data: 63754 rows: 5000 tail: 85 index: 6917
- Stream: column 0 section ROW_INDEX start: 70788 length 17
- Stream: column 1 section ROW_INDEX start: 70805 length 162
- Stream: column 2 section ROW_INDEX start: 70967 length 171
- Stream: column 2 section BLOOM_FILTER start: 71138 length 6484
- Stream: column 3 section ROW_INDEX start: 77622 length 83
- Stream: column 1 section DATA start: 77705 length 20029
- Stream: column 2 section DATA start: 97734 length 40035
- Stream: column 3 section DATA start: 137769 length 3532
- Stream: column 3 section LENGTH start: 141301 length 25
- Stream: column 3 section DICTIONARY_DATA start: 141326 length 133
+ Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
+ Stream: column 0 section ROW_INDEX start: 70848 length 17
+ Stream: column 1 section ROW_INDEX start: 70865 length 164
+ Stream: column 2 section ROW_INDEX start: 71029 length 168
+ Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
+ Stream: column 3 section ROW_INDEX start: 77730 length 83
+ Stream: column 1 section DATA start: 77813 length 20035
+ Stream: column 2 section DATA start: 97848 length 40050
+ Stream: column 3 section DATA start: 137898 length 3532
+ Stream: column 3 section LENGTH start: 141430 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DICTIONARY_V2[35]
Row group indices for column 2:
Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 0,4098,488
- Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 10003,2294,464
- Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20006,490,440
- Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 20006,8686,416
+ Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
Bloom filters for column 2:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4988 loadFactor: 0.5196 expectedFpp: 0.010223193
@@ -95,27 +95,27 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
- Stripe: offset: 141544 data: 63766 rows: 5000 tail: 85 index: 6924
- Stream: column 0 section ROW_INDEX start: 141544 length 17
- Stream: column 1 section ROW_INDEX start: 141561 length 159
- Stream: column 2 section ROW_INDEX start: 141720 length 171
- Stream: column 2 section BLOOM_FILTER start: 141891 length 6487
- Stream: column 3 section ROW_INDEX start: 148378 length 90
- Stream: column 1 section DATA start: 148468 length 20029
- Stream: column 2 section DATA start: 168497 length 40035
- Stream: column 3 section DATA start: 208532 length 3544
- Stream: column 3 section LENGTH start: 212076 length 25
- Stream: column 3 section DICTIONARY_DATA start: 212101 length 133
+ Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
+ Stream: column 0 section ROW_INDEX start: 141673 length 17
+ Stream: column 1 section ROW_INDEX start: 141690 length 163
+ Stream: column 2 section ROW_INDEX start: 141853 length 168
+ Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
+ Stream: column 3 section ROW_INDEX start: 148554 length 90
+ Stream: column 1 section DATA start: 148644 length 20035
+ Stream: column 2 section DATA start: 168679 length 40050
+ Stream: column 3 section DATA start: 208729 length 3544
+ Stream: column 3 section LENGTH start: 212273 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DICTIONARY_V2[35]
Row group indices for column 2:
Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 0,4098,488
- Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 10003,2294,464
- Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20006,490,440
- Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 20006,8686,416
+ Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
Bloom filters for column 2:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4967 loadFactor: 0.5174 expectedFpp: 0.009925688
Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
@@ -123,27 +123,27 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
- Stripe: offset: 212319 data: 63796 rows: 5000 tail: 85 index: 6925
- Stream: column 0 section ROW_INDEX start: 212319 length 17
- Stream: column 1 section ROW_INDEX start: 212336 length 162
- Stream: column 2 section ROW_INDEX start: 212498 length 170
- Stream: column 2 section BLOOM_FILTER start: 212668 length 6485
- Stream: column 3 section ROW_INDEX start: 219153 length 91
- Stream: column 1 section DATA start: 219244 length 20029
- Stream: column 2 section DATA start: 239273 length 40035
- Stream: column 3 section DATA start: 279308 length 3574
- Stream: column 3 section LENGTH start: 282882 length 25
- Stream: column 3 section DICTIONARY_DATA start: 282907 length 133
+ Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
+ Stream: column 0 section ROW_INDEX start: 212516 length 17
+ Stream: column 1 section ROW_INDEX start: 212533 length 165
+ Stream: column 2 section ROW_INDEX start: 212698 length 167
+ Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
+ Stream: column 3 section ROW_INDEX start: 219389 length 91
+ Stream: column 1 section DATA start: 219480 length 20035
+ Stream: column 2 section DATA start: 239515 length 40050
+ Stream: column 3 section DATA start: 279565 length 3574
+ Stream: column 3 section LENGTH start: 283139 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DICTIONARY_V2[35]
Row group indices for column 2:
Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
- Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 0,4098,488
- Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 10003,2294,464
- Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20006,490,440
- Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 20006,8686,416
+ Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
Bloom filters for column 2:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4951 loadFactor: 0.5157 expectedFpp: 0.009704026
Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4969 loadFactor: 0.5176 expectedFpp: 0.009953696
@@ -151,17 +151,17 @@ Stripes:
Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
- Stripe: offset: 283125 data: 12940 rows: 1000 tail: 78 index: 1468
- Stream: column 0 section ROW_INDEX start: 283125 length 12
- Stream: column 1 section ROW_INDEX start: 283137 length 38
- Stream: column 2 section ROW_INDEX start: 283175 length 41
- Stream: column 2 section BLOOM_FILTER start: 283216 length 1337
- Stream: column 3 section ROW_INDEX start: 284553 length 40
- Stream: column 1 section DATA start: 284593 length 4007
- Stream: column 2 section DATA start: 288600 length 8007
- Stream: column 3 section DATA start: 296607 length 768
- Stream: column 3 section LENGTH start: 297375 length 25
- Stream: column 3 section DICTIONARY_DATA start: 297400 length 133
+ Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
+ Stream: column 0 section ROW_INDEX start: 283382 length 12
+ Stream: column 1 section ROW_INDEX start: 283394 length 38
+ Stream: column 2 section ROW_INDEX start: 283432 length 41
+ Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
+ Stream: column 3 section ROW_INDEX start: 284810 length 40
+ Stream: column 1 section DATA start: 284850 length 4007
+ Stream: column 2 section DATA start: 288857 length 8010
+ Stream: column 3 section DATA start: 296867 length 768
+ Stream: column 3 section LENGTH start: 297635 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
@@ -172,6 +172,6 @@ Stripes:
Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
-File length: 298155 bytes
+File length: 298416 bytes
Padding length: 0 bytes
Padding ratio: 0%