You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/07/29 21:18:58 UTC
[2/6] orc git commit: ORC-84. Create a separate java tool module.
http://git-wip-us.apache.org/repos/asf/orc/blob/b2f84ce4/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
new file mode 100644
index 0000000..10cc87d
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -0,0 +1,645 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeStatistics;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests for {@link FileDump}: each test writes an ORC file, runs the dump
+ * tool with stdout redirected, and checks what was printed.
+ */
+public class TestFileDump {
+
+  /**
+   * Vocabulary for the generated string column. The array is shared between
+   * tests, so a test that needs to modify entries must work on a copy.
+   */
+  private static final String[] WORDS = new String[]{"It", "was", "the", "best", "of", "times,",
+      "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+      "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+      "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+      "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+      "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+      "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+      "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+      "before", "us,", "we", "were", "all", "going", "direct", "to",
+      "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+      "way"};
+
+  Path workDir = new Path(System.getProperty("test.tmp.dir"));
+  Configuration conf;
+  FileSystem fs;
+  Path testFilePath;
+
+  /**
+   * Creates a fresh configuration and local file system rooted at
+   * {@link #workDir}, and removes any ORC file left by a previous test.
+   */
+  @Before
+  public void openFileSystem () throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    fs.setWorkingDirectory(workDir);
+    testFilePath = new Path("TestFileDump.testDump.orc");
+    fs.delete(testFilePath, false);
+  }
+
+  /**
+   * @return the schema struct&lt;i:int,l:bigint,s:string&gt;
+   */
+  static TypeDescription getMyRecordType() {
+    return TypeDescription.createStruct()
+        .addField("i", TypeDescription.createInt())
+        .addField("l", TypeDescription.createLong())
+        .addField("s", TypeDescription.createString());
+  }
+
+  /**
+   * Appends one row of the {@link #getMyRecordType()} schema to the batch.
+   *
+   * @param batch the batch to extend; its size is incremented by one
+   * @param i value for column "i"
+   * @param l value for column "l"
+   * @param str value for column "s", or null to write a null
+   */
+  static void appendMyRecord(VectorizedRowBatch batch,
+                             int i,
+                             long l,
+                             String str) {
+    ((LongColumnVector) batch.cols[0]).vector[batch.size] = i;
+    ((LongColumnVector) batch.cols[1]).vector[batch.size] = l;
+    if (str == null) {
+      batch.cols[2].noNulls = false;
+      batch.cols[2].isNull[batch.size] = true;
+    } else {
+      ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+          str.getBytes());
+    }
+    batch.size += 1;
+  }
+
+  /**
+   * @return a struct schema with one field per type exercised by
+   *         {@link #testDataDump()}, including map, list and nested struct
+   */
+  static TypeDescription getAllTypesType() {
+    return TypeDescription.createStruct()
+        .addField("b", TypeDescription.createBoolean())
+        .addField("bt", TypeDescription.createByte())
+        .addField("s", TypeDescription.createShort())
+        .addField("i", TypeDescription.createInt())
+        .addField("l", TypeDescription.createLong())
+        .addField("f", TypeDescription.createFloat())
+        .addField("d", TypeDescription.createDouble())
+        .addField("de", TypeDescription.createDecimal())
+        .addField("t", TypeDescription.createTimestamp())
+        .addField("dt", TypeDescription.createDate())
+        .addField("str", TypeDescription.createString())
+        .addField("c", TypeDescription.createChar().withMaxLength(5))
+        .addField("vc", TypeDescription.createVarchar().withMaxLength(10))
+        .addField("m", TypeDescription.createMap(
+            TypeDescription.createString(),
+            TypeDescription.createString()))
+        .addField("a", TypeDescription.createList(TypeDescription.createInt()))
+        .addField("st", TypeDescription.createStruct()
+            .addField("i", TypeDescription.createInt())
+            .addField("s", TypeDescription.createString()));
+  }
+
+  /**
+   * Appends one row of the {@link #getAllTypesType()} schema to the batch.
+   * The parameters are given in schema field order; {@code sti} and
+   * {@code sts} are the two fields of the nested struct "st". None of the
+   * object arguments may be null.
+   */
+  static void appendAllTypes(VectorizedRowBatch batch,
+                             boolean b,
+                             byte bt,
+                             short s,
+                             int i,
+                             long l,
+                             float f,
+                             double d,
+                             HiveDecimalWritable de,
+                             Timestamp t,
+                             DateWritable dt,
+                             String str,
+                             String c,
+                             String vc,
+                             Map<String, String> m,
+                             List<Integer> a,
+                             int sti,
+                             String sts) {
+    int row = batch.size++;
+    ((LongColumnVector) batch.cols[0]).vector[row] = b ? 1 : 0;
+    ((LongColumnVector) batch.cols[1]).vector[row] = bt;
+    ((LongColumnVector) batch.cols[2]).vector[row] = s;
+    ((LongColumnVector) batch.cols[3]).vector[row] = i;
+    ((LongColumnVector) batch.cols[4]).vector[row] = l;
+    ((DoubleColumnVector) batch.cols[5]).vector[row] = f;
+    ((DoubleColumnVector) batch.cols[6]).vector[row] = d;
+    ((DecimalColumnVector) batch.cols[7]).vector[row].set(de);
+    ((TimestampColumnVector) batch.cols[8]).set(row, t);
+    ((LongColumnVector) batch.cols[9]).vector[row] = dt.getDays();
+    ((BytesColumnVector) batch.cols[10]).setVal(row, str.getBytes());
+    ((BytesColumnVector) batch.cols[11]).setVal(row, c.getBytes());
+    ((BytesColumnVector) batch.cols[12]).setVal(row, vc.getBytes());
+    // map entries are appended to the shared key/value child vectors
+    MapColumnVector map = (MapColumnVector) batch.cols[13];
+    int offset = map.childCount;
+    map.offsets[row] = offset;
+    map.lengths[row] = m.size();
+    map.childCount += map.lengths[row];
+    for(Map.Entry<String, String> entry: m.entrySet()) {
+      ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
+      ((BytesColumnVector) map.values).setVal(offset++,
+          entry.getValue().getBytes());
+    }
+    // list elements are appended to the shared child vector
+    ListColumnVector list = (ListColumnVector) batch.cols[14];
+    offset = list.childCount;
+    list.offsets[row] = offset;
+    list.lengths[row] = a.size();
+    list.childCount += list.lengths[row];
+    for(int e=0; e < a.size(); ++e) {
+      ((LongColumnVector) list.child).vector[offset + e] = a.get(e);
+    }
+    StructColumnVector struct = (StructColumnVector) batch.cols[15];
+    ((LongColumnVector) struct.fields[0]).vector[row] = sti;
+    ((BytesColumnVector) struct.fields[1]).setVal(row, sts.getBytes());
+  }
+
+  /**
+   * Compares two text files line by line, ignoring leading and trailing
+   * whitespace on each line.
+   *
+   * @param expected name of the expected-output resource on the classpath
+   * @param actual path of the file holding the actual output
+   * @throws Exception if either file cannot be read
+   */
+  public static void checkOutput(String expected,
+                                 String actual) throws Exception {
+    BufferedReader eStream =
+        new BufferedReader(new FileReader
+            (TestJsonFileDump.getFileFromClasspath(expected)));
+    try {
+      BufferedReader aStream =
+          new BufferedReader(new FileReader(actual));
+      try {
+        int lineNumber = 0;
+        String expectedLine = eStream.readLine();
+        while (expectedLine != null) {
+          lineNumber += 1;
+          String actualLine = aStream.readLine();
+          // a short actual file must fail the assertion rather than throw
+          // a NullPointerException from trim()
+          Assert.assertNotNull("actual output ends before line " + lineNumber,
+              actualLine);
+          Assert.assertEquals("line " + lineNumber,
+              expectedLine.trim(), actualLine.trim());
+          expectedLine = eStream.readLine();
+        }
+        // the actual output must not have extra trailing lines
+        Assert.assertNull(aStream.readLine());
+      } finally {
+        aStream.close();
+      }
+    } finally {
+      eStream.close();
+    }
+  }
+
+  /**
+   * Runs {@link FileDump#main} with stdout replaced by {@code capture}.
+   * The original stdout is restored even when the dump fails, so a failure
+   * here cannot silence the output of later tests.
+   */
+  private static void runFileDump(PrintStream capture,
+                                  String... args) throws Exception {
+    PrintStream origOut = System.out;
+    System.setOut(capture);
+    try {
+      FileDump.main(args);
+      System.out.flush();
+    } finally {
+      System.setOut(origOut);
+    }
+  }
+
+  /**
+   * Runs the dump tool and stores what it prints in a file under
+   * {@link #workDir}.
+   *
+   * @param outputFilename name of the file to create in the work directory
+   * @param args command line for the dump tool
+   * @return the path of the file that was written
+   */
+  private String dumpToFile(String outputFilename,
+                            String... args) throws Exception {
+    String outputPath = workDir + File.separator + outputFilename;
+    PrintStream capture = new PrintStream(new FileOutputStream(outputPath));
+    try {
+      runFileDump(capture, args);
+    } finally {
+      // also closes the underlying file stream
+      capture.close();
+    }
+    return outputPath;
+  }
+
+  /**
+   * Writes 21000 pseudo-random {@link #getMyRecordType()} rows in batches of
+   * 1000. The generator is seeded with 1 and consulted in a fixed order
+   * (int, long, word index), so the written data is the same on every run.
+   */
+  private static void writeRandomRecords(Writer writer,
+                                         TypeDescription schema) throws Exception {
+    Random r1 = new Random(1);
+    VectorizedRowBatch batch = schema.createRowBatch(1000);
+    for(int i=0; i < 21000; ++i) {
+      appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+          WORDS[r1.nextInt(WORDS.length)]);
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size > 0) {
+      writer.addRowBatch(batch);
+    }
+  }
+
+  @Test
+  public void testDump() throws Exception {
+    TypeDescription schema = getMyRecordType();
+    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema)
+            .compress(CompressionKind.ZLIB)
+            .stripeSize(100000)
+            .rowIndexStride(1000));
+    writeRandomRecords(writer, schema);
+    writer.close();
+
+    String outputFilename = "orc-file-dump.out";
+    String actual = dumpToFile(outputFilename,
+        testFilePath.toString(), "--rowindex=1,2,3");
+    checkOutput(outputFilename, actual);
+  }
+
+  @Test
+  public void testDataDump() throws Exception {
+    TypeDescription schema = getAllTypesType();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema)
+            .stripeSize(100000)
+            .compress(CompressionKind.NONE)
+            .bufferSize(10000)
+            .rowIndexStride(1000));
+    VectorizedRowBatch batch = schema.createRowBatch(1000);
+    Map<String, String> m = new HashMap<String, String>(2);
+    m.put("k1", "v1");
+    appendAllTypes(batch,
+        true,
+        (byte) 10,
+        (short) 100,
+        1000,
+        10000L,
+        4.0f,
+        20.0,
+        new HiveDecimalWritable("4.2222"),
+        new Timestamp(1416967764000L),
+        new DateWritable(new Date(1416967764000L)),
+        "string",
+        "hello",
+        "hello",
+        m,
+        Arrays.asList(100, 200),
+        10, "foo");
+    m.clear();
+    m.put("k3", "v3");
+    appendAllTypes(
+        batch,
+        false,
+        (byte)20,
+        (short)200,
+        2000,
+        20000L,
+        8.0f,
+        40.0,
+        new HiveDecimalWritable("2.2222"),
+        new Timestamp(1416967364000L),
+        new DateWritable(new Date(1411967764000L)),
+        "abcd",
+        "world",
+        "world",
+        m,
+        Arrays.asList(200, 300),
+        20, "bar");
+    writer.addRowBatch(batch);
+
+    writer.close();
+
+    // capture the data dump in memory and compare it row by row
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    runFileDump(new PrintStream(myOut), testFilePath.toString(), "-d");
+    String[] lines = myOut.toString().split("\n");
+    Assert.assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24.0\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello\",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
+    Assert.assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44.0\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
+  }
+
+  // Test that if the fraction of rows that have distinct strings is greater than the configured
+  // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length
+  // of the dictionary stream for the column will be 0 in the ORC file dump.
+  @Test
+  public void testDictionaryThreshold() throws Exception {
+    TypeDescription schema = getMyRecordType();
+    // conf is the per-test Configuration created in openFileSystem()
+    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+    conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema)
+            .stripeSize(100000)
+            .compress(CompressionKind.ZLIB)
+            .rowIndexStride(1000)
+            .bufferSize(10000));
+    VectorizedRowBatch batch = schema.createRowBatch(1000);
+    Random r1 = new Random(1);
+    // the loop below rewrites entries, so work on a private copy
+    String[] words = WORDS.clone();
+    int nextInt = 0;
+    for(int i=0; i < 21000; ++i) {
+      // Write out the same string twice, this guarantees the fraction of rows with
+      // distinct strings is 0.5
+      if (i % 2 == 0) {
+        nextInt = r1.nextInt(words.length);
+        // Append the value of i to the word, this guarantees when an index or word is repeated
+        // the actual string is unique.
+        words[nextInt] += "-" + i;
+      }
+      appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+
+    String outputFilename = "orc-file-dump-dictionary-threshold.out";
+    String actual = dumpToFile(outputFilename,
+        testFilePath.toString(), "--rowindex=1,2,3");
+    checkOutput(outputFilename, actual);
+  }
+
+  @Test
+  public void testBloomFilter() throws Exception {
+    TypeDescription schema = getMyRecordType();
+    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+    OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+        .fileSystem(fs)
+        .setSchema(schema)
+        .stripeSize(100000)
+        .compress(CompressionKind.ZLIB)
+        .bufferSize(10000)
+        .rowIndexStride(1000)
+        .bloomFilterColumns("S");
+    Writer writer = OrcFile.createWriter(testFilePath, options);
+    writeRandomRecords(writer, schema);
+    writer.close();
+
+    String outputFilename = "orc-file-dump-bloomfilter.out";
+    String actual = dumpToFile(outputFilename,
+        testFilePath.toString(), "--rowindex=3");
+    checkOutput(outputFilename, actual);
+  }
+
+  @Test
+  public void testBloomFilter2() throws Exception {
+    TypeDescription schema = getMyRecordType();
+    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+    OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+        .fileSystem(fs)
+        .setSchema(schema)
+        .stripeSize(100000)
+        .compress(CompressionKind.ZLIB)
+        .bufferSize(10000)
+        .rowIndexStride(1000)
+        .bloomFilterColumns("l")
+        .bloomFilterFpp(0.01);
+    Writer writer = OrcFile.createWriter(testFilePath, options);
+    writeRandomRecords(writer, schema);
+    writer.close();
+
+    String outputFilename = "orc-file-dump-bloomfilter2.out";
+    String actual = dumpToFile(outputFilename,
+        testFilePath.toString(), "--rowindex=2");
+    checkOutput(outputFilename, actual);
+  }
+
+  /** Builds a BytesWritable holding the given values, each narrowed to a byte. */
+  private static BytesWritable bytes(int... items) {
+    BytesWritable result = new BytesWritable();
+    result.setSize(items.length);
+    for (int i = 0; i < items.length; ++i) {
+      result.getBytes()[i] = (byte) items[i];
+    }
+    return result;
+  }
+
+  /**
+   * Appends one (binary, string) row to the batch; a null argument is
+   * written as a null in the corresponding column.
+   */
+  private void appendRow(VectorizedRowBatch batch, BytesWritable bytes,
+                         String str) {
+    int row = batch.size++;
+    if (bytes == null) {
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[row] = true;
+    } else {
+      ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(),
+          0, bytes.getLength());
+    }
+    if (str == null) {
+      batch.cols[1].noNulls = false;
+      batch.cols[1].isNull[row] = true;
+    } else {
+      ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes());
+    }
+  }
+
+  /**
+   * Fills the batch with {@code count} rows of bytes(1, 2, 3) paired with
+   * {@code str} (which may be null), hands it to the writer and resets it.
+   */
+  private void writeRows(Writer writer, VectorizedRowBatch batch,
+                         int count, String str) throws Exception {
+    for (int i = 0; i < count; i++) {
+      appendRow(batch, bytes(1, 2, 3), str);
+    }
+    writer.addRowBatch(batch);
+    batch.reset();
+  }
+
+  @Test
+  public void testHasNull() throws Exception {
+    TypeDescription schema =
+        TypeDescription.createStruct()
+            .addField("bytes1", TypeDescription.createBinary())
+            .addField("string1", TypeDescription.createString());
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .rowIndexStride(1000)
+            .stripeSize(10000)
+            .bufferSize(10000));
+    VectorizedRowBatch batch = schema.createRowBatch(5000);
+    // STRIPE 1
+    writeRows(writer, batch, 1000, "RG1");      // RG1
+    writeRows(writer, batch, 1000, null);       // RG2
+    writeRows(writer, batch, 1000, "RG3");      // RG3
+    writeRows(writer, batch, 1000, null);       // RG4
+    writeRows(writer, batch, 1000, null);       // RG5
+    // STRIPE 2
+    writeRows(writer, batch, 5000, null);
+    // STRIPE 3
+    writeRows(writer, batch, 5000, "STRIPE-3");
+    // STRIPE 4
+    writeRows(writer, batch, 5000, null);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    // check the file level stats
+    ColumnStatistics[] stats = reader.getStatistics();
+    assertEquals(20000, stats[0].getNumberOfValues());
+    assertEquals(20000, stats[1].getNumberOfValues());
+    assertEquals(7000, stats[2].getNumberOfValues());
+    assertEquals(false, stats[0].hasNull());
+    assertEquals(false, stats[1].hasNull());
+    assertEquals(true, stats[2].hasNull());
+
+    // check the stripe level stats: only the string column ever holds
+    // nulls, and stripe 3 is the one stripe where it has none
+    List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+    boolean[] stringHasNull = {true, true, false, true};
+    for (int stripe = 0; stripe < stringHasNull.length; ++stripe) {
+      String label = "stripe " + (stripe + 1);
+      ColumnStatistics[] columns = stripeStats.get(stripe).getColumnStatistics();
+      assertEquals(label, false, columns[0].hasNull());
+      assertEquals(label, false, columns[1].hasNull());
+      assertEquals(label, stringHasNull[stripe], columns[2].hasNull());
+    }
+
+    // Test file dump
+    String outputFilename = "orc-file-has-null.out";
+    String actual = dumpToFile(outputFilename,
+        testFilePath.toString(), "--rowindex=2");
+    // If called with an expression evaluating to false, the test will halt
+    // and be ignored.
+    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
+    TestFileDump.checkOutput(outputFilename, actual);
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/b2f84ce4/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
new file mode 100644
index 0000000..eadc216
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.net.URL;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests the JSON output of {@link FileDump} by writing an ORC file, dumping
+ * it with stdout redirected to a file, and comparing that file with an
+ * expected-output resource on the classpath.
+ */
+public class TestJsonFileDump {
+
+  /**
+   * Resolves a classpath resource to a path usable with file APIs.
+   *
+   * @param name the resource name to look up with the system class loader
+   * @return the path of the resource
+   * @throws IllegalArgumentException if the resource cannot be found
+   */
+  public static String getFileFromClasspath(String name) {
+    URL url = ClassLoader.getSystemResource(name);
+    if (url == null) {
+      throw new IllegalArgumentException("Could not find " + name);
+    }
+    if ("file".equals(url.getProtocol())) {
+      try {
+        // URL.getPath() is still percent-encoded (a space shows up as %20);
+        // converting through a URI yields the real file system path
+        return new File(url.toURI()).getPath();
+      } catch (java.net.URISyntaxException e) {
+        // the URL is not a well-formed URI, so its path was never
+        // encoded; fall through and use it unchanged
+      }
+    }
+    return url.getPath();
+  }
+
+  Path workDir = new Path(System.getProperty("test.tmp.dir"));
+  Configuration conf;
+  FileSystem fs;
+  Path testFilePath;
+
+  /**
+   * Creates a fresh configuration and local file system rooted at
+   * {@link #workDir}, and removes any ORC file left by a previous test.
+   */
+  @Before
+  public void openFileSystem () throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    fs.setWorkingDirectory(workDir);
+    testFilePath = new Path("TestFileDump.testDump.orc");
+    fs.delete(testFilePath, false);
+  }
+
+  /**
+   * Compares two text files line by line; lines must match exactly.
+   *
+   * @param expected name of the expected-output resource on the classpath
+   * @param actual path of the file holding the actual output
+   * @throws Exception if either file cannot be read
+   */
+  static void checkOutput(String expected,
+                          String actual) throws Exception {
+    BufferedReader eStream =
+        new BufferedReader(new FileReader(getFileFromClasspath(expected)));
+    try {
+      BufferedReader aStream =
+          new BufferedReader(new FileReader(actual));
+      try {
+        int lineNumber = 0;
+        String expectedLine = eStream.readLine();
+        while (expectedLine != null) {
+          lineNumber += 1;
+          String actualLine = aStream.readLine();
+          assertEquals("line " + lineNumber, expectedLine, actualLine);
+          expectedLine = eStream.readLine();
+        }
+        // the actual output must not have extra trailing lines
+        assertNull(aStream.readLine());
+      } finally {
+        aStream.close();
+      }
+    } finally {
+      eStream.close();
+    }
+  }
+
+  @Test
+  public void testJsonDump() throws Exception {
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("i", TypeDescription.createInt())
+        .addField("l", TypeDescription.createLong())
+        .addField("s", TypeDescription.createString());
+    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+    OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+        .fileSystem(fs)
+        .setSchema(schema)
+        .stripeSize(100000)
+        .compress(CompressionKind.ZLIB)
+        .bufferSize(10000)
+        .rowIndexStride(1000)
+        .bloomFilterColumns("s");
+    Writer writer = OrcFile.createWriter(testFilePath, options);
+    Random r1 = new Random(1);
+    String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+        "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+        "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+        "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+        "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+        "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+        "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+        "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+        "before", "us,", "we", "were", "all", "going", "direct", "to",
+        "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+        "way"};
+    VectorizedRowBatch batch = schema.createRowBatch(1000);
+    for(int i=0; i < 21000; ++i) {
+      ((LongColumnVector) batch.cols[0]).vector[batch.size] = r1.nextInt();
+      ((LongColumnVector) batch.cols[1]).vector[batch.size] = r1.nextLong();
+      // every 100th row gets a null string
+      if (i % 100 == 0) {
+        batch.cols[2].noNulls = false;
+        batch.cols[2].isNull[batch.size] = true;
+      } else {
+        ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+            words[r1.nextInt(words.length)].getBytes());
+      }
+      batch.size += 1;
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size > 0) {
+      writer.addRowBatch(batch);
+    }
+
+    writer.close();
+    String outputFilename = "orc-file-dump.json";
+    String outputPath = workDir + File.separator + outputFilename;
+
+    // replace stdout and run command; stdout is restored and the capture
+    // file closed even when the dump fails
+    PrintStream origOut = System.out;
+    PrintStream capture = new PrintStream(new FileOutputStream(outputPath));
+    System.setOut(capture);
+    try {
+      FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"});
+      System.out.flush();
+    } finally {
+      System.setOut(origOut);
+      capture.close();
+    }
+
+    checkOutput(outputFilename, outputPath);
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/b2f84ce4/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
new file mode 100644
index 0000000..18fd2fb
--- /dev/null
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -0,0 +1,179 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
+ Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
+ Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
+ Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
+ Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
+ Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
+ Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
+ Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
+ Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
+
+Stripes:
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 3 section ROW_INDEX start: 355 length 87
+ Stream: column 3 section BLOOM_FILTER start: 442 length 512
+ Stream: column 1 section DATA start: 954 length 20035
+ Stream: column 2 section DATA start: 20989 length 40050
+ Stream: column 3 section DATA start: 61039 length 3543
+ Stream: column 3 section LENGTH start: 64582 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3862 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3884 positions: 0,659,149
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3893 positions: 0,1531,3
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3798 positions: 0,2281,32
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3843 positions: 0,3033,45
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
+ Stream: column 0 section ROW_INDEX start: 64826 length 17
+ Stream: column 1 section ROW_INDEX start: 64843 length 164
+ Stream: column 2 section ROW_INDEX start: 65007 length 168
+ Stream: column 3 section ROW_INDEX start: 65175 length 83
+ Stream: column 3 section BLOOM_FILTER start: 65258 length 512
+ Stream: column 1 section DATA start: 65770 length 20035
+ Stream: column 2 section DATA start: 85805 length 40050
+ Stream: column 3 section DATA start: 125855 length 3532
+ Stream: column 3 section LENGTH start: 129387 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3923 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3869 positions: 0,761,12
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,1472,70
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3931 positions: 0,2250,43
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3964 positions: 0,2978,88
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
+ Stream: column 0 section ROW_INDEX start: 129631 length 17
+ Stream: column 1 section ROW_INDEX start: 129648 length 163
+ Stream: column 2 section ROW_INDEX start: 129811 length 168
+ Stream: column 3 section ROW_INDEX start: 129979 length 90
+ Stream: column 3 section BLOOM_FILTER start: 130069 length 512
+ Stream: column 1 section DATA start: 130581 length 20035
+ Stream: column 2 section DATA start: 150616 length 40050
+ Stream: column 3 section DATA start: 190666 length 3544
+ Stream: column 3 section LENGTH start: 194210 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 4008 positions: 0,634,174
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3999 positions: 0,1469,69
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3817 positions: 0,2133,194
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 4000 positions: 0,3005,43
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
+ Stream: column 0 section ROW_INDEX start: 194454 length 17
+ Stream: column 1 section ROW_INDEX start: 194471 length 165
+ Stream: column 2 section ROW_INDEX start: 194636 length 167
+ Stream: column 3 section ROW_INDEX start: 194803 length 91
+ Stream: column 3 section BLOOM_FILTER start: 194894 length 512
+ Stream: column 1 section DATA start: 195406 length 20035
+ Stream: column 2 section DATA start: 215441 length 40050
+ Stream: column 3 section DATA start: 255491 length 3574
+ Stream: column 3 section LENGTH start: 259065 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3901 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: Darkness, max: worst sum: 3900 positions: 0,431,431
+ Entry 2: count: 1000 hasNull: false min: Darkness, max: worst sum: 3909 positions: 0,1485,52
+ Entry 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3947 positions: 0,2196,104
+ Entry 4: count: 1000 hasNull: false min: Darkness, max: worst sum: 3813 positions: 0,2934,131
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 1: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 2: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
+ Stream: column 0 section ROW_INDEX start: 259309 length 12
+ Stream: column 1 section ROW_INDEX start: 259321 length 38
+ Stream: column 2 section ROW_INDEX start: 259359 length 41
+ Stream: column 3 section ROW_INDEX start: 259400 length 40
+ Stream: column 3 section BLOOM_FILTER start: 259440 length 301
+ Stream: column 1 section DATA start: 259741 length 4007
+ Stream: column 2 section DATA start: 263748 length 8010
+ Stream: column 3 section DATA start: 271758 length 768
+ Stream: column 3 section LENGTH start: 272526 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 3:
+ Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
+ Bloom filters for column 3:
+ Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+ Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
+
+File length: 273307 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
http://git-wip-us.apache.org/repos/asf/orc/blob/b2f84ce4/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
new file mode 100644
index 0000000..fa5cc2d
--- /dev/null
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -0,0 +1,179 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with HIVE_13083
+Rows: 21000
+Compression: ZLIB
+Compression size: 4096
+Type: struct<i:int,l:bigint,s:string>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146021688 max: 2147223299 sum: 515792826
+ Column 2: count: 5000 hasNull: false min: -9218592812243954469 max: 9221614132680747961
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19280
+ Stripe 2:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146733128 max: 2147001622 sum: 7673427
+ Column 2: count: 5000 hasNull: false min: -9220818777591257749 max: 9222259462014003839
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19504
+ Stripe 3:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146993718 max: 2147378179 sum: 132660742551
+ Column 2: count: 5000 hasNull: false min: -9218342074710552826 max: 9222303228623055266
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19641
+ Stripe 4:
+ Column 0: count: 5000 hasNull: false
+ Column 1: count: 5000 hasNull: false min: -2146658006 max: 2145520931 sum: 8533549236
+ Column 2: count: 5000 hasNull: false min: -9222758097219661129 max: 9221043130193737406
+ Column 3: count: 5000 hasNull: false min: Darkness, max: worst sum: 19470
+ Stripe 5:
+ Column 0: count: 1000 hasNull: false
+ Column 1: count: 1000 hasNull: false min: -2146245500 max: 2146378640 sum: 51299706363
+ Column 2: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476
+ Column 3: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
+
+File Statistics:
+ Column 0: count: 21000 hasNull: false
+ Column 1: count: 21000 hasNull: false min: -2146993718 max: 2147378179 sum: 193017464403
+ Column 2: count: 21000 hasNull: false min: -9222758097219661129 max: 9222303228623055266
+ Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
+
+Stripes:
+ Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+ Stream: column 0 section ROW_INDEX start: 3 length 17
+ Stream: column 1 section ROW_INDEX start: 20 length 166
+ Stream: column 2 section ROW_INDEX start: 186 length 169
+ Stream: column 2 section BLOOM_FILTER start: 355 length 6535
+ Stream: column 3 section ROW_INDEX start: 6890 length 87
+ Stream: column 1 section DATA start: 6977 length 20035
+ Stream: column 2 section DATA start: 27012 length 40050
+ Stream: column 3 section DATA start: 67062 length 3543
+ Stream: column 3 section LENGTH start: 70605 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9200577545527640566 max: 9175500305011173751 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9203618157670445774 max: 9208123824411178101 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9218592812243954469 max: 9221351515892923972 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9206585617947511272 max: 9167703224425685487 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9206645795733282496 max: 9221614132680747961 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4931 loadFactor: 0.5136 expectedFpp: 0.009432924
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4956 loadFactor: 0.5163 expectedFpp: 0.009772834
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482
+ Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
+ Stream: column 0 section ROW_INDEX start: 70848 length 17
+ Stream: column 1 section ROW_INDEX start: 70865 length 164
+ Stream: column 2 section ROW_INDEX start: 71029 length 168
+ Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
+ Stream: column 3 section ROW_INDEX start: 77730 length 83
+ Stream: column 1 section DATA start: 77813 length 20035
+ Stream: column 2 section DATA start: 97848 length 40050
+ Stream: column 3 section DATA start: 137898 length 3532
+ Stream: column 3 section LENGTH start: 141430 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9218450653857701562 max: 9189819526332228512 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9220818777591257749 max: 9178821722829648113 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9220031433030423388 max: 9210838931786956852 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9208195729739635607 max: 9222259462014003839 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9174271499932339698 max: 9212277876771676916 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4988 loadFactor: 0.5196 expectedFpp: 0.010223193
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205
+ Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
+ Stream: column 0 section ROW_INDEX start: 141673 length 17
+ Stream: column 1 section ROW_INDEX start: 141690 length 163
+ Stream: column 2 section ROW_INDEX start: 141853 length 168
+ Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
+ Stream: column 3 section ROW_INDEX start: 148554 length 90
+ Stream: column 1 section DATA start: 148644 length 20035
+ Stream: column 2 section DATA start: 168679 length 40050
+ Stream: column 3 section DATA start: 208729 length 3544
+ Stream: column 3 section LENGTH start: 212273 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9211978436552246208 max: 9179058898902097152 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9195645160817780503 max: 9189147759444307708 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9202888157616520823 max: 9193561362676960747 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9216318198067839390 max: 9221286760675829363 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9218342074710552826 max: 9222303228623055266 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4967 loadFactor: 0.5174 expectedFpp: 0.009925688
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 5002 loadFactor: 0.521 expectedFpp: 0.01042575
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4964 loadFactor: 0.5171 expectedFpp: 0.009883798
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444
+ Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
+ Stream: column 0 section ROW_INDEX start: 212516 length 17
+ Stream: column 1 section ROW_INDEX start: 212533 length 165
+ Stream: column 2 section ROW_INDEX start: 212698 length 167
+ Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
+ Stream: column 3 section ROW_INDEX start: 219389 length 91
+ Stream: column 1 section DATA start: 219480 length 20035
+ Stream: column 2 section DATA start: 239515 length 40050
+ Stream: column 3 section DATA start: 279565 length 3574
+ Stream: column 3 section LENGTH start: 283139 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9222731174895935707 max: 9214167447015056056 positions: 0,0,0
+ Entry 1: count: 1000 hasNull: false min: -9222758097219661129 max: 9221043130193737406 positions: 4099,2,488
+ Entry 2: count: 1000 hasNull: false min: -9174483776261243438 max: 9208134757538374043 positions: 12297,6,464
+ Entry 3: count: 1000 hasNull: false min: -9174329712613510612 max: 9197412874152820822 positions: 20495,10,440
+ Entry 4: count: 1000 hasNull: false min: -9221162005892422758 max: 9220625004936875965 positions: 28693,14,416
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4951 loadFactor: 0.5157 expectedFpp: 0.009704026
+ Entry 1: numHashFunctions: 7 bitCount: 9600 popCount: 4969 loadFactor: 0.5176 expectedFpp: 0.009953696
+ Entry 2: numHashFunctions: 7 bitCount: 9600 popCount: 4994 loadFactor: 0.5202 expectedFpp: 0.010309587
+ Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649
+ Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165
+ Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
+ Stream: column 0 section ROW_INDEX start: 283382 length 12
+ Stream: column 1 section ROW_INDEX start: 283394 length 38
+ Stream: column 2 section ROW_INDEX start: 283432 length 41
+ Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
+ Stream: column 3 section ROW_INDEX start: 284810 length 40
+ Stream: column 1 section DATA start: 284850 length 4007
+ Stream: column 2 section DATA start: 288857 length 8010
+ Stream: column 3 section DATA start: 296867 length 768
+ Stream: column 3 section LENGTH start: 297635 length 25
+ Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DICTIONARY_V2[35]
+ Row group indices for column 2:
+ Entry 0: count: 1000 hasNull: false min: -9208193203370316142 max: 9218567213558056476 positions: 0,0,0
+ Bloom filters for column 2:
+ Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
+ Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
+
+File length: 298416 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+